public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
@ 2015-03-12 11:16 Jiangjiji
  2015-03-12 11:44 ` Kyrill Tkachov
  0 siblings, 1 reply; 8+ messages in thread
From: Jiangjiji @ 2015-03-12 11:16 UTC (permalink / raw)
  To: gcc-patches, Richard Earnshaw, Yangfei (Felix)

[-- Attachment #1: Type: text/plain, Size: 56998 bytes --]

Hi, 
  This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2014-12/msg00775.html
  Regtested with aarch64-linux-gnu on QEMU.
  This patch has no regressions for aarch64_be-linux-gnu big-endian target too. 
  OK for the trunk? Thanks.


Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 219845)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,38 @@
+2014-12-11  Felix Yang  <felix.yang@huawei.com>
+	    Jiji Jiang  <jiangjiji@huawei.com>
+
+	* config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
+	aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
+	aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
+	aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
+	aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
+	aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
+	aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
+	aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
+	aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+	* config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+	vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+	umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+	umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+	smull2_lane): New builtins.
+	* config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+	vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+	vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+	vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+	vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+	vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+	vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+	vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+	vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+	vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+	vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+	vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+	vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+	vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+	using builtin functions.
+	* config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+	VDQF_Q): New unspec and int iterator.
+
 2015-01-19  Jiong Wang  <jiong.wang@arm.com>
 	    Andrew Pinski  <apinski@cavium.com>
 
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h	(revision 219845)
+++ gcc/config/aarch64/arm_neon.h	(working copy)
@@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_s32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_s16 (int16x8_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_s32 (int32x4_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_high_p8 (poly8x16_t a, poly8x16_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_high_s8 (int8x16_t a, int8x16_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_s16 (int16x8_t a, int16x8_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_s32 (int32x4_t a, int32x4_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_high_u8 (uint8x16_t a, uint8x16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_lane_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_n_s16 (int16x4_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_n_s32 (int32x2_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_p8 (poly8x8_t a, poly8x8_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_s8 (int8x8_t a, int8x8_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_s16 (int16x4_t a, int16x4_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_s32 (int32x2_t a, int32x2_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_u8 (uint8x8_t a, uint8x8_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_u16 (uint16x4_t a, uint16x4_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_u32 (uint32x2_t a, uint32x2_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmulx %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulx_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fmulx %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmulx %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmulx %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulxq_lane_f32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulxq_lane_f64(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fmulx %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -18695,6 +18030,78 @@ vmul_n_f64  (float64x1_t __a, float64_t __b)
   return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv2sf (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a,
+                                                   (int32_t)__b);
+}
+
+/* vmulq_n  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
+{
+  return __builtin_aarch64_mul_nv2df (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a,
+                                                   (int32_t)__b);
+}
+
 /* vmulq_lane  */
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
@@ -18772,6 +18179,308 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c
   return __a * __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vmull_high_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a,
+                                                         (int16x4_t) __b,
+                                                         __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a,
+                                                         (int32x2_t) __b,
+                                                          __c);
+}
+
+/* vmull_high_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a,
+                                                          (int16x8_t)__b,
+                                                          __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a,
+                                                          (int32x4_t) __b,
+                                                           __c);
+}
+
+/* vmull_high_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_smull2_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_smull2_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull2_nv4si_uuu (__a, __b);
+}
+
+/* vmull_high  */
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_high_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+    return  __builtin_aarch64_pmull2v16qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_high_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_high_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b);
+}
+
+/* vmull_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+   return __builtin_aarch64_smull_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+   return __builtin_aarch64_smull_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+   return __builtin_aarch64_umull_nv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull_nv2si_uuu (__a, __b);
+}
+
+/* vmull  */
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return __builtin_aarch64_pmullv8qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return __builtin_aarch64_smullv8qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return __builtin_aarch64_smullv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return __builtin_aarch64_smullv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return __builtin_aarch64_umullv8qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return __builtin_aarch64_umullv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return __builtin_aarch64_umullv2si_uuu (__a, __b);
+}
+
+/* vmulx  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_lane_f32 (float32x2_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2sf (__a, __b, __c);
+}
+
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_fmulxdf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_fmulxv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2df (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_lane_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_lane_f64 (float64x2_t __a, float64x2_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2df (__a, __b, __c);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_fmulxsf (__a, __b);
+}
+
 /* vneg  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	(revision 219845)
+++ gcc/config/aarch64/iterators.md	(working copy)
@@ -276,6 +276,8 @@
     UNSPEC_SHA256SU1    ; Used in aarch64-simd.md.
     UNSPEC_PMULL        ; Used in aarch64-simd.md.
     UNSPEC_PMULL2       ; Used in aarch64-simd.md.
+    UNSPEC_FMULX        ; Used in aarch64-simd.md.
+    UNSPEC_FMULX_LANE   ; Used in aarch64-simd.md.
 ])
 
 ;; -------------------------------------------------------------------
@@ -466,6 +468,9 @@
 
 )
 
+(define_mode_attr VDQF_Q [(V2SF "V4SF") (V4SF "V4SF")
+                          (V2DF "V2DF")])
+
 ;; Widened mode register suffixes for VD_BHSI/VQW.
 (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
 			  (V2SI "2d") (V16QI "8h") 
Index: gcc/config/aarch64/aarch64-simd.md
===================================================================
--- gcc/config/aarch64/aarch64-simd.md	(revision 219845)
+++ gcc/config/aarch64/aarch64-simd.md	(working copy)
@@ -1396,6 +1396,253 @@
  }
 )
 
+(define_insn "aarch64_mul_n<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+        (mult:VMUL
+          (match_operand:VMUL 1 "register_operand" "w")
+          (vec_duplicate:VMUL
+            (match_operand:<VEL> 2 "register_operand" "<h_con>"))))]
+  "TARGET_SIMD"
+  "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_<su>mull_n<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (match_operand:<VEL> 2 "register_operand" "<vwx>")))))]
+  "TARGET_SIMD"
+  "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+
+(define_insn "aarch64_<su>mull<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_simd_<su>mull2_n<mode>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (mult:<VWIDE> (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+                            (match_operand:VQ_HSI 1 "register_operand" "w")
+                            (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+                     (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF>
+                            (match_operand:<VEL> 2 "register_operand" "<vw>")))))]
+  "TARGET_SIMD"
+  "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull2_n<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand" ""))
+   (match_operand:<VEL> 2 "register_operand" "")]
+ "TARGET_SIMD"
+ {
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+   emit_insn (gen_aarch64_simd_<su>mull2_n<mode> (operands[0],
+                                                  operands[1],
+                                                  operands[2], p));
+   DONE;
+
+ }
+)
+
+(define_insn "aarch64_<su>mull_lane<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull_laneq<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_lane<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_laneq<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_smull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_smull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:VDQF 2 "register_operand" "w")]
+                      UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vtype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+        (unspec:GPF  [(match_operand:GPF 1 "register_operand" "w")
+                      (match_operand:GPF 2 "register_operand" "w")]
+                     UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%<s>0, %<s>1, %<s>2"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx_lane<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:<VDQF_Q> 2 "register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_FMULX_LANE))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_pmull2v16qi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+       (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "w")
+                     (match_operand:V16QI 2 "register_operand" "w")]
+                    UNSPEC_PMULL2))]
+  "TARGET_SIMD"
+  "pmull2\\t%0.8h, %1.16b, %2.16b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
+(define_insn "aarch64_pmullv8qi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+        (unspec:V8HI  [(match_operand:V8QI 1 "register_operand" "w")
+                       (match_operand:V8QI 2 "register_operand" "w")]
+                      UNSPEC_PMULL))]
+ "TARGET_SIMD"
+ "pmull\\t%0.8h, %1.8b, %2.8b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
 ;; FP vector operations.
 ;; AArch64 AdvSIMD supports single-precision (32-bit) and 
 ;; double-precision (64-bit) floating-point data types and arithmetic as
Index: gcc/config/aarch64/aarch64-simd-builtins.def
===================================================================
--- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 219845)
+++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
@@ -187,6 +187,39 @@
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
 
+  /* Implemented by vec_widen_<su>mult_hi_<mode>.  */
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
+  /* Implemented by aarch64_<su>mull<mode>.  */
+  BUILTIN_VD_BHSI (BINOPU, umull, 0)
+  BUILTIN_VD_BHSI (BINOP, smull, 0)
+  /* Implemented by aarch64_<su>mull_n<mode>.  */
+  BUILTIN_VD_HSI (BINOP, smull_n, 0)
+  BUILTIN_VD_HSI (BINOPU, umull_n, 0)
+  /* Implemented by aarch64_mul_n<mode>.  */
+  BUILTIN_VMUL (BINOP, mul_n, 0)
+  /* Implemented by aarch64_<su>mull2_n<mode>.  */
+  BUILTIN_VQ_HSI (BINOP, smull2_n, 0)
+  BUILTIN_VQ_HSI (BINOPU, umull2_n, 0)
+  /* Implemented by aarch64_<su>mull_lane<q><mode>.  */
+  BUILTIN_VD_HSI (TERNOP, smull_lane, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_lane, 0)
+  BUILTIN_VD_HSI (TERNOP, smull_laneq, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_laneq, 0)
+  /* Implemented by aarch64_<su>mull2_lane<q><mode>.  */
+  BUILTIN_VQ_HSI (TERNOP, smull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP, smull2_laneq, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_laneq, 0)
+  /* Implemented by aarch64_fmulx<mode>.  */
+  BUILTIN_VDQF (BINOP, fmulx, 0)
+  BUILTIN_GPF (BINOP, fmulx, 0)
+  BUILTIN_VDQF (BINOP, fmulx_lane, 0)
+
+  /* Implemented by aarch64_pmull<2><mode>.*/
+  VAR1 (BINOPP, pmull, 0, v8qi)
+  VAR1 (BINOPP, pmull2, 0, v16qi)
+
   BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
   /* Implemented by aarch64_<sur>shl<mode>.  */
   BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)

[-- Attachment #2: aarch64_vmull_X_v4.diff --]
[-- Type: application/octet-stream, Size: 55320 bytes --]

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 219845)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,38 @@
+2014-12-11  Felix Yang  <felix.yang@huawei.com>
+	    Jiji Jiang  <jiangjiji@huawei.com>
+
+	* config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
+	aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
+	aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
+	aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
+	aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
+	aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
+	aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
+	aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
+	aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+	* config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+	vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+	umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+	umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+	smull2_lane): New builtins.
+	* config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+	vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+	vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+	vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+	vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+	vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+	vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+	vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+	vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+	vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+	vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+	vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+	vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+	vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+	using builtin functions.
+	* config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+	VDQF_Q): New unspec and int iterator.
+
 2015-01-19  Jiong Wang  <jiong.wang@arm.com>
 	    Andrew Pinski  <apinski@cavium.com>
 
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h	(revision 219845)
+++ gcc/config/aarch64/arm_neon.h	(working copy)
@@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_s32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_s16 (int16x8_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_s32 (int32x4_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_high_p8 (poly8x16_t a, poly8x16_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_high_s8 (int8x16_t a, int8x16_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_s16 (int16x8_t a, int16x8_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_s32 (int32x4_t a, int32x4_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_high_u8 (uint8x16_t a, uint8x16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_lane_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_n_s16 (int16x4_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_n_s32 (int32x2_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_p8 (poly8x8_t a, poly8x8_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_s8 (int8x8_t a, int8x8_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_s16 (int16x4_t a, int16x4_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_s32 (int32x2_t a, int32x2_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_u8 (uint8x8_t a, uint8x8_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_u16 (uint16x4_t a, uint16x4_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_u32 (uint32x2_t a, uint32x2_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmulx %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulx_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fmulx %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmulx %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmulx %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulxq_lane_f32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulxq_lane_f64(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fmulx %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -18695,6 +18030,78 @@ vmul_n_f64  (float64x1_t __a, float64_t __b)
   return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv2sf (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a,
+                                                   (int32_t)__b);
+}
+
+/* vmulq_n  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
+{
+  return __builtin_aarch64_mul_nv2df (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a,
+                                                   (int32_t)__b);
+}
+
 /* vmulq_lane  */
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
@@ -18772,6 +18179,308 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c
   return __a * __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vmull_high_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a,
+                                                         (int16x4_t) __b,
+                                                         __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a,
+                                                         (int32x2_t) __b,
+                                                          __c);
+}
+
+/* vmull_high_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a,
+                                                          (int16x8_t)__b,
+                                                          __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a,
+                                                          (int32x4_t) __b,
+                                                           __c);
+}
+
+/* vmull_high_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_smull2_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_smull2_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull2_nv4si_uuu (__a, __b);
+}
+
+/* vmull_high  */
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_high_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+    return  __builtin_aarch64_pmull2v16qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_high_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_high_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b);
+}
+
+/* vmull_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+   return __builtin_aarch64_smull_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+   return __builtin_aarch64_smull_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+   return __builtin_aarch64_umull_nv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull_nv2si_uuu (__a, __b);
+}
+
+/* vmull  */
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return __builtin_aarch64_pmullv8qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return __builtin_aarch64_smullv8qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return __builtin_aarch64_smullv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return __builtin_aarch64_smullv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return __builtin_aarch64_umullv8qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return __builtin_aarch64_umullv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return __builtin_aarch64_umullv2si_uuu (__a, __b);
+}
+
+/* vmulx  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_lane_f32 (float32x2_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2sf (__a, __b, __c);
+}
+
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_fmulxdf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_fmulxv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2df (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_lane_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_lane_f64 (float64x2_t __a, float64x2_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2df (__a, __b, __c);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_fmulxsf (__a, __b);
+}
+
 /* vneg  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	(revision 219845)
+++ gcc/config/aarch64/iterators.md	(working copy)
@@ -276,6 +276,8 @@
     UNSPEC_SHA256SU1    ; Used in aarch64-simd.md.
     UNSPEC_PMULL        ; Used in aarch64-simd.md.
     UNSPEC_PMULL2       ; Used in aarch64-simd.md.
+    UNSPEC_FMULX        ; Used in aarch64-simd.md.
+    UNSPEC_FMULX_LANE   ; Used in aarch64-simd.md.
 ])
 
 ;; -------------------------------------------------------------------
@@ -466,6 +468,9 @@
 
 )
 
+(define_mode_attr VDQF_Q [(V2SF "V4SF") (V4SF "V4SF")
+                          (V2DF "V2DF")])
+
 ;; Widened mode register suffixes for VD_BHSI/VQW.
 (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
 			  (V2SI "2d") (V16QI "8h") 
Index: gcc/config/aarch64/aarch64-simd.md
===================================================================
--- gcc/config/aarch64/aarch64-simd.md	(revision 219845)
+++ gcc/config/aarch64/aarch64-simd.md	(working copy)
@@ -1396,6 +1396,253 @@
  }
 )
 
+(define_insn "aarch64_mul_n<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+        (mult:VMUL
+          (match_operand:VMUL 1 "register_operand" "w")
+          (vec_duplicate:VMUL
+            (match_operand:<VEL> 2 "register_operand" "<h_con>"))))]
+  "TARGET_SIMD"
+  "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_<su>mull_n<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (match_operand:<VEL> 2 "register_operand" "<vwx>")))))]
+  "TARGET_SIMD"
+  "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+
+(define_insn "aarch64_<su>mull<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_simd_<su>mull2_n<mode>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (mult:<VWIDE> (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+                            (match_operand:VQ_HSI 1 "register_operand" "w")
+                            (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+                     (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF>
+                            (match_operand:<VEL> 2 "register_operand" "<vw>")))))]
+  "TARGET_SIMD"
+  "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull2_n<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand" ""))
+   (match_operand:<VEL> 2 "register_operand" "")]
+ "TARGET_SIMD"
+ {
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+   emit_insn (gen_aarch64_simd_<su>mull2_n<mode> (operands[0],
+                                                  operands[1],
+                                                  operands[2], p));
+   DONE;
+
+ }
+)
+
+(define_insn "aarch64_<su>mull_lane<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull_laneq<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_lane<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_laneq<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_smull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_smull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:VDQF 2 "register_operand" "w")]
+                      UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vtype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+        (unspec:GPF  [(match_operand:GPF 1 "register_operand" "w")
+                      (match_operand:GPF 2 "register_operand" "w")]
+                     UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%<s>0, %<s>1, %<s>2"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx_lane<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:<VDQF_Q> 2 "register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_FMULX_LANE))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_pmull2v16qi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+       (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "w")
+                     (match_operand:V16QI 2 "register_operand" "w")]
+                    UNSPEC_PMULL2))]
+  "TARGET_SIMD"
+  "pmull2\\t%0.8h, %1.16b, %2.16b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
+(define_insn "aarch64_pmullv8qi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+        (unspec:V8HI  [(match_operand:V8QI 1 "register_operand" "w")
+                       (match_operand:V8QI 2 "register_operand" "w")]
+                      UNSPEC_PMULL))]
+ "TARGET_SIMD"
+ "pmull\\t%0.8h, %1.8b, %2.8b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
 ;; FP vector operations.
 ;; AArch64 AdvSIMD supports single-precision (32-bit) and 
 ;; double-precision (64-bit) floating-point data types and arithmetic as
Index: gcc/config/aarch64/aarch64-simd-builtins.def
===================================================================
--- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 219845)
+++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
@@ -187,6 +187,39 @@
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
 
+  /* Implemented by vec_widen_<su>mult_hi_<mode>.  */
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
+  /* Implemented by aarch64_<su>mull<mode>.  */
+  BUILTIN_VD_BHSI (BINOPU, umull, 0)
+  BUILTIN_VD_BHSI (BINOP, smull, 0)
+  /* Implemented by aarch64_<su>mull_n<mode>.  */
+  BUILTIN_VD_HSI (BINOP, smull_n, 0)
+  BUILTIN_VD_HSI (BINOPU, umull_n, 0)
+  /* Implemented by aarch64_mul_n<mode>.  */
+  BUILTIN_VMUL (BINOP, mul_n, 0)
+  /* Implemented by aarch64_<su>mull2_n<mode>.  */
+  BUILTIN_VQ_HSI (BINOP, smull2_n, 0)
+  BUILTIN_VQ_HSI (BINOPU, umull2_n, 0)
+  /* Implemented by aarch64_<su>mull_lane<q><mode>.  */
+  BUILTIN_VD_HSI (TERNOP, smull_lane, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_lane, 0)
+  BUILTIN_VD_HSI (TERNOP, smull_laneq, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_laneq, 0)
+  /* Implemented by aarch64_<su>mull2_lane<q><mode>.  */
+  BUILTIN_VQ_HSI (TERNOP, smull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP, smull2_laneq, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_laneq, 0)
+  /* Implemented by aarch64_fmulx<mode>.  */
+  BUILTIN_VDQF (BINOP, fmulx, 0)
+  BUILTIN_GPF (BINOP, fmulx, 0)
+  BUILTIN_VDQF (BINOP, fmulx_lane, 0)
+
+  /* Implemented by aarch64_pmull<2><mode>.*/
+  VAR1 (BINOPP, pmull, 0, v8qi)
+  VAR1 (BINOPP, pmull2, 0, v16qi)
+
   BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
   /* Implemented by aarch64_<sur>shl<mode>.  */
   BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
  2015-03-12 11:16 [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics Jiangjiji
@ 2015-03-12 11:44 ` Kyrill Tkachov
  2015-03-14 10:09   ` Jiangjiji
                     ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Kyrill Tkachov @ 2015-03-12 11:44 UTC (permalink / raw)
  To: Jiangjiji, gcc-patches, Richard Earnshaw, Yangfei (Felix)

Hi Jiangjiji,

This is definitely stage 1 material by now...
At my glance it all looks like the right approach, I have a question below:

On 12/03/15 09:20, Jiangjiji wrote:
> +
> +(define_insn "aarch64_fmulx_lane<mode>"
> +  [(set (match_operand:VDQF 0 "register_operand" "=w")
> +        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
> +                       (match_operand:<VDQF_Q> 2 "register_operand" "w")
> +                       (match_operand:SI 3 "immediate_operand" "i")]
> +                      UNSPEC_FMULX_LANE))]
> + "TARGET_SIMD"
> + "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>"
> +  [(set_attr "type" "neon_mul_s")]
> +)
Where did operand 3 go? Shouldn't his be the lane-element variant of fmulx?

Thanks,
Kyrill

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
  2015-03-12 11:44 ` Kyrill Tkachov
@ 2015-03-14 10:09   ` Jiangjiji
  2015-04-11 10:38   ` [PING^3] " Jiangjiji
  2015-05-04  8:52   ` [PING^4] " Jiangjiji
  2 siblings, 0 replies; 8+ messages in thread
From: Jiangjiji @ 2015-03-14 10:09 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc-patches, Richard Earnshaw, Yangfei (Felix)

[-- Attachment #1: Type: text/plain, Size: 56736 bytes --]

Hi, Kyrill
  Thank you for your suggestion. 
  I fixed it and regtested with aarch64-linux-gnu on QEMU.
  This patch has no regressions for aarch64_be-linux-gnu big-endian target too. 
  OK for the trunk? 

Thanks.
Jiang jiji



Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 221393)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,38 @@
+2015-03-14  Felix Yang  <felix.yang@huawei.com>
+	    Jiji Jiang  <jiangjiji@huawei.com>
+
+	* config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
+	aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
+	aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
+	aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
+	aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
+	aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
+	aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
+	aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
+	aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+	* config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+	vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+	umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+	umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+	smull2_lane): New builtins.
+	* config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+	vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+	vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+	vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+	vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+	vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+	vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+	vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+	vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+	vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+	vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+	vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+	vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+	vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+	using builtin functions.
+	* config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+	VDQF_Q): New unspec and int iterator.
+
 2015-03-12  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
 
 	PR rtl-optimization/65235
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h	(revision 221393)
+++ gcc/config/aarch64/arm_neon.h	(working copy)
@@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_s32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_s16 (int16x8_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_s32 (int32x4_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_high_p8 (poly8x16_t a, poly8x16_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_high_s8 (int8x16_t a, int8x16_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_s16 (int16x8_t a, int16x8_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_s32 (int32x4_t a, int32x4_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_high_u8 (uint8x16_t a, uint8x16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_lane_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_n_s16 (int16x4_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_n_s32 (int32x2_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_p8 (poly8x8_t a, poly8x8_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_s8 (int8x8_t a, int8x8_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_s16 (int16x4_t a, int16x4_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_s32 (int32x2_t a, int32x2_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_u8 (uint8x8_t a, uint8x8_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_u16 (uint16x4_t a, uint16x4_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_u32 (uint32x2_t a, uint32x2_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmulx %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulx_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fmulx %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmulx %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmulx %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulxq_lane_f32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulxq_lane_f64(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fmulx %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -18695,6 +18030,380 @@ vmul_n_f64  (float64x1_t __a, float64_t __b)
   return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv2sf (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a,
+                                                   (int32_t)__b);
+}
+
+/* vmulq_n  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
+{
+  return __builtin_aarch64_mul_nv2df (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a,
+                                                   (int32_t)__b);
+}
+
+/* vmull_high_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a,
+                                                         (int16x4_t) __b,
+                                                         __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a,
+                                                         (int32x2_t) __b,
+                                                          __c);
+}
+
+/* vmull_high_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a,
+                                                          (int16x8_t)__b,
+                                                          __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a,
+                                                          (int32x4_t) __b,
+                                                           __c);
+}
+
+/* vmull_high_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_smull2_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_smull2_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull2_nv4si_uuu (__a, __b);
+}
+
+/* vmull_high  */
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_high_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+    return  __builtin_aarch64_pmull2v16qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_high_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_high_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b);
+}
+
+/* vmull_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+   return __builtin_aarch64_smull_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+   return __builtin_aarch64_smull_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+   return __builtin_aarch64_umull_nv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull_nv2si_uuu (__a, __b);
+}
+
+/* vmull  */
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return __builtin_aarch64_pmullv8qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return __builtin_aarch64_smullv8qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return __builtin_aarch64_smullv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return __builtin_aarch64_smullv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return __builtin_aarch64_umullv8qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return __builtin_aarch64_umullv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return __builtin_aarch64_umullv2si_uuu (__a, __b);
+}
+
+/* vmulx  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_lane_f32 (float32x2_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2sf (__a, __b, __c);
+}
+
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_fmulxdf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_fmulxv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2df (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_lane_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_lane_f64 (float64x2_t __a, float64x2_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2df (__a, __b, __c);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_fmulxsf (__a, __b);
+}
+
 /* vmulq_lane  */
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	(revision 221393)
+++ gcc/config/aarch64/iterators.md	(working copy)
@@ -277,6 +277,8 @@
     UNSPEC_SHA256SU1    ; Used in aarch64-simd.md.
     UNSPEC_PMULL        ; Used in aarch64-simd.md.
     UNSPEC_PMULL2       ; Used in aarch64-simd.md.
+    UNSPEC_FMULX        ; Used in aarch64-simd.md.
+    UNSPEC_FMULX_LANE   ; Used in aarch64-simd.md.
     UNSPEC_REV_REGLIST  ; Used in aarch64-simd.md.
 ])
 
@@ -468,6 +470,9 @@
 
 )
 
+(define_mode_attr VDQF_Q [(V2SF "V4SF") (V4SF "V4SF")
+                          (V2DF "V2DF")])
+
 ;; Widened mode register suffixes for VD_BHSI/VQW.
 (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
 			  (V2SI "2d") (V16QI "8h") 
Index: gcc/config/aarch64/aarch64-simd.md
===================================================================
--- gcc/config/aarch64/aarch64-simd.md	(revision 221393)
+++ gcc/config/aarch64/aarch64-simd.md	(working copy)
@@ -1400,6 +1400,252 @@
  }
 )
 
+(define_insn "aarch64_mul_n<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+        (mult:VMUL
+          (match_operand:VMUL 1 "register_operand" "w")
+          (vec_duplicate:VMUL
+            (match_operand:<VEL> 2 "register_operand" "<h_con>"))))]
+  "TARGET_SIMD"
+  "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_<su>mull_n<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (match_operand:<VEL> 2 "register_operand" "<vwx>")))))]
+  "TARGET_SIMD"
+  "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_simd_<su>mull2_n<mode>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (mult:<VWIDE> (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+                            (match_operand:VQ_HSI 1 "register_operand" "w")
+                            (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+                     (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF>
+                            (match_operand:<VEL> 2 "register_operand" "<vw>")))))]
+  "TARGET_SIMD"
+  "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull2_n<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand" ""))
+   (match_operand:<VEL> 2 "register_operand" "")]
+ "TARGET_SIMD"
+ {
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+   emit_insn (gen_aarch64_simd_<su>mull2_n<mode> (operands[0],
+                                                  operands[1],
+                                                  operands[2], p));
+   DONE;
+
+ }
+)
+
+(define_insn "aarch64_<su>mull_lane<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull_laneq<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_lane<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_laneq<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_smull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_smull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:VDQF 2 "register_operand" "w")]
+                      UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vtype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+        (unspec:GPF  [(match_operand:GPF 1 "register_operand" "w")
+                      (match_operand:GPF 2 "register_operand" "w")]
+                     UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%<s>0, %<s>1, %<s>2"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx_lane<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:<VDQF_Q> 2 "register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_FMULX_LANE))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>[%3]"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_pmull2v16qi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+       (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "w")
+                     (match_operand:V16QI 2 "register_operand" "w")]
+                    UNSPEC_PMULL2))]
+  "TARGET_SIMD"
+  "pmull2\\t%0.8h, %1.16b, %2.16b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
+(define_insn "aarch64_pmullv8qi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+        (unspec:V8HI  [(match_operand:V8QI 1 "register_operand" "w")
+                       (match_operand:V8QI 2 "register_operand" "w")]
+                      UNSPEC_PMULL))]
+ "TARGET_SIMD"
+ "pmull\\t%0.8h, %1.8b, %2.8b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
 ;; FP vector operations.
 ;; AArch64 AdvSIMD supports single-precision (32-bit) and 
 ;; double-precision (64-bit) floating-point data types and arithmetic as
Index: gcc/config/aarch64/aarch64-simd-builtins.def
===================================================================
--- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 221393)
+++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
@@ -187,6 +187,39 @@
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
 
+  /* Implemented by vec_widen_<su>mult_hi_<mode>.  */
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
+  /* Implemented by aarch64_<su>mull<mode>.  */
+  BUILTIN_VD_BHSI (BINOPU, umull, 0)
+  BUILTIN_VD_BHSI (BINOP, smull, 0)
+  /* Implemented by aarch64_<su>mull_n<mode>.  */
+  BUILTIN_VD_HSI (BINOP, smull_n, 0)
+  BUILTIN_VD_HSI (BINOPU, umull_n, 0)
+  /* Implemented by aarch64_mul_n<mode>.  */
+  BUILTIN_VMUL (BINOP, mul_n, 0)
+  /* Implemented by aarch64_<su>mull2_n<mode>.  */
+  BUILTIN_VQ_HSI (BINOP, smull2_n, 0)
+  BUILTIN_VQ_HSI (BINOPU, umull2_n, 0)
+  /* Implemented by aarch64_<su>mull_lane<q><mode>.  */
+  BUILTIN_VD_HSI (TERNOP, smull_lane, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_lane, 0)
+  BUILTIN_VD_HSI (TERNOP, smull_laneq, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_laneq, 0)
+  /* Implemented by aarch64_<su>mull2_lane<q><mode>.  */
+  BUILTIN_VQ_HSI (TERNOP, smull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP, smull2_laneq, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_laneq, 0)
+  /* Implemented by aarch64_fmulx<mode>.  */
+  BUILTIN_VDQF (BINOP, fmulx, 0)
+  BUILTIN_GPF (BINOP, fmulx, 0)
+  BUILTIN_VDQF (BINOP, fmulx_lane, 0)
+
+  /* Implemented by aarch64_pmull<2><mode>.*/
+  VAR1 (BINOPP, pmull, 0, v8qi)
+  VAR1 (BINOPP, pmull2, 0, v16qi)
+
   BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
   /* Implemented by aarch64_<sur>shl<mode>.  */
   BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)




[-- Attachment #2: aarch64_vmull_X_v5.diff --]
[-- Type: application/octet-stream, Size: 55066 bytes --]

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 221393)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,38 @@
+2015-03-14  Felix Yang  <felix.yang@huawei.com>
+	    Jiji Jiang  <jiangjiji@huawei.com>
+
+	* config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
+	aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
+	aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
+	aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
+	aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
+	aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
+	aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
+	aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
+	aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+	* config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+	vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+	umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+	umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+	smull2_lane): New builtins.
+	* config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+	vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+	vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+	vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+	vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+	vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+	vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+	vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+	vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+	vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+	vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+	vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+	vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+	vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+	using builtin functions.
+	* config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+	VDQF_Q): New unspec and int iterator.
+
 2015-03-12  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
 
 	PR rtl-optimization/65235
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h	(revision 221393)
+++ gcc/config/aarch64/arm_neon.h	(working copy)
@@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_s32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_s16 (int16x8_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_s32 (int32x4_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_high_p8 (poly8x16_t a, poly8x16_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_high_s8 (int8x16_t a, int8x16_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_s16 (int16x8_t a, int16x8_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_s32 (int32x4_t a, int32x4_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_high_u8 (uint8x16_t a, uint8x16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_lane_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_n_s16 (int16x4_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_n_s32 (int32x2_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_p8 (poly8x8_t a, poly8x8_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_s8 (int8x8_t a, int8x8_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_s16 (int16x4_t a, int16x4_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_s32 (int32x2_t a, int32x2_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_u8 (uint8x8_t a, uint8x8_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_u16 (uint16x4_t a, uint16x4_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_u32 (uint32x2_t a, uint32x2_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmulx %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulx_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fmulx %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmulx %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmulx %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulxq_lane_f32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulxq_lane_f64(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fmulx %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -18695,6 +18030,380 @@ vmul_n_f64  (float64x1_t __a, float64_t __b)
   return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv2sf (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a,
+                                                   (int32_t)__b);
+}
+
+/* vmulq_n  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
+{
+  return __builtin_aarch64_mul_nv2df (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a,
+                                                   (int32_t)__b);
+}
+
+/* vmull_high_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a,
+                                                         (int16x4_t) __b,
+                                                         __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a,
+                                                         (int32x2_t) __b,
+                                                          __c);
+}
+
+/* vmull_high_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a,
+                                                          (int16x8_t)__b,
+                                                          __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a,
+                                                          (int32x4_t) __b,
+                                                           __c);
+}
+
+/* vmull_high_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_smull2_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_smull2_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull2_nv4si_uuu (__a, __b);
+}
+
+/* vmull_high  */
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_high_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+    return  __builtin_aarch64_pmull2v16qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_high_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_high_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b);
+}
+
+/* vmull_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+   return __builtin_aarch64_smull_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+   return __builtin_aarch64_smull_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+   return __builtin_aarch64_umull_nv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull_nv2si_uuu (__a, __b);
+}
+
+/* vmull  */
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return __builtin_aarch64_pmullv8qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return __builtin_aarch64_smullv8qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return __builtin_aarch64_smullv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return __builtin_aarch64_smullv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return __builtin_aarch64_umullv8qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return __builtin_aarch64_umullv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return __builtin_aarch64_umullv2si_uuu (__a, __b);
+}
+
+/* vmulx  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_lane_f32 (float32x2_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2sf (__a, __b, __c);
+}
+
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_fmulxdf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_fmulxv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2df (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_lane_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_lane_f64 (float64x2_t __a, float64x2_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2df (__a, __b, __c);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_fmulxsf (__a, __b);
+}
+
 /* vmulq_lane  */
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	(revision 221393)
+++ gcc/config/aarch64/iterators.md	(working copy)
@@ -277,6 +277,8 @@
     UNSPEC_SHA256SU1    ; Used in aarch64-simd.md.
     UNSPEC_PMULL        ; Used in aarch64-simd.md.
     UNSPEC_PMULL2       ; Used in aarch64-simd.md.
+    UNSPEC_FMULX        ; Used in aarch64-simd.md.
+    UNSPEC_FMULX_LANE   ; Used in aarch64-simd.md.
     UNSPEC_REV_REGLIST  ; Used in aarch64-simd.md.
 ])
 
@@ -468,6 +470,9 @@
 
 )
 
+(define_mode_attr VDQF_Q [(V2SF "V4SF") (V4SF "V4SF")
+                          (V2DF "V2DF")])
+
 ;; Widened mode register suffixes for VD_BHSI/VQW.
 (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
 			  (V2SI "2d") (V16QI "8h") 
Index: gcc/config/aarch64/aarch64-simd.md
===================================================================
--- gcc/config/aarch64/aarch64-simd.md	(revision 221393)
+++ gcc/config/aarch64/aarch64-simd.md	(working copy)
@@ -1400,6 +1400,252 @@
  }
 )
 
+(define_insn "aarch64_mul_n<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+        (mult:VMUL
+          (match_operand:VMUL 1 "register_operand" "w")
+          (vec_duplicate:VMUL
+            (match_operand:<VEL> 2 "register_operand" "<h_con>"))))]
+  "TARGET_SIMD"
+  "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_<su>mull_n<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (match_operand:<VEL> 2 "register_operand" "<vwx>")))))]
+  "TARGET_SIMD"
+  "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_simd_<su>mull2_n<mode>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (mult:<VWIDE> (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+                            (match_operand:VQ_HSI 1 "register_operand" "w")
+                            (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+                     (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF>
+                            (match_operand:<VEL> 2 "register_operand" "<vw>")))))]
+  "TARGET_SIMD"
+  "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull2_n<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand" ""))
+   (match_operand:<VEL> 2 "register_operand" "")]
+ "TARGET_SIMD"
+ {
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+   emit_insn (gen_aarch64_simd_<su>mull2_n<mode> (operands[0],
+                                                  operands[1],
+                                                  operands[2], p));
+   DONE;
+
+ }
+)
+
+(define_insn "aarch64_<su>mull_lane<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull_laneq<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_lane<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_laneq<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_smull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_smull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:VDQF 2 "register_operand" "w")]
+                      UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vtype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+        (unspec:GPF  [(match_operand:GPF 1 "register_operand" "w")
+                      (match_operand:GPF 2 "register_operand" "w")]
+                     UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%<s>0, %<s>1, %<s>2"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx_lane<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:<VDQF_Q> 2 "register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_FMULX_LANE))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>[%3]"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_pmull2v16qi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+       (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "w")
+                     (match_operand:V16QI 2 "register_operand" "w")]
+                    UNSPEC_PMULL2))]
+  "TARGET_SIMD"
+  "pmull2\\t%0.8h, %1.16b, %2.16b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
+(define_insn "aarch64_pmullv8qi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+        (unspec:V8HI  [(match_operand:V8QI 1 "register_operand" "w")
+                       (match_operand:V8QI 2 "register_operand" "w")]
+                      UNSPEC_PMULL))]
+ "TARGET_SIMD"
+ "pmull\\t%0.8h, %1.8b, %2.8b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
 ;; FP vector operations.
 ;; AArch64 AdvSIMD supports single-precision (32-bit) and 
 ;; double-precision (64-bit) floating-point data types and arithmetic as
Index: gcc/config/aarch64/aarch64-simd-builtins.def
===================================================================
--- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 221393)
+++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
@@ -187,6 +187,39 @@
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
 
+  /* Implemented by vec_widen_<su>mult_hi_<mode>.  */
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
+  /* Implemented by aarch64_<su>mull<mode>.  */
+  BUILTIN_VD_BHSI (BINOPU, umull, 0)
+  BUILTIN_VD_BHSI (BINOP, smull, 0)
+  /* Implemented by aarch64_<su>mull_n<mode>.  */
+  BUILTIN_VD_HSI (BINOP, smull_n, 0)
+  BUILTIN_VD_HSI (BINOPU, umull_n, 0)
+  /* Implemented by aarch64_mul_n<mode>.  */
+  BUILTIN_VMUL (BINOP, mul_n, 0)
+  /* Implemented by aarch64_<su>mull2_n<mode>.  */
+  BUILTIN_VQ_HSI (BINOP, smull2_n, 0)
+  BUILTIN_VQ_HSI (BINOPU, umull2_n, 0)
+  /* Implemented by aarch64_<su>mull_lane<q><mode>.  */
+  BUILTIN_VD_HSI (TERNOP, smull_lane, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_lane, 0)
+  BUILTIN_VD_HSI (TERNOP, smull_laneq, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_laneq, 0)
+  /* Implemented by aarch64_<su>mull2_lane<q><mode>.  */
+  BUILTIN_VQ_HSI (TERNOP, smull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP, smull2_laneq, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_laneq, 0)
+  /* Implemented by aarch64_fmulx<mode>.  */
+  BUILTIN_VDQF (BINOP, fmulx, 0)
+  BUILTIN_GPF (BINOP, fmulx, 0)
+  BUILTIN_VDQF (BINOP, fmulx_lane, 0)
+
+  /* Implemented by aarch64_pmull<2><mode>.*/
+  VAR1 (BINOPP, pmull, 0, v8qi)
+  VAR1 (BINOPP, pmull2, 0, v16qi)
+
   BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
   /* Implemented by aarch64_<sur>shl<mode>.  */
   BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PING^3] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
  2015-03-12 11:44 ` Kyrill Tkachov
  2015-03-14 10:09   ` Jiangjiji
@ 2015-04-11 10:38   ` Jiangjiji
  2015-05-05  6:37     ` James Greenhalgh
  2015-05-04  8:52   ` [PING^4] " Jiangjiji
  2 siblings, 1 reply; 8+ messages in thread
From: Jiangjiji @ 2015-04-11 10:38 UTC (permalink / raw)
  To: Jiangjiji, Kyrill Tkachov, gcc-patches, Richard Earnshaw,
	Yangfei (Felix),
	Marcus Shawcroft

Hi, 
  This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html
  Regtested with aarch64-linux-gnu on QEMU.
  This patch has no regressions for aarch64_be-linux-gnu big-endian target too. 
  OK for the trunk? 

Thanks.
Jiang jiji


----------
Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics

Hi, Kyrill
  Thank you for your suggestion. 
  I fixed it and regtested with aarch64-linux-gnu on QEMU.
  This patch has no regressions for aarch64_be-linux-gnu big-endian target too. 
  OK for the trunk? 

Thanks.
Jiang jiji



Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 221393)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,38 @@
+2015-03-14  Felix Yang  <felix.yang@huawei.com>
+	    Jiji Jiang  <jiangjiji@huawei.com>
+
+	* config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
+	aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
+	aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
+	aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
+	aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
+	aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
+	aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
+	aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
+	aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+	* config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+	vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+	umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+	umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+	smull2_lane): New builtins.
+	* config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+	vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+	vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+	vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+	vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+	vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+	vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+	vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+	vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+	vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+	vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+	vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+	vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+	vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+	using builtin functions.
+	* config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+	VDQF_Q): New unspec and int iterator.
+
 2015-03-12  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
 
 	PR rtl-optimization/65235
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h	(revision 221393)
+++ gcc/config/aarch64/arm_neon.h	(working copy)
@@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_s32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_s16 (int16x8_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_s32 (int32x4_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_high_p8 (poly8x16_t a, poly8x16_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_high_s8 (int8x16_t a, int8x16_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_s16 (int16x8_t a, int16x8_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_s32 (int32x4_t a, int32x4_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_high_u8 (uint8x16_t a, uint8x16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_lane_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_n_s16 (int16x4_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_n_s32 (int32x2_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_p8 (poly8x8_t a, poly8x8_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_s8 (int8x8_t a, int8x8_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_s16 (int16x4_t a, int16x4_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_s32 (int32x2_t a, int32x2_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_u8 (uint8x8_t a, uint8x8_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_u16 (uint16x4_t a, uint16x4_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_u32 (uint32x2_t a, uint32x2_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmulx %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulx_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fmulx %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmulx %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmulx %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulxq_lane_f32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulxq_lane_f64(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fmulx %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -18695,6 +18030,380 @@ vmul_n_f64  (float64x1_t __a, float64_t __b)
   return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv2sf (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a,
+                                                   (int32_t)__b);
+}
+
+/* vmulq_n  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
+{
+  return __builtin_aarch64_mul_nv2df (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a,
+                                                   (int32_t)__b);
+}
+
+/* vmull_high_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a,
+                                                         (int16x4_t) __b,
+                                                         __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a,
+                                                         (int32x2_t) __b,
+                                                          __c);
+}
+
+/* vmull_high_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a,
+                                                          (int16x8_t)__b,
+                                                          __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a,
+                                                          (int32x4_t) __b,
+                                                           __c);
+}
+
+/* vmull_high_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_smull2_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_smull2_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull2_nv4si_uuu (__a, __b);
+}
+
+/* vmull_high  */
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_high_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+    return  __builtin_aarch64_pmull2v16qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_high_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_high_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b);
+}
+
+/* vmull_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_lanev2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const unsigned int __c)
+{
+  return __builtin_aarch64_umull_laneqv2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+   return __builtin_aarch64_smull_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+   return __builtin_aarch64_smull_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+   return __builtin_aarch64_umull_nv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull_nv2si_uuu (__a, __b);
+}
+
+/* vmull  */
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return __builtin_aarch64_pmullv8qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return __builtin_aarch64_smullv8qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return __builtin_aarch64_smullv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return __builtin_aarch64_smullv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return __builtin_aarch64_umullv8qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return __builtin_aarch64_umullv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return __builtin_aarch64_umullv2si_uuu (__a, __b);
+}
+
+/* vmulx  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_lane_f32 (float32x2_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2sf (__a, __b, __c);
+}
+
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_fmulxdf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_fmulxv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2df (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_lane_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_lane_f64 (float64x2_t __a, float64x2_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2df (__a, __b, __c);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_fmulxsf (__a, __b);
+}
+
 /* vmulq_lane  */
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	(revision 221393)
+++ gcc/config/aarch64/iterators.md	(working copy)
@@ -277,6 +277,8 @@
     UNSPEC_SHA256SU1    ; Used in aarch64-simd.md.
     UNSPEC_PMULL        ; Used in aarch64-simd.md.
     UNSPEC_PMULL2       ; Used in aarch64-simd.md.
+    UNSPEC_FMULX        ; Used in aarch64-simd.md.
+    UNSPEC_FMULX_LANE   ; Used in aarch64-simd.md.
     UNSPEC_REV_REGLIST  ; Used in aarch64-simd.md.
 ])
 
@@ -468,6 +470,9 @@
 
 )
 
+(define_mode_attr VDQF_Q [(V2SF "V4SF") (V4SF "V4SF")
+                          (V2DF "V2DF")])
+
 ;; Widened mode register suffixes for VD_BHSI/VQW.
 (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
 			  (V2SI "2d") (V16QI "8h") 
Index: gcc/config/aarch64/aarch64-simd.md
===================================================================
--- gcc/config/aarch64/aarch64-simd.md	(revision 221393)
+++ gcc/config/aarch64/aarch64-simd.md	(working copy)
@@ -1400,6 +1400,252 @@
  }
 )
 
+(define_insn "aarch64_mul_n<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+        (mult:VMUL
+          (match_operand:VMUL 1 "register_operand" "w")
+          (vec_duplicate:VMUL
+            (match_operand:<VEL> 2 "register_operand" "<h_con>"))))]
+  "TARGET_SIMD"
+  "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_<su>mull_n<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (match_operand:<VEL> 2 "register_operand" "<vwx>")))))]
+  "TARGET_SIMD"
+  "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_simd_<su>mull2_n<mode>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (mult:<VWIDE> (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+                            (match_operand:VQ_HSI 1 "register_operand" "w")
+                            (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+                     (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF>
+                            (match_operand:<VEL> 2 "register_operand" "<vw>")))))]
+  "TARGET_SIMD"
+  "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull2_n<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand" ""))
+   (match_operand:<VEL> 2 "register_operand" "")]
+ "TARGET_SIMD"
+ {
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+   emit_insn (gen_aarch64_simd_<su>mull2_n<mode> (operands[0],
+                                                  operands[1],
+                                                  operands[2], p));
+   DONE;
+
+ }
+)
+
+(define_insn "aarch64_<su>mull_lane<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull_laneq<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_lane<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_laneq<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_smull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_smull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:VDQF 2 "register_operand" "w")]
+                      UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vtype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+        (unspec:GPF  [(match_operand:GPF 1 "register_operand" "w")
+                      (match_operand:GPF 2 "register_operand" "w")]
+                     UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%<s>0, %<s>1, %<s>2"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx_lane<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:<VDQF_Q> 2 "register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_FMULX_LANE))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>[%3]"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_pmull2v16qi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+       (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "w")
+                     (match_operand:V16QI 2 "register_operand" "w")]
+                    UNSPEC_PMULL2))]
+  "TARGET_SIMD"
+  "pmull2\\t%0.8h, %1.16b, %2.16b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
+(define_insn "aarch64_pmullv8qi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+        (unspec:V8HI  [(match_operand:V8QI 1 "register_operand" "w")
+                       (match_operand:V8QI 2 "register_operand" "w")]
+                      UNSPEC_PMULL))]
+ "TARGET_SIMD"
+ "pmull\\t%0.8h, %1.8b, %2.8b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
 ;; FP vector operations.
 ;; AArch64 AdvSIMD supports single-precision (32-bit) and 
 ;; double-precision (64-bit) floating-point data types and arithmetic as
Index: gcc/config/aarch64/aarch64-simd-builtins.def
===================================================================
--- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 221393)
+++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
@@ -187,6 +187,39 @@
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
 
+  /* Implemented by vec_widen_<su>mult_hi_<mode>.  */
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
+  /* Implemented by aarch64_<su>mull<mode>.  */
+  BUILTIN_VD_BHSI (BINOPU, umull, 0)
+  BUILTIN_VD_BHSI (BINOP, smull, 0)
+  /* Implemented by aarch64_<su>mull_n<mode>.  */
+  BUILTIN_VD_HSI (BINOP, smull_n, 0)
+  BUILTIN_VD_HSI (BINOPU, umull_n, 0)
+  /* Implemented by aarch64_mul_n<mode>.  */
+  BUILTIN_VMUL (BINOP, mul_n, 0)
+  /* Implemented by aarch64_<su>mull2_n<mode>.  */
+  BUILTIN_VQ_HSI (BINOP, smull2_n, 0)
+  BUILTIN_VQ_HSI (BINOPU, umull2_n, 0)
+  /* Implemented by aarch64_<su>mull_lane<q><mode>.  */
+  BUILTIN_VD_HSI (TERNOP, smull_lane, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_lane, 0)
+  BUILTIN_VD_HSI (TERNOP, smull_laneq, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_laneq, 0)
+  /* Implemented by aarch64_<su>mull2_lane<q><mode>.  */
+  BUILTIN_VQ_HSI (TERNOP, smull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP, smull2_laneq, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_laneq, 0)
+  /* Implemented by aarch64_fmulx<mode>.  */
+  BUILTIN_VDQF (BINOP, fmulx, 0)
+  BUILTIN_GPF (BINOP, fmulx, 0)
+  BUILTIN_VDQF (BINOP, fmulx_lane, 0)
+
+  /* Implemented by aarch64_pmull<2><mode>.*/
+  VAR1 (BINOPP, pmull, 0, v8qi)
+  VAR1 (BINOPP, pmull2, 0, v16qi)
+
   BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
   /* Implemented by aarch64_<sur>shl<mode>.  */
   BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)



^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PING^4] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
  2015-03-12 11:44 ` Kyrill Tkachov
  2015-03-14 10:09   ` Jiangjiji
  2015-04-11 10:38   ` [PING^3] " Jiangjiji
@ 2015-05-04  8:52   ` Jiangjiji
  2 siblings, 0 replies; 8+ messages in thread
From: Jiangjiji @ 2015-05-04  8:52 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc-patches, Richard Earnshaw, Marcus Shawcroft
  Cc: Yangfei (Felix)


Hi, 
  This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html
  Regtested with aarch64-linux-gnu on QEMU.
  This patch has no regressions for aarch64_be-linux-gnu big-endian target too. 
  OK for the trunk? 

Thanks.
Jiang jiji





^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PING^3] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
  2015-04-11 10:38   ` [PING^3] " Jiangjiji
@ 2015-05-05  6:37     ` James Greenhalgh
  2015-05-05 13:14       ` Jiangjiji
  0 siblings, 1 reply; 8+ messages in thread
From: James Greenhalgh @ 2015-05-05  6:37 UTC (permalink / raw)
  To: Jiangjiji
  Cc: Kyrylo Tkachov, gcc-patches, Richard Earnshaw, Yangfei (Felix),
	Marcus Shawcroft

On Sat, Apr 11, 2015 at 11:37:47AM +0100, Jiangjiji wrote:
> Hi,
>   This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html
>   Regtested with aarch64-linux-gnu on QEMU.
>   This patch has no regressions for aarch64_be-linux-gnu big-endian target too.
>   OK for the trunk?
> 
> Thanks.
> Jiang jiji
> 
> 
> ----------
> Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
> 
> Hi, Kyrill
>   Thank you for your suggestion.
>   I fixed it and regtested with aarch64-linux-gnu on QEMU.
>   This patch has no regressions for aarch64_be-linux-gnu big-endian target too.
>   OK for the trunk?

Hi Jiang,

I'm sorry that I've taken so long to get to this, I've been out of office
for several weeks. I have one comment.

> +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> +vmul_n_f32 (float32x2_t __a, float32_t __b)
> +{
> +  return __builtin_aarch64_mul_nv2sf (__a, __b);
> +}
> +

For vmul_n_* intrinsics, is there a reason we don't want to use the
GCC vector extension syntax to allow us to write these as:

  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vmul_n_f32 (float32x2_t __a, float32_t __b)
  {
    return __a * __b;
  }

It would be great if we could make that work.

Thanks,
James

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PING^3] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
  2015-05-05  6:37     ` James Greenhalgh
@ 2015-05-05 13:14       ` Jiangjiji
  2015-06-16  9:06         ` James Greenhalgh
  0 siblings, 1 reply; 8+ messages in thread
From: Jiangjiji @ 2015-05-05 13:14 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: Kyrylo Tkachov, gcc-patches, Richard Earnshaw, Yangfei (Felix),
	Marcus Shawcroft

Hi James,

Thanks for your comment.

Seems we need a 'dup' before 'fmul' if we use the GCC vector extension syntax way.

Example:
        dup     v1.2s, v1.s[0]
        fmul    v0.2s, v1.2s, v0.2s

And we need another pattern to combine this two insns into 'fmul %0.2s,%1.2s,%2.s[0]', which is kind of complex.

BTW: maybe it's better to reconsider this issue after this patch, right?


Thanks.
Jiang jiji



On Sat, Apr 11, 2015 at 11:37:47AM +0100, Jiangjiji wrote:
> Hi,
>   This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html
>   Regtested with aarch64-linux-gnu on QEMU.
>   This patch has no regressions for aarch64_be-linux-gnu big-endian target too.
>   OK for the trunk?
> 
> Thanks.
> Jiang jiji
> 
> 
> ----------
> Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
> 
> Hi, Kyrill
>   Thank you for your suggestion.
>   I fixed it and regtested with aarch64-linux-gnu on QEMU.
>   This patch has no regressions for aarch64_be-linux-gnu big-endian target too.
>   OK for the trunk?

Hi Jiang,

I'm sorry that I've taken so long to get to this, I've been out of office for several weeks. I have one comment.

> +__extension__ static __inline float32x2_t __attribute__ 
> +((__always_inline__))
> +vmul_n_f32 (float32x2_t __a, float32_t __b) {
> +  return __builtin_aarch64_mul_nv2sf (__a, __b); }
> +

For vmul_n_* intrinsics, is there a reason we don't want to use the GCC vector extension syntax to allow us to write these as:

  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vmul_n_f32 (float32x2_t __a, float32_t __b)
  {
    return __a * __b;
  }

It would be great if we could make that work.

Thanks,
James


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PING^3] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
  2015-05-05 13:14       ` Jiangjiji
@ 2015-06-16  9:06         ` James Greenhalgh
  0 siblings, 0 replies; 8+ messages in thread
From: James Greenhalgh @ 2015-06-16  9:06 UTC (permalink / raw)
  To: Jiangjiji
  Cc: Kyrylo Tkachov, gcc-patches, Richard Earnshaw, Yangfei (Felix),
	Marcus Shawcroft

On Tue, May 05, 2015 at 02:14:16PM +0100, Jiangjiji wrote:
> Hi James,
> 
> Thanks for your comment.
> 
> Seems we need a 'dup' before 'fmul' if we use the GCC vector extension syntax way.
> 
> Example:
>         dup     v1.2s, v1.s[0]
>         fmul    v0.2s, v1.2s, v0.2s
> 
> And we need another pattern to combine this two insns into 'fmul
> %0.2s,%1.2s,%2.s[0]', which is kind of complex.

This would be the best way to handle the intrinsic - then we can
potentially make a nice optimization of lots of additional code. I
think it would be best if we add that extra pattern to combine the two
instructions, then write the intrinsic as __a * __b.

Looking at the pattern you propose to add for vmul_n_f32, I'm not sure
why this does not currently work, maybe it is just as simple as swapping
the operand order in:

+(define_insn "aarch64_mul_n<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+        (mult:VMUL
+          (match_operand:VMUL 1 "register_operand" "w")
+          (vec_duplicate:VMUL
+            (match_operand:<VEL> 2 "register_operand" "<h_con>"))))]
+  "TARGET_SIMD"
+  "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+

To look like:

+(define_insn "aarch64_mul_n<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+        (mult:VMUL
+          (vec_duplicate:VMUL
+            (match_operand:<VEL> 2 "register_operand" "<h_con>"))
+          (match_operand:VMUL 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+

Which I think better matches the canonicalization rules for combine?


> BTW: maybe it's better to reconsider this issue after this patch, right?

Maybe, but I'd rather see this done now rather than later. Otherwise,
we have to add and remove the 

If you want to split your patch in to two parts, one part for the
vmul_n_* intrinsics, and one part for the other intrinsics, then I'd be
happy to review them as two separate patches.

Thanks,
James

> On Sat, Apr 11, 2015 at 11:37:47AM +0100, Jiangjiji wrote:
> > Hi,
> >   This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html
> >   Regtested with aarch64-linux-gnu on QEMU.
> >   This patch has no regressions for aarch64_be-linux-gnu big-endian target too.
> >   OK for the trunk?
> > 
> > Thanks.
> > Jiang jiji
> > 
> > 
> > ----------
> > Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
> > 
> > Hi, Kyrill
> >   Thank you for your suggestion.
> >   I fixed it and regtested with aarch64-linux-gnu on QEMU.
> >   This patch has no regressions for aarch64_be-linux-gnu big-endian target too.
> >   OK for the trunk?
> 
> Hi Jiang,
> 
> I'm sorry that I've taken so long to get to this, I've been out of office for several weeks. I have one comment.
> 
> > +__extension__ static __inline float32x2_t __attribute__ 
> > +((__always_inline__))
> > +vmul_n_f32 (float32x2_t __a, float32_t __b) {
> > +  return __builtin_aarch64_mul_nv2sf (__a, __b); }
> > +
> 
> For vmul_n_* intrinsics, is there a reason we don't want to use the GCC vector extension syntax to allow us to write these as:
> 
>   __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
>   vmul_n_f32 (float32x2_t __a, float32_t __b)
>   {
>     return __a * __b;
>   }
> 
> It would be great if we could make that work.
> 
> Thanks,
> James
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2015-06-16  9:05 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-12 11:16 [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics Jiangjiji
2015-03-12 11:44 ` Kyrill Tkachov
2015-03-14 10:09   ` Jiangjiji
2015-04-11 10:38   ` [PING^3] " Jiangjiji
2015-05-05  6:37     ` James Greenhalgh
2015-05-05 13:14       ` Jiangjiji
2015-06-16  9:06         ` James Greenhalgh
2015-05-04  8:52   ` [PING^4] " Jiangjiji

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).