From 5c8855d44db480772803b6395cd698c704353408 Mon Sep 17 00:00:00 2001
From: Matthew Wahab
Date: Tue, 5 Jul 2016 14:53:19 +0100
Subject: [PATCH] [ARM] Enable FP16 vector arithmetic operations.

Tested for arm-none-linux-gnueabihf with native bootstrap and make check
on ARMv8-A and for arm-none-eabi and armeb-none-eabi with cross-compiled
make check on an ARMv8.2-A emulator.

gcc/
2016-09-23  Matthew Wahab

	* config/arm/arm_neon.h (vadd_f16): Use standard arithmetic
	operations in fast-math mode.
	(vaddq_f16): Likewise.
	(vmul_f16): Likewise.
	(vmulq_f16): Likewise.
	(vsub_f16): Likewise.
	(vsubq_f16): Likewise.
	* config/arm/neon.md (add<mode>3): New.
	(sub<mode>3): New.
	(fma<VH:mode>4): New.  Also remove outdated comment.
	(mul<mode>3): New.

testsuite/
2016-09-23  Matthew Wahab

	* gcc.target/arm/armv8_2-fp16-arith-1.c: Expand comment.  Update
	expected output of vadd, vsub and vmul instructions.
	* gcc.target/arm/armv8_2-fp16-arith-2.c: New.
	* gcc.target/arm/armv8_2-fp16-neon-2.c: New.
	* gcc.target/arm/armv8_2-fp16-neon-3.c: New.
---
 gcc/config/arm/arm_neon.h                          |  24 +
 gcc/config/arm/neon.md                             |  52 ++-
 .../gcc.target/arm/armv8_2-fp16-arith-1.c          |  18 +-
 .../gcc.target/arm/armv8_2-fp16-arith-2.c          | 109 +++++
 gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c | 491 +++++++++++++++++++++
 gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-3.c | 108 +++++
 6 files changed, 796 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-3.c

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 54bbc7d..b19ed4f 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -14875,13 +14875,21 @@ vabsq_f16 (float16x8_t __a)
 __extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
 vadd_f16 (float16x4_t __a, float16x4_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a + __b;
+#else
   return __builtin_neon_vaddv4hf (__a, __b);
+#endif
 }

 __extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
 vaddq_f16 (float16x8_t __a, float16x8_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a + __b;
+#else
   return __builtin_neon_vaddv8hf (__a, __b);
+#endif
 }

 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
@@ -15319,7 +15327,11 @@ vminnmq_f16 (float16x8_t __a, float16x8_t __b)
 __extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
 vmul_f16 (float16x4_t __a, float16x4_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return __builtin_neon_vmulfv4hf (__a, __b);
+#endif
 }

 __extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
@@ -15337,7 +15349,11 @@ vmul_n_f16 (float16x4_t __a, float16_t __b)
 __extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
 vmulq_f16 (float16x8_t __a, float16x8_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return __builtin_neon_vmulfv8hf (__a, __b);
+#endif
 }

 __extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
@@ -15505,13 +15521,21 @@ vrsqrtsq_f16 (float16x8_t __a, float16x8_t __b)
 __extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
 vsub_f16 (float16x4_t __a, float16x4_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a - __b;
+#else
   return __builtin_neon_vsubv4hf (__a, __b);
+#endif
 }

 __extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
 vsubq_f16 (float16x8_t __a, float16x8_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a - __b;
+#else
   return __builtin_neon_vsubv8hf (__a, __b);
+#endif
 }

 #endif /* __ARM_FEATURE_VECTOR_FP16_ARITHMETIC. */

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 0532333..c7718d5 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -505,6 +505,23 @@
                     (const_string "neon_add")))]
 )

+;; As with SFmode, full support for HFmode vector arithmetic is only available
+;; when flag-unsafe-math-optimizations is enabled.
+
+(define_insn "add<mode>3"
+  [(set
+    (match_operand:VH 0 "s_register_operand" "=w")
+    (plus:VH
+     (match_operand:VH 1 "s_register_operand" "w")
+     (match_operand:VH 2 "s_register_operand" "w")))]
+ "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
+ "vadd.<V_if_elem>\t%0, %1, %2"
+ [(set (attr "type")
+   (if_then_else (match_test "<Is_float_mode>")
+    (const_string "neon_fp_addsub_s")
+    (const_string "neon_add")))]
+)
+
 (define_insn "add<mode>3_fp16"
   [(set
     (match_operand:VH 0 "s_register_operand" "=w")
@@ -557,6 +574,17 @@
                     (const_string "neon_sub")))]
 )

+(define_insn "sub<mode>3"
+ [(set
+  (match_operand:VH 0 "s_register_operand" "=w")
+  (minus:VH
+   (match_operand:VH 1 "s_register_operand" "w")
+   (match_operand:VH 2 "s_register_operand" "w")))]
+ "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
+ "vsub.<V_if_elem>\t%0, %1, %2"
+ [(set_attr "type" "neon_sub")]
+)
+
 (define_insn "sub<mode>3_fp16"
  [(set
   (match_operand:VH 0 "s_register_operand" "=w")
@@ -664,8 +692,17 @@
  [(set_attr "type" "neon_fp_mla_s")]
 )

-;; There is limited support for unsafe-math optimizations using the NEON FP16
-;; arithmetic instructions, so only the intrinsic is currently supported.
+(define_insn "fma<VH:mode>4"
+ [(set (match_operand:VH 0 "register_operand" "=w")
+   (fma:VH
+    (match_operand:VH 1 "register_operand" "w")
+    (match_operand:VH 2 "register_operand" "w")
+    (match_operand:VH 3 "register_operand" "0")))]
+ "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
+ "vfma.<V_if_elem>\\t%0, %1, %2"
+ [(set_attr "type" "neon_fp_mla_s")]
+)
+
 (define_insn "fma<VH:mode>4_intrinsic"
  [(set (match_operand:VH 0 "register_operand" "=w")
    (fma:VH
@@ -2169,6 +2206,17 @@
                     (const_string "neon_mul_")))]
 )

+(define_insn "mul<mode>3"
+ [(set
+  (match_operand:VH 0 "s_register_operand" "=w")
+  (mult:VH
+   (match_operand:VH 1 "s_register_operand" "w")
+   (match_operand:VH 2 "s_register_operand" "w")))]
+ "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
+ "vmul.f16\t%0, %1, %2"
+ [(set_attr "type" "neon_mul_")]
+)
+
 (define_insn "neon_vmulf<mode>"
  [(set
   (match_operand:VH 0 "s_register_operand" "=w")
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
index b88f43f..921d26e 100644
--- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
@@ -3,7 +3,8 @@
 /* { dg-options "-O2 -ffast-math" } */
 /* { dg-add-options arm_v8_2a_fp16_neon } */

-/* Test instructions generated for half-precision arithmetic. */
+/* Test instructions generated for half-precision arithmetic with
+   unsafe-math-optimizations enabled. */

 typedef __fp16 float16_t;
 typedef __simd64_float16_t float16x4_t;
 typedef __simd128_float16_t float16x8_t;
@@ -90,9 +91,18 @@ TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t)
 /* { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */
 /* { dg-final { scan-assembler-times {vabs\.f16\ts[0-9]+, s[0-9]+} 2 } } */

-/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
-/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
-/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
+/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vadd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vadd\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vsub\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vsub\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+
 /* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
 /* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } } */
 /* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } } */
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-2.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-2.c
new file mode 100644
index 0000000..24d0528
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-2.c
@@ -0,0 +1,109 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-O2 -fno-fast-math" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+
+/* Test instructions generated for half-precision arithmetic without
+   unsafe-math-optimizations. */
+
+typedef __fp16 float16_t;
+typedef __simd64_float16_t float16x4_t;
+typedef __simd128_float16_t float16x8_t;
+
+typedef short int16x4_t __attribute__ ((vector_size (8)));
+typedef short int int16x8_t __attribute__ ((vector_size (16)));
+
+float16_t
+fp16_abs (float16_t a)
+{
+  return (a < 0) ? -a : a;
+}
+
+#define TEST_UNOP(NAME, OPERATOR, TY)		\
+  TY test_##NAME##_##TY (TY a)			\
+  {						\
+    return OPERATOR (a);			\
+  }
+
+#define TEST_BINOP(NAME, OPERATOR, TY)		\
+  TY test_##NAME##_##TY (TY a, TY b)		\
+  {						\
+    return a OPERATOR b;			\
+  }
+
+#define TEST_CMP(NAME, OPERATOR, RTY, TY)	\
+  RTY test_##NAME##_##TY (TY a, TY b)		\
+  {						\
+    return a OPERATOR b;			\
+  }
+
+/* Scalars. */
+
+TEST_UNOP (neg, -, float16_t)
+TEST_UNOP (abs, fp16_abs, float16_t)
+
+TEST_BINOP (add, +, float16_t)
+TEST_BINOP (sub, -, float16_t)
+TEST_BINOP (mult, *, float16_t)
+TEST_BINOP (div, /, float16_t)
+
+TEST_CMP (equal, ==, int, float16_t)
+TEST_CMP (unequal, !=, int, float16_t)
+TEST_CMP (lessthan, <, int, float16_t)
+TEST_CMP (greaterthan, >, int, float16_t)
+TEST_CMP (lessthanequal, <=, int, float16_t)
+TEST_CMP (greaterthanqual, >=, int, float16_t)
+
+/* Vectors of size 4. */
+
+TEST_UNOP (neg, -, float16x4_t)
+
+TEST_BINOP (add, +, float16x4_t)
+TEST_BINOP (sub, -, float16x4_t)
+TEST_BINOP (mult, *, float16x4_t)
+TEST_BINOP (div, /, float16x4_t)
+
+TEST_CMP (equal, ==, int16x4_t, float16x4_t)
+TEST_CMP (unequal, !=, int16x4_t, float16x4_t)
+TEST_CMP (lessthan, <, int16x4_t, float16x4_t)
+TEST_CMP (greaterthan, >, int16x4_t, float16x4_t)
+TEST_CMP (lessthanequal, <=, int16x4_t, float16x4_t)
+TEST_CMP (greaterthanqual, >=, int16x4_t, float16x4_t)
+
+/* Vectors of size 8. */
+
+TEST_UNOP (neg, -, float16x8_t)
+
+TEST_BINOP (add, +, float16x8_t)
+TEST_BINOP (sub, -, float16x8_t)
+TEST_BINOP (mult, *, float16x8_t)
+TEST_BINOP (div, /, float16x8_t)
+
+TEST_CMP (equal, ==, int16x8_t, float16x8_t)
+TEST_CMP (unequal, !=, int16x8_t, float16x8_t)
+TEST_CMP (lessthan, <, int16x8_t, float16x8_t)
+TEST_CMP (greaterthan, >, int16x8_t, float16x8_t)
+TEST_CMP (lessthanequal, <=, int16x8_t, float16x8_t)
+TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t)
+
+/* { dg-final { scan-assembler-times {vneg\.f16\ts[0-9]+, s[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vneg\.f16\td[0-9]+, d[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
+/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
+/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
+/* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
+/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } } */
+
+/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } } */
+/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, #0} 2 } } */
+
+/* { dg-final { scan-assembler-not {vabs\.f16} } } */
+
+/* { dg-final { scan-assembler-not {vadd\.f32} } } */
+/* { dg-final { scan-assembler-not {vsub\.f32} } } */
+/* { dg-final { scan-assembler-not {vmul\.f32} } } */
+/* { dg-final { scan-assembler-not {vdiv\.f32} } } */
+/* { dg-final { scan-assembler-not {vcmp\.f16} } } */
+/* { dg-final { scan-assembler-not {vcmpe\.f16} } } */
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
new file mode 100644
index 0000000..6cd9354
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
@@ -0,0 +1,491 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-O2 -ffast-math" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+
+/* Test instructions generated for the FP16 vector intrinsics with
+   -ffast-math */
+
+#include <arm_neon.h>
+
+#define MSTRCAT(L, str) L##str
+
+#define UNOP_TEST(insn)				\
+  float16x4_t					\
+  MSTRCAT (test_##insn, _16x4) (float16x4_t a)	\
+  {						\
+    return MSTRCAT (insn, _f16) (a);		\
+  }						\
+  float16x8_t					\
+  MSTRCAT (test_##insn, _16x8) (float16x8_t a)	\
+  {						\
+    return MSTRCAT (insn, q_f16) (a);		\
+  }
+
+#define BINOP_TEST(insn)				\
+  float16x4_t						\
+  MSTRCAT (test_##insn, _16x4) (float16x4_t a, float16x4_t b)	\
+  {							\
+    return MSTRCAT (insn, _f16) (a, b);			\
+  }							\
+  float16x8_t						\
+  MSTRCAT (test_##insn, _16x8) (float16x8_t a, float16x8_t b)	\
+  {							\
+    return MSTRCAT (insn, q_f16) (a, b);		\
+  }
+
+#define BINOP_LANE_TEST(insn, I)			\
+  float16x4_t						\
+  MSTRCAT (test_##insn##_lane, _16x4) (float16x4_t a, float16x4_t b)	\
+  {							\
+    return MSTRCAT (insn, _lane_f16) (a, b, I);		\
+  }							\
+  float16x8_t						\
+  MSTRCAT (test_##insn##_lane, _16x8) (float16x8_t a,
float16x4_t b) \ + { \ + return MSTRCAT (insn, q_lane_f16) (a, b, I); \ + } + +#define BINOP_LANEQ_TEST(insn, I) \ + float16x4_t \ + MSTRCAT (test_##insn##_laneq, _16x4) (float16x4_t a, float16x8_t b) \ + { \ + return MSTRCAT (insn, _laneq_f16) (a, b, I); \ + } \ + float16x8_t \ + MSTRCAT (test_##insn##_laneq, _16x8) (float16x8_t a, float16x8_t b) \ + { \ + return MSTRCAT (insn, q_laneq_f16) (a, b, I); \ + } \ + +#define BINOP_N_TEST(insn) \ + float16x4_t \ + MSTRCAT (test_##insn##_n, _16x4) (float16x4_t a, float16_t b) \ + { \ + return MSTRCAT (insn, _n_f16) (a, b); \ + } \ + float16x8_t \ + MSTRCAT (test_##insn##_n, _16x8) (float16x8_t a, float16_t b) \ + { \ + return MSTRCAT (insn, q_n_f16) (a, b); \ + } + +#define TERNOP_TEST(insn) \ + float16_t \ + MSTRCAT (test_##insn, _16) (float16_t a, float16_t b, float16_t c) \ + { \ + return MSTRCAT (insn, h_f16) (a, b, c); \ + } \ + float16x4_t \ + MSTRCAT (test_##insn, _16x4) (float16x4_t a, float16x4_t b, \ + float16x4_t c) \ + { \ + return MSTRCAT (insn, _f16) (a, b, c); \ + } \ + float16x8_t \ + MSTRCAT (test_##insn, _16x8) (float16x8_t a, float16x8_t b, \ + float16x8_t c) \ + { \ + return MSTRCAT (insn, q_f16) (a, b, c); \ + } + +#define VCMP1_TEST(insn) \ + uint16x4_t \ + MSTRCAT (test_##insn, _16x4) (float16x4_t a) \ + { \ + return MSTRCAT (insn, _f16) (a); \ + } \ + uint16x8_t \ + MSTRCAT (test_##insn, _16x8) (float16x8_t a) \ + { \ + return MSTRCAT (insn, q_f16) (a); \ + } + +#define VCMP2_TEST(insn) \ + uint16x4_t \ + MSTRCAT (test_##insn, _16x4) (float16x4_t a, float16x4_t b) \ + { \ + return MSTRCAT (insn, _f16) (a, b); \ + } \ + uint16x8_t \ + MSTRCAT (test_##insn, _16x8) (float16x8_t a, float16x8_t b) \ + { \ + return MSTRCAT (insn, q_f16) (a, b); \ + } + +#define VCVT_TEST(insn, TY, TO, FR) \ + MSTRCAT (TO, 16x4_t) \ + MSTRCAT (test_##insn, TY) (MSTRCAT (FR, 16x4_t) a) \ + { \ + return MSTRCAT (insn, TY) (a); \ + } \ + MSTRCAT (TO, 16x8_t) \ + MSTRCAT (test_##insn##_q, TY) (MSTRCAT (FR, 16x8_t) a) \ + { \ + return MSTRCAT (insn, q##TY) (a); \ + } + +#define VCVT_N_TEST(insn, TY, TO, FR) \ + MSTRCAT (TO, 16x4_t) \ + MSTRCAT (test_##insn##_n, TY) (MSTRCAT (FR, 16x4_t) a) \ + { \ + return MSTRCAT (insn, _n##TY) (a, 1); \ + } \ + MSTRCAT (TO, 16x8_t) \ + MSTRCAT (test_##insn##_n_q, TY) (MSTRCAT (FR, 16x8_t) a) \ + { \ + return MSTRCAT (insn, q_n##TY) (a, 1); \ + } + +VCMP1_TEST (vceqz) +/* { dg-final { scan-assembler-times {vceq\.f16\td[0-9]+, d[0-0]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vceq\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCMP1_TEST (vcgtz) +/* { dg-final { scan-assembler-times {vcgt\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vceq\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCMP1_TEST (vcgez) +/* { dg-final { scan-assembler-times {vcge\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vcge\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCMP1_TEST (vcltz) +/* { dg-final { scan-assembler-times {vclt.f16\td[0-9]+, d[0-9]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vclt.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCMP1_TEST (vclez) +/* { dg-final { scan-assembler-times {vcle\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vcle\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCVT_TEST (vcvt, _f16_s16, float, int) +VCVT_N_TEST (vcvt, _f16_s16, float, int) +/* { dg-final { scan-assembler-times {vcvt\.f16\.s16\td[0-9]+, d[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.f16\.s16\tq[0-9]+, q[0-9]+} 2 } } + { dg-final { 
scan-assembler-times {vcvt\.f16\.s16\td[0-9]+, d[0-9]+, #1} 1 } } + { dg-final { scan-assembler-times {vcvt\.f16\.s16\tq[0-9]+, q[0-9]+, #1} 1 } } */ + +VCVT_TEST (vcvt, _f16_u16, float, uint) +VCVT_N_TEST (vcvt, _f16_u16, float, uint) +/* { dg-final { scan-assembler-times {vcvt\.f16\.u16\td[0-9]+, d[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.f16\.u16\tq[0-9]+, q[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.f16\.u16\td[0-9]+, d[0-9]+, #1} 1 } } + { dg-final { scan-assembler-times {vcvt\.f16\.u16\tq[0-9]+, q[0-9]+, #1} 1 } } */ + +VCVT_TEST (vcvt, _s16_f16, int, float) +VCVT_N_TEST (vcvt, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvt\.s16\.f16\td[0-9]+, d[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.s16\.f16\tq[0-9]+, q[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.s16\.f16\td[0-9]+, d[0-9]+, #1} 1 } } + { dg-final { scan-assembler-times {vcvt\.s16\.f16\tq[0-9]+, q[0-9]+, #1} 1 } } */ + +VCVT_TEST (vcvt, _u16_f16, uint, float) +VCVT_N_TEST (vcvt, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvt\.u16\.f16\td[0-9]+, d[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.u16\.f16\tq[0-9]+, q[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.u16\.f16\td[0-9]+, d[0-9]+, #1} 1 } } + { dg-final { scan-assembler-times {vcvt\.u16\.f16\tq[0-9]+, q[0-9]+, #1} 1 } } */ + +VCVT_TEST (vcvta, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvta\.s16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvta\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvta, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvta\.u16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvta\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtm, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvtm\.s16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtm\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtm, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvtm\.u16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtm\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtn, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvtn\.s16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtn\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtn, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvtn\.u16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtn\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtp, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvtp\.s16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtp\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtp, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvtp\.u16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtp\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +UNOP_TEST (vabs) +/* { dg-final { scan-assembler-times {vabs\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vabs\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vneg) +/* { dg-final { scan-assembler-times {vneg\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrecpe) +/* { dg-final { scan-assembler-times {vrecpe\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrecpe\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrnd) +/* { 
dg-final { scan-assembler-times {vrintz\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintz\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrnda) +/* { dg-final { scan-assembler-times {vrinta\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrinta\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrndm) +/* { dg-final { scan-assembler-times {vrintm\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintm\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrndn) +/* { dg-final { scan-assembler-times {vrintn\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintn\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrndp) +/* { dg-final { scan-assembler-times {vrintp\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintp\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrndx) +/* { dg-final { scan-assembler-times {vrintx\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintx\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrsqrte) +/* { dg-final { scan-assembler-times {vrsqrte\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrsqrte\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vadd) +/* { dg-final { scan-assembler-times {vadd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vadd\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vabd) +/* { dg-final { scan-assembler-times {vabd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vabd\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcage) +/* { dg-final { scan-assembler-times {vacge\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vacge\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcagt) +/* { dg-final { scan-assembler-times {vacgt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vacgt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcale) +/* { dg-final { scan-assembler-times {vacle\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vacle\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcalt) +/* { dg-final { scan-assembler-times {vaclt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vaclt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vceq) +/* { dg-final { scan-assembler-times {vceq\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vceq\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcge) +/* { dg-final { scan-assembler-times {vcge\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcge\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcgt) +/* { dg-final { scan-assembler-times {vcgt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcgt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcle) +/* { dg-final { scan-assembler-times {vcle\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcle\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vclt) +/* { dg-final { scan-assembler-times {vclt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vclt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vmax) +/* { dg-final { scan-assembler-times {vmax\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vmax\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vmin) +/* { dg-final { scan-assembler-times {vmin\.f16\td[0-9]+, d[0-9]+, 
d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vmin\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vmaxnm) +/* { dg-final { scan-assembler-times {vmaxnm\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vmaxnm\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vminnm) +/* { dg-final { scan-assembler-times {vminnm\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vminnm\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vmul) +/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } } + { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +BINOP_LANE_TEST (vmul, 2) +/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[2\]} 1 } } + { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]} 1 } } */ +BINOP_N_TEST (vmul) +/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]} 1 } } + { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]} 1 } }*/ + +float16x4_t +test_vpadd_16x4 (float16x4_t a, float16x4_t b) +{ + return vpadd_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vpadd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */ + +float16x4_t +test_vpmax_16x4 (float16x4_t a, float16x4_t b) +{ + return vpmax_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vpmax\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */ + +float16x4_t +test_vpmin_16x4 (float16x4_t a, float16x4_t b) +{ + return vpmin_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vpmin\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */ + +BINOP_TEST (vsub) +/* { dg-final { scan-assembler-times {vsub\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vsub\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vrecps) +/* { dg-final { scan-assembler-times {vrecps\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrecps\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vrsqrts) +/* { dg-final { scan-assembler-times {vrsqrts\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrsqrts\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +TERNOP_TEST (vfma) +/* { dg-final { scan-assembler-times {vfma\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vfma\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +TERNOP_TEST (vfms) +/* { dg-final { scan-assembler-times {vfms\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vfms\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +float16x4_t +test_vmov_n_f16 (float16_t a) +{ + return vmov_n_f16 (a); +} + +float16x4_t +test_vdup_n_f16 (float16_t a) +{ + return vdup_n_f16 (a); +} +/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 2 } } */ + +float16x8_t +test_vmovq_n_f16 (float16_t a) +{ + return vmovq_n_f16 (a); +} + +float16x8_t +test_vdupq_n_f16 (float16_t a) +{ + return vdupq_n_f16 (a); +} +/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, r[0-9]+} 2 } } */ + +float16x4_t +test_vdup_lane_f16 (float16x4_t a) +{ + return vdup_lane_f16 (a, 1); +} +/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, d[0-9]+\[1\]} 1 } } */ + +float16x8_t +test_vdupq_lane_f16 (float16x4_t a) +{ + return vdupq_lane_f16 (a, 1); +} +/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, d[0-9]+\[1\]} 1 } } */ + +float16x4_t +test_vext_f16 (float16x4_t a, float16x4_t b) +{ + return vext_f16 (a, b, 1); +} +/* { dg-final { scan-assembler-times {vext\.16\td[0-9]+, d[0-9]+, 
d[0-9]+, #1} 1 } } */ + +float16x8_t +test_vextq_f16 (float16x8_t a, float16x8_t b) +{ + return vextq_f16 (a, b, 1); +} +/* { dg-final { scan-assembler-times {vext\.16\tq[0-9]+, q[0-9]+, q[0-9]+, #1} 1 } } */ + +UNOP_TEST (vrev64) +/* { dg-final { scan-assembler-times {vrev64\.16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrev64\.16\tq[0-9]+, q[0-9]+} 1 } } */ + +float16x4_t +test_vbsl16x4 (uint16x4_t a, float16x4_t b, float16x4_t c) +{ + return vbsl_f16 (a, b, c); +} +/* { dg-final { scan-assembler-times {vbsl\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */ + +float16x8_t +test_vbslq16x8 (uint16x8_t a, float16x8_t b, float16x8_t c) +{ + return vbslq_f16 (a, b, c); +} +/*{ dg-final { scan-assembler-times {vbsl\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +float16x4x2_t +test_vzip16x4 (float16x4_t a, float16x4_t b) +{ + return vzip_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vzip\.16\td[0-9]+, d[0-9]+} 1 } } */ + +float16x8x2_t +test_vzipq16x8 (float16x8_t a, float16x8_t b) +{ + return vzipq_f16 (a, b); +} +/*{ dg-final { scan-assembler-times {vzip\.16\tq[0-9]+, q[0-9]+} 1 } } */ + +float16x4x2_t +test_vuzp16x4 (float16x4_t a, float16x4_t b) +{ + return vuzp_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vuzp\.16\td[0-9]+, d[0-9]+} 1 } } */ + +float16x8x2_t +test_vuzpq16x8 (float16x8_t a, float16x8_t b) +{ + return vuzpq_f16 (a, b); +} +/*{ dg-final { scan-assembler-times {vuzp\.16\tq[0-9]+, q[0-9]+} 1 } } */ + +float16x4x2_t +test_vtrn16x4 (float16x4_t a, float16x4_t b) +{ + return vtrn_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vtrn\.16\td[0-9]+, d[0-9]+} 1 } } */ + +float16x8x2_t +test_vtrnq16x8 (float16x8_t a, float16x8_t b) +{ + return vtrnq_f16 (a, b); +} +/*{ dg-final { scan-assembler-times {vtrn\.16\tq[0-9]+, q[0-9]+} 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-3.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-3.c new file mode 100644 index 0000000..20366e2 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-3.c @@ -0,0 +1,108 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */ +/* { dg-options "-O2 -ffast-math" } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ + +/* Test compiler use of FP16 FMA/FMS instructions with -ffast-math. 
*/
+
+#include <arm_neon.h>
+
+float16x4_t
+test_vfma_1 (float16x4_t a, float16x4_t b, float16x4_t c)
+{
+  return vadd_f16 (vmul_f16 (a, b), c);
+}
+
+float16x4_t
+test_vfma_2 (float16x4_t a, float16x4_t b, float16x4_t c)
+{
+  return vsub_f16 (vmul_f16 (a, b), vneg_f16 (c));
+}
+
+float16x4_t
+test_vfma_3 (float16x4_t a, float16x4_t b, float16x4_t c)
+{
+  return vsub_f16 (vmul_f16 (vneg_f16 (a), vneg_f16 (b)), vneg_f16 (c));
+}
+
+float16x4_t
+test_vfma_4 (float16x4_t a, float16x4_t b, float16x4_t c)
+{
+  return vsub_f16 (vmul_f16 (a, b), vneg_f16 (c));
+}
+/* { dg-final { scan-assembler-times {vfma\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 4 } } */
+
+float16x8_t
+test_vfmaq_1 (float16x8_t a, float16x8_t b, float16x8_t c)
+{
+  return vaddq_f16 (vmulq_f16 (a, b), c);
+}
+
+float16x8_t
+test_vfmaq_2 (float16x8_t a, float16x8_t b, float16x8_t c)
+{
+  return vsubq_f16 (vmulq_f16 (a, b), vnegq_f16 (c));
+}
+
+float16x8_t
+test_vfmaq_3 (float16x8_t a, float16x8_t b, float16x8_t c)
+{
+  return vsubq_f16 (vmulq_f16 (vnegq_f16 (a), vnegq_f16 (b)), vnegq_f16 (c));
+}
+
+float16x8_t
+test_vfmaq_4 (float16x8_t a, float16x8_t b, float16x8_t c)
+{
+  return vsubq_f16 (vmulq_f16 (a, b), vnegq_f16 (c));
+}
+/* { dg-final { scan-assembler-times {vfma\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 4 } } */
+
+float16x4_t
+test_vfms_1 (float16x4_t a, float16x4_t b, float16x4_t c)
+{
+  return vsub_f16 (c, vmul_f16 (a, b));
+}
+
+float16x4_t
+test_vfms_2 (float16x4_t a, float16x4_t b, float16x4_t c)
+{
+  return vsub_f16 (a, vmul_f16 (b, c));
+}
+
+float16x4_t
+test_vfms_3 (float16x4_t a, float16x4_t b, float16x4_t c)
+{
+  return vadd_f16 (vmul_f16 (vneg_f16 (a), b), c);
+}
+
+float16x4_t
+test_vfms_4 (float16x4_t a, float16x4_t b, float16x4_t c)
+{
+  return vadd_f16 (vmul_f16 (a, vneg_f16 (b)), c);
+}
+/* { dg-final { scan-assembler-times {vfms\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 4 } } */
+
+float16x8_t
+test_vfmsq_1 (float16x8_t a, float16x8_t b, float16x8_t c)
+{
+  return vsubq_f16 (c, vmulq_f16 (a, b));
+}
+
+float16x8_t
+test_vfmsq_2 (float16x8_t a, float16x8_t b, float16x8_t c)
+{
+  return vsubq_f16 (a, vmulq_f16 (b, c));
+}
+
+float16x8_t
+test_vfmsq_3 (float16x8_t a, float16x8_t b, float16x8_t c)
+{
+  return vaddq_f16 (vmulq_f16 (vnegq_f16 (a), b), c);
+}
+
+float16x8_t
+test_vfmsq_4 (float16x8_t a, float16x8_t b, float16x8_t c)
+{
+  return vaddq_f16 (vmulq_f16 (a, vnegq_f16 (b)), c);
+}
+/* { dg-final { scan-assembler-times {vfms\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 4 } } */
-- 
2.1.4
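
[Editor's note, not part of the patch] For readers following the change: with -ffast-math (which defines __FAST_MATH__), the vadd_f16/vsub_f16/vmul_f16 intrinsics above lower to plain C vector arithmetic, so the compiler can combine them and match the new add<mode>3, sub<mode>3, mul<mode>3 and fma<VH:mode>4 patterns. The sketch below is hypothetical user code mirroring the shape exercised by test_vfmaq_1 in armv8_2-fp16-neon-3.c; on an ARMv8.2-A FP16 target built at -O2 -ffast-math it is expected to contract to a single vfma.f16 on q registers, while without -ffast-math it keeps the separate vmul.f16 and vadd.f16 emitted through the intrinsic builtins.

#include <arm_neon.h>

/* Hypothetical example (not from the patch): multiply-accumulate on
   eight half-precision lanes using the intrinsics this patch touches.  */
float16x8_t
scale_and_accumulate (float16x8_t acc, float16x8_t x, float16x8_t scale)
{
  /* With -ffast-math this becomes plain vector arithmetic and may be
     fused into one vfma.f16; otherwise it stays vmul.f16 + vadd.f16.  */
  return vaddq_f16 (vmulq_f16 (x, scale), acc);
}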
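[Editor's note, not part of the patch] The armv8_2-fp16-arith-2.c test relies on GCC's generic vector semantics for __simd64_float16_t/__simd128_float16_t rather than on intrinsics. For illustration, one instantiation of its TEST_BINOP macro expands to the function below.

/* TEST_BINOP (add, +, float16x4_t) expands to:  */
float16x4_t
test_add_float16x4_t (float16x4_t a, float16x4_t b)
{
  return a + b;
}

Because that test is built with -fno-fast-math, the new flag_unsafe_math_optimizations-gated patterns must not match, so the vector additions are scalarized; that is why the test expects 13 scalar vadd.f16 instructions (1 scalar case + 4 lanes + 8 lanes) and no d- or q-register forms.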