Hi All,

The backend has an existing V2HFmode that is used by pairwise operations.
This mode was however never made fully functional.  Amongst other things
it was never declared as a vector type, which made it unusable from the
mid-end.

It's also lacking an implementation for loads/stores, so reload ICEs if
this mode is ever used.  This patch finishes the implementation by
providing both.

Note that I have created a new iterator VHSDF_P instead of extending
VHSDF because the existing iterator is used in far more places than just
loads/stores.  It is, for instance, also used in intrinsics, and
extending it would force me to provide support for mangling the type
while we never expose it through intrinsics.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar
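For reviewers who want to poke at this, a minimal sketch of the kind of
code that exercises the new support (this is illustrative only, not taken
from the patch or its testsuite; the typedef and function name are made
up, and it assumes something like -march=armv8.2-a+fp16):

  /* A 4-byte, two-lane _Float16 vector; once V2HF is declared as a
     vector type, the mid-end can use this mode directly.  */
  typedef _Float16 v2hf __attribute__ ((vector_size (4)));

  v2hf
  swap_through_mem (v2hf *p, v2hf x)
  {
    v2hf tmp = *p;  /* 32-bit vector load, e.g. an ldr of an s register.  */
    *p = x;         /* 32-bit vector store, e.g. an str of an s register.  */
    return tmp;
  }

Before the patch there were no load/store patterns for V2HF to fall back
on, which is where the reload ICE came from.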
gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
	aarch64_reduc_<optab>_internal<mode>, aarch64_get_lane<mode>,
	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
	Add E_V2HFmode.
	* config/aarch64/iterators.md (VHSDF_P): New.
	(V2F, VALL_F16_FULL, nunits, Vtype, Vmtype, Vetype, stype, VEL,
	Vel, q, vp): Add V2HF.
	* config/arm/types.md (neon_fp_reduc_add_h): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/slp_1.c: Update testcase.

--- inline copy of patch --
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 25aed74f8cf939562ed65a578fe32ca76605b58a..93a2888f567460ad10ec050ea7d4f701df4729d1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,10 +19,10 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
+	(match_operand:VALL_F16_FULL 1 "general_operand"))]
   "TARGET_SIMD"
-  "
+{
   /* Force the operand into a register if it is not an
      immediate whose use can be replaced with xzr.
      If the mode is 16 bytes wide, then we will be doing
@@ -46,12 +46,11 @@ (define_expand "mov<mode>"
       aarch64_expand_vector_init (operands[0], operands[1]);
       DONE;
     }
-  "
-)
+})
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
+	(match_operand:VALL_F16_FULL 1 "general_operand"))]
   "TARGET_SIMD && !STRICT_ALIGNMENT"
 {
   /* This pattern is not permitted to fail during expansion: if both arguments
@@ -85,10 +84,10 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_dup_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_duplicate:VALL_F16
+  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
+	(vec_duplicate:VALL_F16_FULL
 	  (vec_select:<VEL>
-	    (match_operand:VALL_F16 1 "register_operand" "w")
+	    (match_operand:VALL_F16_FULL 1 "register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
 	  )))]
   "TARGET_SIMD"
@@ -142,6 +141,29 @@ (define_insn "*aarch64_simd_mov<VDMOV:mode>"
			     mov_reg, neon_move<q>")]
 )
 
+(define_insn "*aarch64_simd_movv2hf"
+  [(set (match_operand:V2HF 0 "nonimmediate_operand"
+		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
+	(match_operand:V2HF 1 "general_operand"
+		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
+  "TARGET_SIMD_F16INST
+   && (register_operand (operands[0], V2HFmode)
+       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
+   "@
+    ldr\\t%s0, %1
+    str\\twzr, %0
+    str\\t%s1, %0
+    mov\\t%0.2s[0], %1.2s[0]
+    umov\\t%w0, %1.s[0]
+    fmov\\t%s0, %1
+    mov\\t%0, %1
+    movi\\t%d0, 0
+    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
+  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
+		     neon_logic, neon_to_gp, f_mcr,\
+		     mov_reg, neon_move, neon_move")]
+)
+
 (define_insn "*aarch64_simd_mov<VQMOV:mode>"
   [(set (match_operand:VQMOV 0 "nonimmediate_operand"
		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
@@ -182,7 +204,7 @@ (define_insn "*aarch64_simd_mov<VQMOV:mode>"
 
 (define_insn "aarch64_store_lane0<mode>"
   [(set (match_operand:<VEL> 0 "memory_operand" "=m")
-	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
+	(vec_select:<VEL> (match_operand:VALL_F16_FULL 1 "register_operand" "w")
			(parallel [(match_operand 2 "const_int_operand" "n")])))]
   "TARGET_SIMD
    && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
@@ -1035,11 +1057,11 @@ (define_insn "one_cmpl<mode>2"
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w,w,w")
+	(vec_merge:VALL_F16_FULL
+	    (vec_duplicate:VALL_F16_FULL
	    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
-	(match_operand:VALL_F16 3 "register_operand" "0,0,0")
+	(match_operand:VALL_F16_FULL 3 "register_operand" "0,0,0")
	(match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
 {
@@ -1061,14 +1083,14 @@ (define_insn "aarch64_simd_vec_set<mode>"
 )
 
 (define_insn "@aarch64_simd_vec_copy_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
+	(vec_merge:VALL_F16_FULL
+	    (vec_duplicate:VALL_F16_FULL
	      (vec_select:<VEL>
-		(match_operand:VALL_F16 3 "register_operand" "w")
+		(match_operand:VALL_F16_FULL 3 "register_operand" "w")
		(parallel
		  [(match_operand:SI 4 "immediate_operand" "i")])))
-	    (match_operand:VALL_F16 1 "register_operand" "0")
+	    (match_operand:VALL_F16_FULL 1 "register_operand" "0")
	    (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD"
 {
@@ -1376,7 +1398,7 @@ (define_insn "vec_shr_<mode>"
(define_insn "vec_shr_" ) (define_expand "vec_set" - [(match_operand:VALL_F16 0 "register_operand") + [(match_operand:VALL_F16_FULL 0 "register_operand") (match_operand: 1 "aarch64_simd_nonimmediate_operand") (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" @@ -3503,7 +3525,7 @@ (define_insn "popcount2" ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function. (This is FP smax/smin). (define_expand "reduc__scal_" [(match_operand: 0 "register_operand") - (unspec: [(match_operand:VHSDF 1 "register_operand")] + (unspec: [(match_operand:VHSDF_P 1 "register_operand")] FMAXMINV)] "TARGET_SIMD" { @@ -3518,7 +3540,7 @@ (define_expand "reduc__scal_" (define_expand "reduc__scal_" [(match_operand: 0 "register_operand") - (unspec: [(match_operand:VHSDF 1 "register_operand")] + (unspec: [(match_operand:VHSDF_P 1 "register_operand")] FMAXMINNMV)] "TARGET_SIMD" { @@ -3562,8 +3584,8 @@ (define_insn "aarch64_reduc__internalv2si" ) (define_insn "aarch64_reduc__internal" - [(set (match_operand:VHSDF 0 "register_operand" "=w") - (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")] + [(set (match_operand:VHSDF_P 0 "register_operand" "=w") + (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand" "w")] FMAXMINV))] "TARGET_SIMD" "\\t%0, %1." @@ -4208,7 +4230,7 @@ (define_insn "*aarch64_get_lane_zero_extend" (define_insn_and_split "aarch64_get_lane" [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv") (vec_select: - (match_operand:VALL_F16 1 "register_operand" "w, w, w") + (match_operand:VALL_F16_FULL 1 "register_operand" "w, w, w") (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))] "TARGET_SIMD" { @@ -7989,7 +8011,7 @@ (define_expand "aarch64_st1" ;; Standard pattern name vec_init. (define_expand "vec_init" - [(match_operand:VALL_F16 0 "register_operand") + [(match_operand:VALL_F16_FULL 0 "register_operand") (match_operand 1 "" "")] "TARGET_SIMD" { @@ -8068,7 +8090,7 @@ (define_insn "aarch64_urecpe" (define_expand "vec_extract" [(match_operand: 0 "aarch64_simd_nonimmediate_operand") - (match_operand:VALL_F16 1 "register_operand") + (match_operand:VALL_F16_FULL 1 "register_operand") (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" { diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f05bac713e88ea8c7feaa2367d55bd523ca66f57..1e08f8453688210afe1566092b19b59c9bdd0c97 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3566,6 +3566,7 @@ aarch64_classify_vector_mode (machine_mode mode) case E_V8BFmode: case E_V4SFmode: case E_V2DFmode: + case E_V2HFmode: return TARGET_SIMD ? VEC_ADVSIMD : 0; default: diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 37d8161a33b1c399d80be82afa67613a087389d4..1df09f7fe2eb35aed96113476541e0faa5393551 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -160,6 +160,10 @@ (define_mode_iterator VDQF [V2SF V4SF V2DF]) (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST") (V8HF "TARGET_SIMD_F16INST") V2SF V4SF V2DF]) +;; Advanced SIMD Float modes suitable for pairwise operations. +(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST") + (V8HF "TARGET_SIMD_F16INST") + V2SF V4SF V2DF (V2HF "TARGET_SIMD_F16INST")]) ;; Advanced SIMD Float modes, and DF. (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF]) @@ -188,15 +192,23 @@ (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI]) (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF]) ;; Advanced SIMD Float modes with 2 elements. 
-(define_mode_iterator V2F [V2SF V2DF])
+(define_mode_iterator V2F [V2SF V2DF V2HF])
 
 ;; All Advanced SIMD modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
-;; All Advanced SIMD modes suitable for moving, loading, and storing.
+;; All Advanced SIMD modes suitable for moving, loading, and storing
+;; except V2HF.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing
+;; including V2HF
+(define_mode_iterator VALL_F16_FULL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
+				     (V2HF "TARGET_SIMD_F16INST")])
+
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
				V4HF V8HF V2SF V4SF])
@@ -1076,7 +1088,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
			  (V2SF "2") (V4SF "4")
			  (V1DF "1") (V2DF "2")
			  (DI "1") (DF "1")
-			  (V8DI "8")])
+			  (V8DI "8") (V2HF "2")])
 
 ;; Map a mode to the number of bits in it, if the size of the mode
 ;; is constant.
@@ -1090,6 +1102,7 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
 
 ;; Give the length suffix letter for a sign- or zero-extension.
 (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
+(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
 
 ;; Give the number of bits in the mode
 (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")])
@@ -1134,8 +1147,9 @@ (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
			 (V2SI "2s") (V4SI  "4s")
			 (DI   "1d") (DF    "1d")
			 (V2DI "2d") (V2SF "2s")
-			 (V4SF "4s") (V2DF "2d")
-			 (V4HF "4h") (V8HF "8h")
+			 (V2HF "2h") (V4SF "4s")
+			 (V2DF "2d") (V4HF "4h")
+			 (V8HF "8h")
			 (V2x8QI "8b") (V2x4HI "4h")
			 (V2x2SI "2s") (V2x1DI "1d")
			 (V2x4HF "4h") (V2x2SF "2s")
@@ -1175,9 +1189,10 @@ (define_mode_attr Vmtype [(V8QI ".8b") (V16QI ".16b")
			 (V4HI ".4h") (V8HI  ".8h")
			 (V2SI ".2s") (V4SI  ".4s")
			 (V2DI ".2d") (V4HF ".4h")
-			 (V8HF ".8h") (V4BF ".4h")
-			 (V8BF ".8h") (V2SF ".2s")
-			 (V4SF ".4s") (V2DF ".2d")
+			 (V8HF ".8h") (V2HF ".2h")
+			 (V4BF ".4h") (V8BF ".8h")
+			 (V2SF ".2s") (V4SF ".4s")
+			 (V2DF ".2d")
			 (DI "") (SI "")
			 (HI "") (QI "")
			 (TI "") (HF "")
@@ -1193,7 +1208,7 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
 (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
			  (V4HI "h") (V8HI  "h")
			  (V2SI "s") (V4SI  "s")
-			  (V2DI "d")
+			  (V2DI "d") (V2HF "h")
			  (V4HF "h") (V8HF  "h")
			  (V2SF "s") (V4SF  "s")
			  (V2DF "d")
@@ -1285,7 +1300,7 @@ (define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
 ;; more accurately.
 (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
-			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
+			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF "s")
			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
			 (SI "s") (DI "d")])
 
@@ -1360,8 +1375,8 @@ (define_mode_attr VEL [(V8QI "QI") (V16QI "QI")
		       (V4HF "HF") (V8HF "HF")
		       (V2SF "SF") (V4SF "SF")
		       (DF "DF") (V2DF "DF")
-		       (SI "SI") (HI "HI")
-		       (QI "QI")
+		       (SI "SI") (V2HF "HF")
+		       (QI "QI") (HI "HI")
		       (V4BF "BF") (V8BF "BF")
		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
@@ -1381,7 +1396,7 @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
		       (V2SF "sf") (V4SF "sf")
		       (V2DF "df") (DF "df")
		       (SI "si") (HI "hi")
-		       (QI "qi")
+		       (QI "qi") (V2HF "hf")
		       (V4BF "bf") (V8BF "bf")
		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
@@ -1866,7 +1881,7 @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
		     (V4HF "") (V8HF "_q")
		     (V4BF "") (V8BF "_q")
		     (V2SF "") (V4SF  "_q")
-		     (V2DF "_q")
+		     (V2HF "") (V2DF "_q")
		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
		     (V2x8QI "") (V2x16QI "_q")
		     (V2x4HI "") (V2x8HI "_q")
@@ -1905,6 +1920,7 @@ (define_mode_attr vp [(V8QI "v") (V16QI "v")
		      (V2SI "p") (V4SI  "v")
		      (V2DI "p") (V2DF  "p")
		      (V2SF "p") (V4SF  "v")
+		      (V2HF "p")
		      (V4HF "v") (V8HF  "v")])
 
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e51d0a147c5722247 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -483,6 +483,7 @@ (define_attr "autodetect_type"
 ; neon_fp_minmax_s_q
 ; neon_fp_minmax_d
 ; neon_fp_minmax_d_q
+; neon_fp_reduc_add_h
 ; neon_fp_reduc_add_s
 ; neon_fp_reduc_add_s_q
 ; neon_fp_reduc_add_d
@@ -1033,6 +1034,7 @@ (define_attr "type"
   neon_fp_minmax_d,\
   neon_fp_minmax_d_q,\
 \
+  neon_fp_reduc_add_h,\
   neon_fp_reduc_add_s,\
   neon_fp_reduc_add_s_q,\
   neon_fp_reduc_add_d,\
@@ -1257,8 +1259,8 @@ (define_attr "is_neon_type" "yes,no"
          neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
          neon_fp_minmax_s_q, neon_fp_minmax_d, neon_fp_minmax_d_q,\
          neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d, neon_fp_neg_d_q,\
-         neon_fp_reduc_add_s, neon_fp_reduc_add_s_q, neon_fp_reduc_add_d,\
-         neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
+         neon_fp_reduc_add_h, neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+         neon_fp_reduc_add_d, neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,\
          neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
          neon_fp_reduc_minmax_d_q,\
          neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
index 07d71a63414b1066ea431e287286ad048515711a..8e35e0b574d49913b43c7d8d4f4ba75f127f42e9 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
@@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)	\
 TEST_ALL (VEC_PERM)
 
 /* We should use one DUP for each of the 8-, 16- and 32-bit types,
-   although we currently use LD1RW for _Float16.  We should use two
-   DUPs for each of the three 64-bit types.  */
+   We should use two DUPs for each of the three 64-bit types.  */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
 /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */

--