* [gcc r13-4616] aarch64: Make existing V2HF be usable.
@ 2022-12-12 15:15 Tamar Christina
From: Tamar Christina @ 2022-12-12 15:15 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:2cba118e538ba0b7582af7f9fb5ba2dfbb772f8e

commit r13-4616-g2cba118e538ba0b7582af7f9fb5ba2dfbb772f8e
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Mon Dec 12 15:15:07 2022 +0000

    aarch64: Make existing V2HF be usable.
    
    The backend has an existing V2HFmode that is used by pairwise operations.
    This mode, however, was never made fully functional.  Amongst other things,
    it was never declared as a vector type, which made it unusable from the mid-end.
    
    It also lacks an implementation for loads/stores, so reload ICEs if this mode
    is ever used.  This patch finishes the implementation by providing both.
    
    Note that I have created a new iterator VHSDF_P instead of extending VHSDF,
    because the existing iterator is used in far more places than just loads/stores.
    
    It is also used, for instance, in intrinsics, and extending it would force me
    to provide support for mangling the type even though we never expose it through
    intrinsics.
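    
    As a rough illustration (a sketch of my own, not code taken from this commit
    or its testsuite), the kind of scalar code that a usable V2HF mode lets the
    mid-end represent as a 2-element half-float vector, on a target with the
    +fp16 extension (TARGET_SIMD_F16INST), looks like:
    
        /* Hypothetical example: the two adjacent _Float16 loads can now be
           treated as a single 32-bit V2HF value, so a pairwise reduction
           (e.g. faddp) becomes possible instead of scalarising the add.  */
        _Float16
        sum_pair (const _Float16 *x)
        {
          return x[0] + x[1];
        }
    
    Whether the vectoriser actually forms the pair depends on the rest of the
    series and on costing; the sketch is only meant to show where a usable
    V2HF mode matters.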
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
            (mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
            aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
            @aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
            reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
            aarch64_reduc_<optab>_internal<mode>, aarch64_get_lane<mode>,
            vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
            (aarch64_simd_dupv2hf): New.
            * config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
            Add E_V2HFmode.
            * config/aarch64/iterators.md (VHSDF_P): New.
            (V2F, VMOVE, nunits, Vtype, Vmtype, Vetype, stype, VEL,
            Vel, q, vp): Add V2HF.
            * config/arm/types.md (neon_fp_reduc_add_h): New.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/slp_1.c: Update testcase.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md           | 88 +++++++++++++++++++---------
 gcc/config/aarch64/aarch64.cc                |  1 +
 gcc/config/aarch64/iterators.md              | 30 +++++++---
 gcc/config/arm/types.md                      |  6 +-
 gcc/testsuite/gcc.target/aarch64/sve/slp_1.c | 10 ++--
 5 files changed, 90 insertions(+), 45 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 104088f67d2..c0e6164b3bd 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,10 +19,10 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
+	(match_operand:VMOVE 1 "general_operand"))]
   "TARGET_FLOAT"
-  "
+{
   /* Force the operand into a register if it is not an
      immediate whose use can be replaced with xzr.
      If the mode is 16 bytes wide, then we will be doing
@@ -46,12 +46,11 @@
       aarch64_expand_vector_init (operands[0], operands[1]);
       DONE;
     }
-  "
-)
+})
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-        (match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
+        (match_operand:VMOVE 1 "general_operand"))]
   "TARGET_FLOAT && !STRICT_ALIGNMENT"
 {
   /* This pattern is not permitted to fail during expansion: if both arguments
@@ -73,6 +72,16 @@
   [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
+(define_insn "aarch64_simd_dupv2hf"
+  [(set (match_operand:V2HF 0 "register_operand" "=w")
+	(vec_duplicate:V2HF
+	  (match_operand:HF 1 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "@
+   sli\\t%d0, %d0, 16"
+  [(set_attr "type" "neon_shift_imm")]
+)
+
 (define_insn "aarch64_simd_dup<mode>"
   [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
@@ -85,10 +94,10 @@
 )
 
 (define_insn "aarch64_dup_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_duplicate:VALL_F16
+  [(set (match_operand:VMOVE 0 "register_operand" "=w")
+	(vec_duplicate:VMOVE
 	  (vec_select:<VEL>
-	    (match_operand:VALL_F16 1 "register_operand" "w")
+	    (match_operand:VMOVE 1 "register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
           )))]
   "TARGET_SIMD"
@@ -150,6 +159,29 @@
    (set_attr "arch" "*,*,*,*,*,*,*,simd,*")]
 )
 
+(define_insn "*aarch64_simd_movv2hf"
+  [(set (match_operand:V2HF 0 "nonimmediate_operand"
+		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
+	(match_operand:V2HF 1 "general_operand"
+		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
+  "TARGET_SIMD_F16INST
+   && (register_operand (operands[0], V2HFmode)
+       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
+   "@
+    ldr\\t%s0, %1
+    str\\twzr, %0
+    str\\t%s1, %0
+    mov\\t%0.2s[0], %1.2s[0]
+    umov\\t%w0, %1.s[0]
+    fmov\\t%s0, %w1
+    mov\\t%w0, %w1
+    movi\\t%d0, 0
+    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
+  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
+		     neon_logic, neon_to_gp, f_mcr,\
+		     mov_reg, neon_move, neon_move")]
+)
+
 (define_insn "*aarch64_simd_mov<VQMOV:mode>"
   [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w,  w")
@@ -193,7 +225,7 @@
 
 (define_insn "aarch64_store_lane0<mode>"
   [(set (match_operand:<VEL> 0 "memory_operand" "=m")
-	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
+	(vec_select:<VEL> (match_operand:VMOVE 1 "register_operand" "w")
 			(parallel [(match_operand 2 "const_int_operand" "n")])))]
   "TARGET_SIMD
    && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
@@ -1058,11 +1090,11 @@
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VMOVE 0 "register_operand" "=w,w,w")
+	(vec_merge:VMOVE
+	    (vec_duplicate:VMOVE
 		(match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
-	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
+	    (match_operand:VMOVE 3 "register_operand" "0,0,0")
 	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
   {
@@ -1084,14 +1116,14 @@
 )
 
 (define_insn "@aarch64_simd_vec_copy_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VMOVE 0 "register_operand" "=w")
+	(vec_merge:VMOVE
+	    (vec_duplicate:VMOVE
 	      (vec_select:<VEL>
-		(match_operand:VALL_F16 3 "register_operand" "w")
+		(match_operand:VMOVE 3 "register_operand" "w")
 		(parallel
 		  [(match_operand:SI 4 "immediate_operand" "i")])))
-	    (match_operand:VALL_F16 1 "register_operand" "0")
+	    (match_operand:VMOVE 1 "register_operand" "0")
 	    (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD"
   {
@@ -1399,7 +1431,7 @@
 )
 
 (define_expand "vec_set<mode>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VMOVE 0 "register_operand")
    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
@@ -3518,7 +3550,7 @@
 ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP smax/smin).
 (define_expand "reduc_<optab>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
-   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
+   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
 		 FMAXMINV)]
   "TARGET_SIMD"
   {
@@ -3533,7 +3565,7 @@
 
 (define_expand "reduc_<fmaxmin>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
-   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
+   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
 		 FMAXMINNMV)]
   "TARGET_SIMD"
   {
@@ -3577,8 +3609,8 @@
 )
 
 (define_insn "aarch64_reduc_<optab>_internal<mode>"
- [(set (match_operand:VHSDF 0 "register_operand" "=w")
-       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+ [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
+       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand" "w")]
 		      FMAXMINV))]
  "TARGET_SIMD"
  "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
@@ -4223,7 +4255,7 @@
 (define_insn_and_split "aarch64_get_lane<mode>"
   [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
 	(vec_select:<VEL>
-	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
+	  (match_operand:VMOVE 1 "register_operand" "w, w, w")
 	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
   "TARGET_SIMD"
   {
@@ -8028,7 +8060,7 @@
 ;; Standard pattern name vec_init<mode><Vel>.
 
 (define_expand "vec_init<mode><Vel>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VMOVE 0 "register_operand")
    (match_operand 1 "" "")]
   "TARGET_SIMD"
 {
@@ -8107,7 +8139,7 @@
 
 (define_expand "vec_extract<mode><Vel>"
   [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
-   (match_operand:VALL_F16 1 "register_operand")
+   (match_operand:VMOVE 1 "register_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
 {
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 45d659a4a91..fd92212f96a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3634,6 +3634,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
+    case E_V2HFmode:
       return TARGET_FLOAT ? VEC_ADVSIMD : 0;
 
     default:
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7c7fcbbc24b..d10cf93572e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -160,6 +160,10 @@
 (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
 			     (V8HF "TARGET_SIMD_F16INST")
 			     V2SF V4SF V2DF])
+;; Advanced SIMD Float modes suitable for reduction or pairwise operations
+(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
+			       (V8HF "TARGET_SIMD_F16INST")
+			       V2SF V4SF V2DF (V2HF "TARGET_SIMD_F16INST")])
 
 ;; Advanced SIMD Float modes, and DF.
 (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
@@ -188,15 +192,22 @@
 (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
 
 ;; Advanced SIMD Float modes with 2 elements.
-(define_mode_iterator V2F [V2SF V2DF])
+(define_mode_iterator V2F [V2SF V2DF V2HF])
 
 ;; All Advanced SIMD modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
-;; All Advanced SIMD modes suitable for moving, loading, and storing.
+;; The set of all modes for which vld1 intrinsics are provided.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing
+;; including V2HF
+(define_mode_iterator VMOVE [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+			     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
+			     (V2HF "TARGET_SIMD_F16INST")])
+
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -1079,7 +1090,7 @@
 			  (V2SF "2") (V4SF "4")
 			  (V1DF "1") (V2DF "2")
 			  (DI "1") (DF "1")
-			  (V8DI "8")])
+			  (V8DI "8") (V2HF "2")])
 
 ;; Map a mode to the number of bits in it, if the size of the mode
 ;; is constant.
@@ -1196,7 +1207,7 @@
 (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
 			  (V4HI "h") (V8HI  "h")
 			  (V2SI "s") (V4SI  "s")
-			  (V2DI "d")
+			  (V2DI "d") (V2HF  "h")
 			  (V4HF "h") (V8HF  "h")
 			  (V2SF "s") (V4SF  "s")
 			  (V2DF "d")
@@ -1288,7 +1299,7 @@
 ;; more accurately.
 (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
 			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
-			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
+			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF "s")
 			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
 			 (SI "s") (DI "d")])
 
@@ -1363,8 +1374,8 @@
 		       (V4HF "HF") (V8HF  "HF")
 		       (V2SF "SF") (V4SF  "SF")
 		       (DF   "DF") (V2DF  "DF")
-		       (SI   "SI") (HI    "HI")
-		       (QI   "QI")
+		       (SI   "SI") (V2HF  "HF")
+		       (QI   "QI") (HI    "HI")
 		       (V4BF "BF") (V8BF "BF")
 		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
 		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
@@ -1384,7 +1395,7 @@
 		       (V2SF "sf") (V4SF "sf")
 		       (V2DF "df") (DF   "df")
 		       (SI   "si") (HI   "hi")
-		       (QI   "qi")
+		       (QI   "qi") (V2HF "hf")
 		       (V4BF "bf") (V8BF "bf")
 		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
 		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
@@ -1869,7 +1880,7 @@
 		     (V4HF "") (V8HF "_q")
 		     (V4BF "") (V8BF "_q")
 		     (V2SF "") (V4SF  "_q")
-			       (V2DF  "_q")
+		     (V2HF "") (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
 		     (V2x8QI "") (V2x16QI "_q")
 		     (V2x4HI "") (V2x8HI "_q")
@@ -1908,6 +1919,7 @@
 		      (V2SI "p") (V4SI  "v")
 		      (V2DI "p") (V2DF  "p")
 		      (V2SF "p") (V4SF  "v")
+		      (V2HF "p")
 		      (V4HF "v") (V8HF  "v")])
 
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index d0d9997efd2..880353dbeed 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -484,6 +484,7 @@
 ; neon_fp_minmax_s_q
 ; neon_fp_minmax_d
 ; neon_fp_minmax_d_q
+; neon_fp_reduc_add_h
 ; neon_fp_reduc_add_s
 ; neon_fp_reduc_add_s_q
 ; neon_fp_reduc_add_d
@@ -1034,6 +1035,7 @@
   neon_fp_minmax_d,\
   neon_fp_minmax_d_q,\
 \
+  neon_fp_reduc_add_h,\
   neon_fp_reduc_add_s,\
   neon_fp_reduc_add_s_q,\
   neon_fp_reduc_add_d,\
@@ -1258,8 +1260,8 @@
           neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
           neon_fp_minmax_s_q, neon_fp_minmax_d, neon_fp_minmax_d_q,\
           neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d, neon_fp_neg_d_q,\
-          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q, neon_fp_reduc_add_d,\
-          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
+          neon_fp_reduc_add_h, neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,\
           neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
           neon_fp_reduc_minmax_d_q,\
           neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
index 07d71a63414..e6021c5a427 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
@@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)	\
 TEST_ALL (VEC_PERM)
 
 /* We should use one DUP for each of the 8-, 16- and 32-bit types,
-   although we currently use LD1RW for _Float16.  We should use two
-   DUPs for each of the three 64-bit types.  */
+   We should use two DUPs for each of the three 64-bit types.  */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
 /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */
@@ -53,7 +51,7 @@ TEST_ALL (VEC_PERM)
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
 /* { dg-final { scan-assembler-not {\tldr} } } */
-/* { dg-final { scan-assembler-times {\tstr} 2 } } */
-/* { dg-final { scan-assembler-times {\tstr\th[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-not {\tstr} } } */
+/* { dg-final { scan-assembler-not {\tstr\th[0-9]+} } } */
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
