public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r11-6884] Arm: Add NEON and MVE complex mul, mla and mls patterns.
@ 2021-01-25  8:57 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2021-01-25  8:57 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:389b67feac78c8f21c6946bf8e36a16060f45728

commit r11-6884-g389b67feac78c8f21c6946bf8e36a16060f45728
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Mon Jan 25 08:56:37 2021 +0000

    Arm: Add NEON and MVE complex mul, mla and mls patterns.
    
    This adds implementations of the optabs for complex operations.  With this,
    the following C code:
    
      void g (float complex a[restrict N], float complex b[restrict N],
              float complex c[restrict N])
      {
        for (int i=0; i < N; i++)
          c[i] =  a[i] * b[i];
      }
    
    generates
    
    NEON:
    
    g:
            vmov.f32        q11, #0.0  @ v4sf
            add     r3, r2, #1600
    .L2:
            vmov    q8, q11  @ v4sf
            vld1.32 {q10}, [r1]!
            vld1.32 {q9}, [r0]!
            vcmla.f32       q8, q9, q10, #0
            vcmla.f32       q8, q9, q10, #90
            vst1.32 {q8}, [r2]!
            cmp     r3, r2
            bne     .L2
            bx      lr
    
    MVE:
    
    g:
            push    {lr}
            mov     lr, #100
            dls     lr, lr
    .L2:
            vldrw.32        q1, [r1], #16
            vldrw.32        q2, [r0], #16
            vcmul.f32       q3, q2, q1, #0
            vcmla.f32       q3, q2, q1, #90
            vstrw.32        q3, [r2], #16
            le      lr, .L2
            ldr     pc, [sp], #4
    
    instead of
    
    g:
            add     r3, r2, #1600
    .L2:
            vld2.32 {d20-d23}, [r0]!
            vld2.32 {d16-d19}, [r1]!
            vmul.f32        q14, q11, q9
            vmul.f32        q15, q11, q8
            vneg.f32        q14, q14
            vfma.f32        q15, q10, q9
            vfma.f32        q14, q10, q8
            vmov    q13, q15  @ v4sf
            vmov    q12, q14  @ v4sf
            vst2.32 {d24-d27}, [r2]!
            cmp     r3, r2
            bne     .L2
            bx      lr
    
    and
    
    g:
            add     r3, r2, #1600
    .L2:
            vld2.32 {d20-d23}, [r0]!
            vld2.32 {d16-d19}, [r1]!
            vmul.f32        q15, q10, q8
            vmul.f32        q14, q10, q9
            vmls.f32        q15, q11, q9
            vmla.f32        q14, q11, q8
            vmov    q12, q15  @ v4sf
            vmov    q13, q14  @ v4sf
            vst2.32 {d24-d27}, [r2]!
            cmp     r3, r2
            bne     .L2
            bx      lr
    
    respectively.
    
    gcc/ChangeLog:
    
            * config/arm/iterators.md (rotsplit1, rotsplit2, conj_op, fcmac1,
            VCMLA_OP, VCMUL_OP): New.
            * config/arm/mve.md (mve_vcmlaq<mve_rot><mode>): Support vec_dup 0.
            * config/arm/neon.md (cmul<conj_op><mode>3): New.
            * config/arm/unspecs.md (UNSPEC_VCMLA_CONJ, UNSPEC_VCMLA180_CONJ,
            UNSPEC_VCMUL_CONJ): New.
            * config/arm/vec-common.md (cmul<conj_op><mode>3, arm_vcmla<rot><mode>,
            cml<fcmac1><conj_op><mode>4): New.

Diff:
---
 gcc/config/arm/iterators.md  | 40 +++++++++++++++++++++++++++++++
 gcc/config/arm/mve.md        | 13 +++++-----
 gcc/config/arm/neon.md       | 19 +++++++++++++++
 gcc/config/arm/unspecs.md    |  3 +++
 gcc/config/arm/vec-common.md | 57 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 126 insertions(+), 6 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 2e0aacbd3f7..b9027905307 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1186,6 +1186,33 @@
 		      (UNSPEC_VCMLA180 "180")
 		      (UNSPEC_VCMLA270 "270")])
 
+;; The complex operations when performed on a real complex number require two
+;; instructions to perform the operation, e.g. complex multiplication requires
+;; two VCMLA (or a VCMUL followed by a VCMLA), each with its own rotation value.
+;;
+;; These values can be looked up in rotsplit1 and rotsplit2.  As an example,
+;; VCMUL needs the first instruction to use #0 and the second #90.
+(define_int_attr rotsplit1 [(UNSPEC_VCMLA "0")
+			    (UNSPEC_VCMLA_CONJ "0")
+			    (UNSPEC_VCMUL "0")
+			    (UNSPEC_VCMUL_CONJ "0")
+			    (UNSPEC_VCMLA180 "180")
+			    (UNSPEC_VCMLA180_CONJ "180")])
+
+(define_int_attr rotsplit2 [(UNSPEC_VCMLA "90")
+			    (UNSPEC_VCMLA_CONJ "270")
+			    (UNSPEC_VCMUL "90")
+			    (UNSPEC_VCMUL_CONJ "270")
+			    (UNSPEC_VCMLA180 "270")
+			    (UNSPEC_VCMLA180_CONJ "90")])
+
+(define_int_attr conj_op [(UNSPEC_VCMLA180 "")
+			  (UNSPEC_VCMLA180_CONJ "_conj")
+			  (UNSPEC_VCMLA "")
+			  (UNSPEC_VCMLA_CONJ "_conj")
+			  (UNSPEC_VCMUL "")
+			  (UNSPEC_VCMUL_CONJ "_conj")])
+
 (define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
 			  (UNSPEC_VCADD270 "_rot270")
 			  (UNSPEC_VCMLA "")
@@ -1200,6 +1227,9 @@
 (define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90
 			    UNSPEC_VCMUL180 UNSPEC_VCMUL270])
 
+(define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA_CONJ "a")
+			 (UNSPEC_VCMLA180 "s") (UNSPEC_VCMLA180_CONJ "s")])
+
 (define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8")
 			    (UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8")
 			    (UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8 "uhsub8")
@@ -1723,3 +1753,13 @@
 (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
 (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
 (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
+
+;; Define iterators for VCMLA operations
+(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
+			       UNSPEC_VCMLA_CONJ
+			       UNSPEC_VCMLA180
+			       UNSPEC_VCMLA180_CONJ])
+
+;; Define iterators for VCMLA operations as MUL
+(define_int_iterator VCMUL_OP [UNSPEC_VCMUL
+			       UNSPEC_VCMUL_CONJ])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 62ff12365ab..465f71c4eee 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -4101,15 +4101,16 @@
 (define_insn "mve_vcmlaq<mve_rot><mode>"
   [
    (set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz")
-		       (match_operand:MVE_0 2 "s_register_operand" "w,w")
-		       (match_operand:MVE_0 3 "s_register_operand" "w,w")]
-	 VCMLA))
+	(plus:MVE_0 (match_operand:MVE_0 1 "reg_or_zero_operand" "Dz,0")
+		    (unspec:MVE_0
+		        [(match_operand:MVE_0 2 "s_register_operand" "w,w")
+		         (match_operand:MVE_0 3 "s_register_operand" "w,w")]
+		     VCMLA)))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
   "@
-   vcmla.f%#<V_sz_elem>	%q0, %q2, %q3, #<rot>
-   vcmul.f%#<V_sz_elem>	%q0, %q2, %q3, #<rot>"
+   vcmul.f%#<V_sz_elem>	%q0, %q2, %q3, #<rot>
+   vcmla.f%#<V_sz_elem>	%q0, %q2, %q3, #<rot>"
   [(set_attr "type" "mve_move")
 ])
 
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index e904db97ea7..fec2cc91d24 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2952,6 +2952,25 @@
   [(set_attr "type" "neon_fcmla")]
 )
 
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+  [(set (match_operand:VDF 0 "register_operand")
+	(unspec:VDF [(match_operand:VDF 1 "register_operand")
+		     (match_operand:VDF 2 "register_operand")]
+		    VCMUL_OP))]
+  "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
+{
+  rtx res1 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp,
+					      operands[2], operands[1]));
+  emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1,
+					      operands[2], operands[1]));
+  DONE;
+})
+
 
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "neon_<sup>dot<vsi2qi>"
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 97a803e8da5..c6ebb6fc2b6 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -510,10 +510,13 @@
   UNSPEC_VCMLA90
   UNSPEC_VCMLA180
   UNSPEC_VCMLA270
+  UNSPEC_VCMLA_CONJ
+  UNSPEC_VCMLA180_CONJ
   UNSPEC_VCMUL
   UNSPEC_VCMUL90
   UNSPEC_VCMUL180
   UNSPEC_VCMUL270
+  UNSPEC_VCMUL_CONJ
   UNSPEC_MATMUL_S
   UNSPEC_MATMUL_U
   UNSPEC_MATMUL_US
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index ff448da126b..692b28ea8cc 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -215,6 +215,63 @@
 		      && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
 )
 
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+  [(set (match_operand:VQ_HSF 0 "register_operand")
+        (unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
+			(match_operand:VQ_HSF 2 "register_operand")]
+		       VCMUL_OP))]
+  "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT))
+   && !BYTES_BIG_ENDIAN"
+{
+  rtx res1 = gen_reg_rtx (<MODE>mode);
+  if (TARGET_COMPLEX)
+    {
+      rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+      emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, tmp,
+						 operands[2], operands[1]));
+    }
+  else
+    emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, CONST0_RTX (<MODE>mode),
+					       operands[2], operands[1]));
+
+  emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], res1,
+					     operands[2], operands[1]));
+  DONE;
+})
+
+(define_expand "arm_vcmla<rot><mode>"
+  [(set (match_operand:VF 0 "register_operand")
+	(plus:VF (match_operand:VF 1 "register_operand")
+		 (unspec:VF [(match_operand:VF 2 "register_operand")
+			     (match_operand:VF 3 "register_operand")]
+			     VCMLA)))]
+  "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+		      && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
+)
+
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cml<fcmac1><conj_op><mode>4"
+  [(set (match_operand:VF 0 "register_operand")
+	(plus:VF (match_operand:VF 1 "register_operand")
+		 (unspec:VF [(match_operand:VF 2 "register_operand")
+			     (match_operand:VF 3 "register_operand")]
+			    VCMLA_OP)))]
+  "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+		      && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_arm_vcmla<rotsplit1><mode> (tmp, operands[1],
+					     operands[3], operands[2]));
+  emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], tmp,
+					     operands[3], operands[2]));
+  DONE;
+})
+
 (define_expand "movmisalign<mode>"
  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
 	(unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-01-25  8:57 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-25  8:57 [gcc r11-6884] Arm: Add NEON and MVE complex mul, mla and mls patterns Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).