[PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA.
@ 2020-09-25 14:30 Tamar Christina
  2020-11-14 15:12 ` Tamar Christina
  0 siblings, 1 reply; 2+ messages in thread
From: Tamar Christina @ 2020-09-25 14:30 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 1298 bytes --]

Hi All,

This adds implementations of the optabs for complex operations.  With this
patch, the following C code:

  void f90 (float complex a[restrict N], float complex b[restrict N],
	    float complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] = a[i] + (b[i] * I);
  }

generates

  f90:
	  mov     x3, 0
	  mov     x4, 400
	  ptrue   p1.b, all
	  whilelo p0.s, xzr, x4
	  .p2align 3,,7
  .L2:
	  ld1w    z0.s, p0/z, [x0, x3, lsl 2]
	  ld1w    z1.s, p0/z, [x1, x3, lsl 2]
	  fcadd   z0.s, p1/m, z0.s, z1.s, #90
	  st1w    z0.s, p0, [x2, x3, lsl 2]
	  incw    x3
	  whilelo p0.s, x3, x4
	  b.any   .L2
	  ret

instead of

  f90:
	  mov     x3, 0
	  mov     x4, 0
	  mov     w5, 200
	  whilelo p0.s, wzr, w5
	  .p2align 3,,7
  .L2:
	  ld2w    {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]
	  ld2w    {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]
	  fsub    z0.s, z4.s, z3.s
	  fadd    z1.s, z2.s, z5.s
	  st2w    {z0.s - z1.s}, p0, [x2, x3, lsl 2]
	  incw    x4
	  inch    x3
	  whilelo p0.s, w4, w5
	  b.any   .L2
	  ret

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve.md (cadd<rot><mode>3,
	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
	* config/aarch64/iterators.md (sve_rot1, sve_rot2): New.

-- 

[-- Attachment #2: rb13515.patch --]
[-- Type: text/x-diff, Size: 4647 bytes --]

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index cd79aba90ec9cdb5da9e9758495015ef36b2d869..12bc8077994f5a130ff4af6e9bfa7ca1237d0868 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5109,6 +5109,20 @@ (define_expand "@cond_<optab><mode>"
   "TARGET_SVE"
 )
 
+;; Unpredicated FCADD optab for the auto-vectorizer; uses a ptrue predicate.
+(define_expand "@cadd<rot><mode>3"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+	(unspec:SVE_FULL_F
+	  [(match_dup 3)
+	   (const_int SVE_RELAXED_GP)
+	   (match_operand:SVE_FULL_F 1 "register_operand")
+	   (match_operand:SVE_FULL_F 2 "register_operand")]
+	  SVE_COND_FCADD))]
+  "TARGET_SVE"
+{
+  operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+})
+
 ;; Predicated FCADD, merging with the first input.
 (define_insn_and_rewrite "*cond_<optab><mode>_2"
   [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w")
@@ -6554,6 +6568,62 @@ (define_insn "@aarch64_pred_<optab><mode>"
   [(set_attr "movprfx" "*,yes")]
 )
 
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+	(unspec:SVE_FULL_F
+	  [(match_dup 4)
+	   (match_dup 5)
+	   (match_operand:SVE_FULL_F 1 "register_operand")
+	   (match_operand:SVE_FULL_F 2 "register_operand")
+	   (match_operand:SVE_FULL_F 3 "register_operand")]
+	  FCMLA_OP))]
+  "TARGET_SVE"
+{
+  operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  operands[5] = gen_int_mode (SVE_RELAXED_GP, SImode);
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot1><mode> (operands[0], operands[4],
+					    operands[1], operands[2],
+					    operands[3], operands[5]));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[4],
+					    operands[0], operands[2],
+					    operands[3], operands[5]));
+  DONE;
+})
+
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+	(unspec:SVE_FULL_F
+	  [(match_dup 3)
+	   (match_dup 4)
+	   (match_operand:SVE_FULL_F 1 "register_operand")
+	   (match_operand:SVE_FULL_F 2 "register_operand")
+	   (match_dup 5)]
+	  FCMUL_OP))]
+  "TARGET_SVE"
+{
+  operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+  operands[4] = gen_int_mode (SVE_RELAXED_GP, SImode);
+  operands[5] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot1><mode> (operands[0], operands[3], operands[1],
+					    operands[2], operands[5], operands[4]));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[3], operands[1],
+					    operands[2], operands[0],
+					    operands[4]));
+  DONE;
+})
+
 ;; Predicated FCMLA with merging.
 (define_expand "@cond_<optab><mode>"
   [(set (match_operand:SVE_FULL_F 0 "register_operand")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 98217c9fd3ee2b6063f7564193e400e9ef71c6ac..7662b929e2c4f6c103cc06e051eb574247320809 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3443,6 +3443,35 @@ (define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
 			    (UNSPEC_FCMLS "180")
 			    (UNSPEC_FCMLS180 "180")])
 
+;; SVE has slightly different naming from NEON, so we have to split these
+;; iterators.
+(define_int_attr sve_rot1 [(UNSPEC_FCMLA "")
+			   (UNSPEC_FCMLA180 "")
+			   (UNSPEC_FCMUL "")
+			   (UNSPEC_FCMUL180 "")
+			   (UNSPEC_FCMLS "270")
+			   (UNSPEC_FCMLS180 "90")
+			   (UNSPEC_CMLA "")
+			   (UNSPEC_CMLA180 "")
+			   (UNSPEC_CMUL "")
+			   (UNSPEC_CMUL180 "")
+			   (UNSPEC_CMLS "270")
+			   (UNSPEC_CMLS180 "90")])
+
+(define_int_attr sve_rot2 [(UNSPEC_FCMLA "90")
+			   (UNSPEC_FCMLA180 "270")
+			   (UNSPEC_FCMUL "90")
+			   (UNSPEC_FCMUL180 "270")
+			   (UNSPEC_FCMLS "180")
+			   (UNSPEC_FCMLS180 "180")
+			   (UNSPEC_CMLA "90")
+			   (UNSPEC_CMLA180 "270")
+			   (UNSPEC_CMUL "90")
+			   (UNSPEC_CMUL180 "270")
+			   (UNSPEC_CMLS "180")
+			   (UNSPEC_CMLS180 "180")])
+
+
 (define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA180 "a")
 			 (UNSPEC_FCMLS "s") (UNSPEC_FCMLS180 "s")
 			 (UNSPEC_CMLA "a") (UNSPEC_CMLA180 "a")


^ permalink raw reply	[flat|nested] 2+ messages in thread

* RE: [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA.
  2020-09-25 14:30 [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA Tamar Christina
@ 2020-11-14 15:12 ` Tamar Christina
  0 siblings, 0 replies; 2+ messages in thread
From: Tamar Christina @ 2020-11-14 15:12 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches
  Cc: Richard Earnshaw, nd, Marcus Shawcroft, Kyrylo Tkachov,
	Richard Sandiford

ping

> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:30 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex
> Addition, Multiply and FMA.
> 
> Hi All,
> 
> This adds implementation for the optabs for complex operations.  With this
> the following C code:
> 
>   void f90 (float complex a[restrict N], float complex b[restrict N],
> 	    float complex c[restrict N])
>   {
>     for (int i=0; i < N; i++)
>       c[i] = a[i] + (b[i] * I);
>   }
> 
> generates
> 
>   f90:
> 	  mov     x3, 0
> 	  mov     x4, 400
> 	  ptrue   p1.b, all
> 	  whilelo p0.s, xzr, x4
> 	  .p2align 3,,7
>   .L2:
> 	  ld1w    z0.s, p0/z, [x0, x3, lsl 2]
> 	  ld1w    z1.s, p0/z, [x1, x3, lsl 2]
> 	  fcadd   z0.s, p1/m, z0.s, z1.s, #90
> 	  st1w    z0.s, p0, [x2, x3, lsl 2]
> 	  incw    x3
> 	  whilelo p0.s, x3, x4
> 	  b.any   .L2
> 	  ret
> 
> instead of
> 
>   f90:
> 	  mov     x3, 0
> 	  mov     x4, 0
> 	  mov     w5, 200
> 	  whilelo p0.s, wzr, w5
> 	  .p2align 3,,7
>   .L2:
> 	  ld2w    {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]
> 	  ld2w    {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]
> 	  fsub    z0.s, z4.s, z3.s
> 	  fadd    z1.s, z2.s, z5.s
> 	  st2w    {z0.s - z1.s}, p0, [x2, x3, lsl 2]
> 	  incw    x4
> 	  inch    x3
> 	  whilelo p0.s, w4, w5
> 	  b.any   .L2
> 	  ret
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-sve.md (cadd<rot><mode>3,
> 	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
> 	* config/aarch64/iterators.md (sve_rot1, sve_rot2): New.
> 
> --

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2020-11-14 15:12 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-25 14:30 [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA Tamar Christina
2020-11-14 15:12 ` Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).