* [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA.
@ 2020-09-25 14:30 Tamar Christina
2020-11-14 15:12 ` Tamar Christina
0 siblings, 1 reply; 2+ messages in thread
From: Tamar Christina @ 2020-09-25 14:30 UTC (permalink / raw)
To: gcc-patches
Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
richard.sandiford
[-- Attachment #1: Type: text/plain, Size: 1298 bytes --]
Hi All,
This adds implementations of the optabs for complex operations. With this, the
following C code:
void f90 (float complex a[restrict N], float complex b[restrict N],
float complex c[restrict N])
{
for (int i=0; i < N; i++)
c[i] = a[i] + (b[i] * I);
}
generates
f90:
mov x3, 0
mov x4, 400
ptrue p1.b, all
whilelo p0.s, xzr, x4
.p2align 3,,7
.L2:
ld1w z0.s, p0/z, [x0, x3, lsl 2]
ld1w z1.s, p0/z, [x1, x3, lsl 2]
fcadd z0.s, p1/m, z0.s, z1.s, #90
st1w z0.s, p0, [x2, x3, lsl 2]
incw x3
whilelo p0.s, x3, x4
b.any .L2
ret
instead of
f90:
mov x3, 0
mov x4, 0
mov w5, 200
whilelo p0.s, wzr, w5
.p2align 3,,7
.L2:
ld2w {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]
ld2w {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]
fsub z0.s, z4.s, z3.s
fadd z1.s, z2.s, z5.s
st2w {z0.s - z1.s}, p0, [x2, x3, lsl 2]
incw x4
inch x3
whilelo p0.s, w4, w5
b.any .L2
ret
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/aarch64/aarch64-sve.md (cadd<rot><mode>3,
cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
* config/aarch64/iterators.md (sve_rot1, sve_rot2): New.
--
[-- Attachment #2: rb13515.patch --]
[-- Type: text/x-diff, Size: 4647 bytes --]
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index cd79aba90ec9cdb5da9e9758495015ef36b2d869..12bc8077994f5a130ff4af6e9bfa7ca1237d0868 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5109,6 +5109,20 @@ (define_expand "@cond_<optab><mode>"
"TARGET_SVE"
)
+;; Predicated FCADD using ptrue for unpredicated optab for auto-vectorizer
+(define_expand "@cadd<rot><mode>3"
+ [(set (match_operand:SVE_FULL_F 0 "register_operand")
+ (unspec:SVE_FULL_F
+ [(match_dup 3)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_FULL_F 1 "register_operand")
+ (match_operand:SVE_FULL_F 2 "register_operand")]
+ SVE_COND_FCADD))]
+ "TARGET_SVE"
+{
+ operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+})
+
;; Predicated FCADD, merging with the first input.
(define_insn_and_rewrite "*cond_<optab><mode>_2"
[(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w")
@@ -6554,6 +6568,62 @@ (define_insn "@aarch64_pred_<optab><mode>"
[(set_attr "movprfx" "*,yes")]
)
+;; unpredicated optab pattern for auto-vectorizer
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+ [(set (match_operand:SVE_FULL_F 0 "register_operand")
+ (unspec:SVE_FULL_F
+ [(match_dup 4)
+ (match_dup 5)
+ (match_operand:SVE_FULL_F 1 "register_operand")
+ (match_operand:SVE_FULL_F 2 "register_operand")
+ (match_operand:SVE_FULL_F 3 "register_operand")]
+ FCMLA_OP))]
+ "TARGET_SVE"
+{
+ operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[5] = gen_int_mode (SVE_RELAXED_GP, SImode);
+ emit_insn (
+ gen_aarch64_pred_fcmla<sve_rot1><mode> (operands[0], operands[4],
+ operands[1], operands[2],
+ operands[3], operands[5]));
+ emit_insn (
+ gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[4],
+ operands[0], operands[2],
+ operands[3], operands[5]));
+ DONE;
+})
+
+;; unpredicated optab pattern for auto-vectorizer
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+ [(set (match_operand:SVE_FULL_F 0 "register_operand")
+ (unspec:SVE_FULL_F
+ [(match_dup 3)
+ (match_dup 4)
+ (match_operand:SVE_FULL_F 1 "register_operand")
+ (match_operand:SVE_FULL_F 2 "register_operand")
+ (match_dup 5)]
+ FCMUL_OP))]
+ "TARGET_SVE"
+{
+ operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[4] = gen_int_mode (SVE_RELAXED_GP, SImode);
+ operands[5] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+ emit_insn (
+ gen_aarch64_pred_fcmla<sve_rot1><mode> (operands[0], operands[3], operands[1],
+ operands[2], operands[5], operands[4]));
+ emit_insn (
+ gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[3], operands[1],
+ operands[2], operands[0],
+ operands[4]));
+ DONE;
+})
+
;; Predicated FCMLA with merging.
(define_expand "@cond_<optab><mode>"
[(set (match_operand:SVE_FULL_F 0 "register_operand")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 98217c9fd3ee2b6063f7564193e400e9ef71c6ac..7662b929e2c4f6c103cc06e051eb574247320809 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3443,6 +3443,35 @@ (define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
(UNSPEC_FCMLS "180")
(UNSPEC_FCMLS180 "180")])
+;; SVE has slightly different namings from NEON so we have to split these
+;; iterators.
+(define_int_attr sve_rot1 [(UNSPEC_FCMLA "")
+ (UNSPEC_FCMLA180 "")
+ (UNSPEC_FCMUL "")
+ (UNSPEC_FCMUL180 "")
+ (UNSPEC_FCMLS "270")
+ (UNSPEC_FCMLS180 "90")
+ (UNSPEC_CMLA "")
+ (UNSPEC_CMLA180 "")
+ (UNSPEC_CMUL "")
+ (UNSPEC_CMUL180 "")
+ (UNSPEC_CMLS "270")
+ (UNSPEC_CMLS180 "90")])
+
+(define_int_attr sve_rot2 [(UNSPEC_FCMLA "90")
+ (UNSPEC_FCMLA180 "270")
+ (UNSPEC_FCMUL "90")
+ (UNSPEC_FCMUL180 "270")
+ (UNSPEC_FCMLS "180")
+ (UNSPEC_FCMLS180 "180")
+ (UNSPEC_CMLA "90")
+ (UNSPEC_CMLA180 "270")
+ (UNSPEC_CMUL "90")
+ (UNSPEC_CMUL180 "270")
+ (UNSPEC_CMLS "180")
+ (UNSPEC_CMLS180 "180")])
+
+
(define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA180 "a")
(UNSPEC_FCMLS "s") (UNSPEC_FCMLS180 "s")
(UNSPEC_CMLA "a") (UNSPEC_CMLA180 "a")
^ permalink raw reply [flat|nested] 2+ messages in thread
* RE: [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA.
2020-09-25 14:30 [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA Tamar Christina
@ 2020-11-14 15:12 ` Tamar Christina
0 siblings, 0 replies; 2+ messages in thread
From: Tamar Christina @ 2020-11-14 15:12 UTC (permalink / raw)
To: Tamar Christina, gcc-patches
Cc: Richard Earnshaw, nd, Marcus Shawcroft, Kyrylo Tkachov,
Richard Sandiford
ping
> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:30 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex
> Addition, Multiply and FMA.
>
> Hi All,
>
> This adds implementation for the optabs for complex operations. With this
> the following C code:
>
> void f90 (float complex a[restrict N], float complex b[restrict N],
> float complex c[restrict N])
> {
> for (int i=0; i < N; i++)
> c[i] = a[i] + (b[i] * I);
> }
>
> generates
>
> f90:
> mov x3, 0
> mov x4, 400
> ptrue p1.b, all
> whilelo p0.s, xzr, x4
> .p2align 3,,7
> .L2:
> ld1w z0.s, p0/z, [x0, x3, lsl 2]
> ld1w z1.s, p0/z, [x1, x3, lsl 2]
> fcadd z0.s, p1/m, z0.s, z1.s, #90
> st1w z0.s, p0, [x2, x3, lsl 2]
> incw x3
> whilelo p0.s, x3, x4
> b.any .L2
> ret
>
> instead of
>
> f90:
> mov x3, 0
> mov x4, 0
> mov w5, 200
> whilelo p0.s, wzr, w5
> .p2align 3,,7
> .L2:
> ld2w {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]
> ld2w {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]
> fsub z0.s, z4.s, z3.s
> fadd z1.s, z2.s, z5.s
> st2w {z0.s - z1.s}, p0, [x2, x3, lsl 2]
> incw x4
> inch x3
> whilelo p0.s, w4, w5
> b.any .L2
> ret
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-sve.md (cadd<rot><mode>3,
> cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
> * config/aarch64/iterators.md (sve_rot1, sve_rot2): New.
>
> --
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2020-11-14 15:12 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-25 14:30 [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA Tamar Christina
2020-11-14 15:12 ` Tamar Christina
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).