* [PATCH]Arm: Add NEON and MVE complex mul, mla and mls patterns.
@ 2021-01-21 18:54 Tamar Christina
2021-01-22 9:40 ` Kyrylo Tkachov
0 siblings, 1 reply; 2+ messages in thread
From: Tamar Christina @ 2021-01-21 18:54 UTC (permalink / raw)
To: gcc-patches
Cc: nd, Ramana.Radhakrishnan, Richard.Earnshaw, nickc, Kyrylo.Tkachov
[-- Attachment #1: Type: text/plain, Size: 10704 bytes --]
Hi All,
This adds implementations of the optabs for complex operations. With this, the
following C code:
void g (float complex a[restrict N], float complex b[restrict N],
float complex c[restrict N])
{
for (int i=0; i < N; i++)
c[i] = a[i] * b[i];
}
generates
NEON:
g:
vmov.f32 q11, #0.0 @ v4sf
add r3, r2, #1600
.L2:
vmov q8, q11 @ v4sf
vld1.32 {q10}, [r1]!
vld1.32 {q9}, [r0]!
vcmla.f32 q8, q9, q10, #0
vcmla.f32 q8, q9, q10, #90
vst1.32 {q8}, [r2]!
cmp r3, r2
bne .L2
bx lr
MVE:
g:
push {lr}
mov lr, #100
dls lr, lr
.L2:
vldrw.32 q1, [r1], #16
vldrw.32 q2, [r0], #16
vcmul.f32 q3, q2, q1, #0
vcmla.f32 q3, q2, q1, #90
vstrw.32 q3, [r2], #16
le lr, .L2
ldr pc, [sp], #4
instead of
g:
add r3, r2, #1600
.L2:
vld2.32 {d20-d23}, [r0]!
vld2.32 {d16-d19}, [r1]!
vmul.f32 q14, q11, q9
vmul.f32 q15, q11, q8
vneg.f32 q14, q14
vfma.f32 q15, q10, q9
vfma.f32 q14, q10, q8
vmov q13, q15 @ v4sf
vmov q12, q14 @ v4sf
vst2.32 {d24-d27}, [r2]!
cmp r3, r2
bne .L2
bx lr
and
g:
add r3, r2, #1600
.L2:
vld2.32 {d20-d23}, [r0]!
vld2.32 {d16-d19}, [r1]!
vmul.f32 q15, q10, q8
vmul.f32 q14, q10, q9
vmls.f32 q15, q11, q9
vmla.f32 q14, q11, q8
vmov q12, q15 @ v4sf
vmov q13, q14 @ v4sf
vst2.32 {d24-d27}, [r2]!
cmp r3, r2
bne .L2
bx lr
respectively.
Bootstrapped and regtested on arm-none-linux-gnueabihf with no issues.
Execution tests verified with QEMU.
Generic tests for these are in the mid-end and I will enable them with a
different patch.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/arm/iterators.md (rotsplit1, rotsplit2, conj_op, fcmac1,
VCMLA_OP, VCMUL_OP): New.
* config/arm/mve.md (mve_vcmlaq<mve_rot><mode>): Support vec_dup 0.
* config/arm/neon.md (cmul<conj_op><mode>3): New.
* config/arm/unspecs.md (UNSPEC_VCMLA_CONJ, UNSPEC_VCMLA180_CONJ,
UNSPEC_VCMUL_CONJ): New.
* config/arm/vec-common.md (cmul<conj_op><mode>3, arm_vcmla<rot><mode>,
cml<fcmac1><conj_op><mode>4): New.
--- inline copy of patch --
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 2e0aacbd3f742538073e441b53fcffc45e37c790..b9027905307fe19d60d164cef23dac6ab119cd9b 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1186,6 +1186,33 @@ (define_int_attr rot [(UNSPEC_VCADD90 "90")
(UNSPEC_VCMLA180 "180")
(UNSPEC_VCMLA270 "270")])
+;; The complex operations when performed on a real complex number require two
+;; instructions to perform the operation. e.g. complex multiplication requires
+;; two VCMUL with a particular rotation value.
+;;
+;; These values can be looked up in rotsplit1 and rotsplit2. as an example
+;; VCMUL needs the first instruction to use #0 and the second #90.
+(define_int_attr rotsplit1 [(UNSPEC_VCMLA "0")
+ (UNSPEC_VCMLA_CONJ "0")
+ (UNSPEC_VCMUL "0")
+ (UNSPEC_VCMUL_CONJ "0")
+ (UNSPEC_VCMLA180 "180")
+ (UNSPEC_VCMLA180_CONJ "180")])
+
+(define_int_attr rotsplit2 [(UNSPEC_VCMLA "90")
+ (UNSPEC_VCMLA_CONJ "270")
+ (UNSPEC_VCMUL "90")
+ (UNSPEC_VCMUL_CONJ "270")
+ (UNSPEC_VCMLA180 "270")
+ (UNSPEC_VCMLA180_CONJ "90")])
+
+(define_int_attr conj_op [(UNSPEC_VCMLA180 "")
+ (UNSPEC_VCMLA180_CONJ "_conj")
+ (UNSPEC_VCMLA "")
+ (UNSPEC_VCMLA_CONJ "_conj")
+ (UNSPEC_VCMUL "")
+ (UNSPEC_VCMUL_CONJ "_conj")])
+
(define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
(UNSPEC_VCADD270 "_rot270")
(UNSPEC_VCMLA "")
@@ -1200,6 +1227,9 @@ (define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
(define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90
UNSPEC_VCMUL180 UNSPEC_VCMUL270])
+(define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA_CONJ "a")
+ (UNSPEC_VCMLA180 "s") (UNSPEC_VCMLA180_CONJ "s")])
+
(define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8")
(UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8")
(UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8 "uhsub8")
@@ -1723,3 +1753,13 @@ (define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S])
(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
+
+;; Define iterators for VCMLA operations
+(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
+ UNSPEC_VCMLA_CONJ
+ UNSPEC_VCMLA180
+ UNSPEC_VCMLA180_CONJ])
+
+;; Define iterators for VCMLA operations as MUL
+(define_int_iterator VCMUL_OP [UNSPEC_VCMUL
+ UNSPEC_VCMUL_CONJ])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 62ff12365ab3f92f177704927d230fefc415f1cb..465f71c4eee5f77e4d5904e8508c4134d1c9573f 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -4101,15 +4101,16 @@ (define_insn "mve_vaddlvaq_p_<supf>v4si"
(define_insn "mve_vcmlaq<mve_rot><mode>"
[
(set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
- (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz")
- (match_operand:MVE_0 2 "s_register_operand" "w,w")
- (match_operand:MVE_0 3 "s_register_operand" "w,w")]
- VCMLA))
+ (plus:MVE_0 (match_operand:MVE_0 1 "reg_or_zero_operand" "Dz,0")
+ (unspec:MVE_0
+ [(match_operand:MVE_0 2 "s_register_operand" "w,w")
+ (match_operand:MVE_0 3 "s_register_operand" "w,w")]
+ VCMLA)))
]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
"@
- vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
- vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
+ vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
+ vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
[(set_attr "type" "mve_move")
])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index e904db97ea7bd4cb0f32199038ace3d334ffb8f9..fec2cc91d24b6eff7b6fc8fdd54f39b3d646c468 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2952,6 +2952,25 @@ (define_insn "neon_vcmlaq_lane<rot><mode>"
[(set_attr "type" "neon_fcmla")]
)
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+ [(set (match_operand:VDF 0 "register_operand")
+ (unspec:VDF [(match_operand:VDF 1 "register_operand")
+ (match_operand:VDF 2 "register_operand")]
+ VCMUL_OP))]
+ "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
+{
+ rtx res1 = gen_reg_rtx (<MODE>mode);
+ rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+ emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp,
+ operands[2], operands[1]));
+ emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1,
+ operands[2], operands[1]));
+ DONE;
+})
+
;; These instructions map to the __builtins for the Dot Product operations.
(define_insn "neon_<sup>dot<vsi2qi>"
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 97a803e8da50c0119d15bcd4af47c298d3758c47..c6ebb6fc2b6a8d9e46f126dd857222a892c84093 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -510,10 +510,13 @@ (define_c_enum "unspec" [
UNSPEC_VCMLA90
UNSPEC_VCMLA180
UNSPEC_VCMLA270
+ UNSPEC_VCMLA_CONJ
+ UNSPEC_VCMLA180_CONJ
UNSPEC_VCMUL
UNSPEC_VCMUL90
UNSPEC_VCMUL180
UNSPEC_VCMUL270
+ UNSPEC_VCMUL_CONJ
UNSPEC_MATMUL_S
UNSPEC_MATMUL_U
UNSPEC_MATMUL_US
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index ff448da126b2250605d772ad423c70c16b753338..692b28ea8ccb18abac016a0c1b45ac7d0bf073d4 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -215,6 +215,63 @@ (define_expand "cadd<rot><mode>3"
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
)
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+ [(set (match_operand:VQ_HSF 0 "register_operand")
+ (unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
+ (match_operand:VQ_HSF 2 "register_operand")]
+ VCMUL_OP))]
+ "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT))
+ && !BYTES_BIG_ENDIAN"
+{
+ rtx res1 = gen_reg_rtx (<MODE>mode);
+ if (TARGET_COMPLEX)
+ {
+ rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+ emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, tmp,
+ operands[2], operands[1]));
+ }
+ else
+ emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, CONST0_RTX (<MODE>mode),
+ operands[2], operands[1]));
+
+ emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], res1,
+ operands[2], operands[1]));
+ DONE;
+})
+
+(define_expand "arm_vcmla<rot><mode>"
+ [(set (match_operand:VF 0 "register_operand")
+ (plus:VF (match_operand:VF 1 "register_operand")
+ (unspec:VF [(match_operand:VF 2 "register_operand")
+ (match_operand:VF 3 "register_operand")]
+ VCMLA)))]
+ "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+ && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
+)
+
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cml<fcmac1><conj_op><mode>4"
+ [(set (match_operand:VF 0 "register_operand")
+ (plus:VF (match_operand:VF 1 "register_operand")
+ (unspec:VF [(match_operand:VF 2 "register_operand")
+ (match_operand:VF 3 "register_operand")]
+ VCMLA_OP)))]
+ "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+ && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_arm_vcmla<rotsplit1><mode> (tmp, operands[1],
+ operands[3], operands[2]));
+ emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], tmp,
+ operands[3], operands[2]));
+ DONE;
+})
+
(define_expand "movmisalign<mode>"
[(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
(unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
--
[-- Attachment #2: rb14048.patch --]
[-- Type: text/x-diff, Size: 8155 bytes --]
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 2e0aacbd3f742538073e441b53fcffc45e37c790..b9027905307fe19d60d164cef23dac6ab119cd9b 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1186,6 +1186,33 @@ (define_int_attr rot [(UNSPEC_VCADD90 "90")
(UNSPEC_VCMLA180 "180")
(UNSPEC_VCMLA270 "270")])
+;; The complex operations when performed on a real complex number require two
+;; instructions to perform the operation. e.g. complex multiplication requires
+;; two VCMUL with a particular rotation value.
+;;
+;; These values can be looked up in rotsplit1 and rotsplit2. as an example
+;; VCMUL needs the first instruction to use #0 and the second #90.
+(define_int_attr rotsplit1 [(UNSPEC_VCMLA "0")
+ (UNSPEC_VCMLA_CONJ "0")
+ (UNSPEC_VCMUL "0")
+ (UNSPEC_VCMUL_CONJ "0")
+ (UNSPEC_VCMLA180 "180")
+ (UNSPEC_VCMLA180_CONJ "180")])
+
+(define_int_attr rotsplit2 [(UNSPEC_VCMLA "90")
+ (UNSPEC_VCMLA_CONJ "270")
+ (UNSPEC_VCMUL "90")
+ (UNSPEC_VCMUL_CONJ "270")
+ (UNSPEC_VCMLA180 "270")
+ (UNSPEC_VCMLA180_CONJ "90")])
+
+(define_int_attr conj_op [(UNSPEC_VCMLA180 "")
+ (UNSPEC_VCMLA180_CONJ "_conj")
+ (UNSPEC_VCMLA "")
+ (UNSPEC_VCMLA_CONJ "_conj")
+ (UNSPEC_VCMUL "")
+ (UNSPEC_VCMUL_CONJ "_conj")])
+
(define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
(UNSPEC_VCADD270 "_rot270")
(UNSPEC_VCMLA "")
@@ -1200,6 +1227,9 @@ (define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
(define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90
UNSPEC_VCMUL180 UNSPEC_VCMUL270])
+(define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA_CONJ "a")
+ (UNSPEC_VCMLA180 "s") (UNSPEC_VCMLA180_CONJ "s")])
+
(define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8")
(UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8")
(UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8 "uhsub8")
@@ -1723,3 +1753,13 @@ (define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S])
(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
+
+;; Define iterators for VCMLA operations
+(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
+ UNSPEC_VCMLA_CONJ
+ UNSPEC_VCMLA180
+ UNSPEC_VCMLA180_CONJ])
+
+;; Define iterators for VCMLA operations as MUL
+(define_int_iterator VCMUL_OP [UNSPEC_VCMUL
+ UNSPEC_VCMUL_CONJ])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 62ff12365ab3f92f177704927d230fefc415f1cb..465f71c4eee5f77e4d5904e8508c4134d1c9573f 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -4101,15 +4101,16 @@ (define_insn "mve_vaddlvaq_p_<supf>v4si"
(define_insn "mve_vcmlaq<mve_rot><mode>"
[
(set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
- (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz")
- (match_operand:MVE_0 2 "s_register_operand" "w,w")
- (match_operand:MVE_0 3 "s_register_operand" "w,w")]
- VCMLA))
+ (plus:MVE_0 (match_operand:MVE_0 1 "reg_or_zero_operand" "Dz,0")
+ (unspec:MVE_0
+ [(match_operand:MVE_0 2 "s_register_operand" "w,w")
+ (match_operand:MVE_0 3 "s_register_operand" "w,w")]
+ VCMLA)))
]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
"@
- vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
- vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
+ vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
+ vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
[(set_attr "type" "mve_move")
])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index e904db97ea7bd4cb0f32199038ace3d334ffb8f9..fec2cc91d24b6eff7b6fc8fdd54f39b3d646c468 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2952,6 +2952,25 @@ (define_insn "neon_vcmlaq_lane<rot><mode>"
[(set_attr "type" "neon_fcmla")]
)
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+ [(set (match_operand:VDF 0 "register_operand")
+ (unspec:VDF [(match_operand:VDF 1 "register_operand")
+ (match_operand:VDF 2 "register_operand")]
+ VCMUL_OP))]
+ "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
+{
+ rtx res1 = gen_reg_rtx (<MODE>mode);
+ rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+ emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp,
+ operands[2], operands[1]));
+ emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1,
+ operands[2], operands[1]));
+ DONE;
+})
+
;; These instructions map to the __builtins for the Dot Product operations.
(define_insn "neon_<sup>dot<vsi2qi>"
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 97a803e8da50c0119d15bcd4af47c298d3758c47..c6ebb6fc2b6a8d9e46f126dd857222a892c84093 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -510,10 +510,13 @@ (define_c_enum "unspec" [
UNSPEC_VCMLA90
UNSPEC_VCMLA180
UNSPEC_VCMLA270
+ UNSPEC_VCMLA_CONJ
+ UNSPEC_VCMLA180_CONJ
UNSPEC_VCMUL
UNSPEC_VCMUL90
UNSPEC_VCMUL180
UNSPEC_VCMUL270
+ UNSPEC_VCMUL_CONJ
UNSPEC_MATMUL_S
UNSPEC_MATMUL_U
UNSPEC_MATMUL_US
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index ff448da126b2250605d772ad423c70c16b753338..692b28ea8ccb18abac016a0c1b45ac7d0bf073d4 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -215,6 +215,63 @@ (define_expand "cadd<rot><mode>3"
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
)
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+ [(set (match_operand:VQ_HSF 0 "register_operand")
+ (unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
+ (match_operand:VQ_HSF 2 "register_operand")]
+ VCMUL_OP))]
+ "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT))
+ && !BYTES_BIG_ENDIAN"
+{
+ rtx res1 = gen_reg_rtx (<MODE>mode);
+ if (TARGET_COMPLEX)
+ {
+ rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+ emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, tmp,
+ operands[2], operands[1]));
+ }
+ else
+ emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, CONST0_RTX (<MODE>mode),
+ operands[2], operands[1]));
+
+ emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], res1,
+ operands[2], operands[1]));
+ DONE;
+})
+
+(define_expand "arm_vcmla<rot><mode>"
+ [(set (match_operand:VF 0 "register_operand")
+ (plus:VF (match_operand:VF 1 "register_operand")
+ (unspec:VF [(match_operand:VF 2 "register_operand")
+ (match_operand:VF 3 "register_operand")]
+ VCMLA)))]
+ "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+ && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
+)
+
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cml<fcmac1><conj_op><mode>4"
+ [(set (match_operand:VF 0 "register_operand")
+ (plus:VF (match_operand:VF 1 "register_operand")
+ (unspec:VF [(match_operand:VF 2 "register_operand")
+ (match_operand:VF 3 "register_operand")]
+ VCMLA_OP)))]
+ "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+ && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_arm_vcmla<rotsplit1><mode> (tmp, operands[1],
+ operands[3], operands[2]));
+ emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], tmp,
+ operands[3], operands[2]));
+ DONE;
+})
+
(define_expand "movmisalign<mode>"
[(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
(unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
^ permalink raw reply [flat|nested] 2+ messages in thread
* RE: [PATCH]Arm: Add NEON and MVE complex mul, mla and mls patterns.
2021-01-21 18:54 [PATCH]Arm: Add NEON and MVE complex mul, mla and mls patterns Tamar Christina
@ 2021-01-22 9:40 ` Kyrylo Tkachov
0 siblings, 0 replies; 2+ messages in thread
From: Kyrylo Tkachov @ 2021-01-22 9:40 UTC (permalink / raw)
To: Tamar Christina, gcc-patches
Cc: nd, Ramana Radhakrishnan, Richard Earnshaw, nickc
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: 21 January 2021 18:54
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH]Arm: Add NEON and MVE complex mul, mla and mls
> patterns.
>
> Hi All,
>
> This adds implementation for the optabs for complex operations. With this
> the
> following C code:
>
> void g (float complex a[restrict N], float complex b[restrict N],
> float complex c[restrict N])
> {
> for (int i=0; i < N; i++)
> c[i] = a[i] * b[i];
> }
>
> generates
>
>
> NEON:
>
> g:
> vmov.f32 q11, #0.0 @ v4sf
> add r3, r2, #1600
> .L2:
> vmov q8, q11 @ v4sf
> vld1.32 {q10}, [r1]!
> vld1.32 {q9}, [r0]!
> vcmla.f32 q8, q9, q10, #0
> vcmla.f32 q8, q9, q10, #90
> vst1.32 {q8}, [r2]!
> cmp r3, r2
> bne .L2
> bx lr
>
> MVE:
>
> g:
> push {lr}
> mov lr, #100
> dls lr, lr
> .L2:
> vldrw.32 q1, [r1], #16
> vldrw.32 q2, [r0], #16
> vcmul.f32 q3, q2, q1, #0
> vcmla.f32 q3, q2, q1, #90
> vstrw.32 q3, [r2], #16
> le lr, .L2
> ldr pc, [sp], #4
>
> instead of
>
> g:
> add r3, r2, #1600
> .L2:
> vld2.32 {d20-d23}, [r0]!
> vld2.32 {d16-d19}, [r1]!
> vmul.f32 q14, q11, q9
> vmul.f32 q15, q11, q8
> vneg.f32 q14, q14
> vfma.f32 q15, q10, q9
> vfma.f32 q14, q10, q8
> vmov q13, q15 @ v4sf
> vmov q12, q14 @ v4sf
> vst2.32 {d24-d27}, [r2]!
> cmp r3, r2
> bne .L2
> bx lr
>
> and
>
> g:
> add r3, r2, #1600
> .L2:
> vld2.32 {d20-d23}, [r0]!
> vld2.32 {d16-d19}, [r1]!
> vmul.f32 q15, q10, q8
> vmul.f32 q14, q10, q9
> vmls.f32 q15, q11, q9
> vmla.f32 q14, q11, q8
> vmov q12, q15 @ v4sf
> vmov q13, q14 @ v4sf
> vst2.32 {d24-d27}, [r2]!
> cmp r3, r2
> bne .L2
> bx lr
>
> respectively.
>
> Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
>
> Execution tests verified with QEMU.
>
> Generic tests for these are in the mid-end and I will enable them with a
> different patch.
>
> Ok for master?
Ok. Thanks for your perseverance.
Kyrill
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/arm/iterators.md (rotsplit1, rotsplit2, conj_op, fcmac1,
> VCMLA_OP, VCMUL_OP): New.
> * config/arm/mve.md (mve_vcmlaq<mve_rot><mode>): Support
> vec_dup 0.
> * config/arm/neon.md (cmul<conj_op><mode>3): New.
> * config/arm/unspecs.md (UNSPEC_VCMLA_CONJ,
> UNSPEC_VCMLA180_CONJ,
> UNSPEC_VCMUL_CONJ): New.
> * config/arm/vec-common.md (cmul<conj_op><mode>3,
> arm_vcmla<rot><mode>,
> cml<fcmac1><conj_op><mode>4): New.
>
> --- inline copy of patch --
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index
> 2e0aacbd3f742538073e441b53fcffc45e37c790..b9027905307fe19d60d164c
> ef23dac6ab119cd9b 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -1186,6 +1186,33 @@ (define_int_attr rot [(UNSPEC_VCADD90 "90")
> (UNSPEC_VCMLA180 "180")
> (UNSPEC_VCMLA270 "270")])
>
> +;; The complex operations when performed on a real complex number
> require two
> +;; instructions to perform the operation. e.g. complex multiplication
> requires
> +;; two VCMUL with a particular rotation value.
> +;;
> +;; These values can be looked up in rotsplit1 and rotsplit2. as an example
> +;; VCMUL needs the first instruction to use #0 and the second #90.
> +(define_int_attr rotsplit1 [(UNSPEC_VCMLA "0")
> + (UNSPEC_VCMLA_CONJ "0")
> + (UNSPEC_VCMUL "0")
> + (UNSPEC_VCMUL_CONJ "0")
> + (UNSPEC_VCMLA180 "180")
> + (UNSPEC_VCMLA180_CONJ "180")])
> +
> +(define_int_attr rotsplit2 [(UNSPEC_VCMLA "90")
> + (UNSPEC_VCMLA_CONJ "270")
> + (UNSPEC_VCMUL "90")
> + (UNSPEC_VCMUL_CONJ "270")
> + (UNSPEC_VCMLA180 "270")
> + (UNSPEC_VCMLA180_CONJ "90")])
> +
> +(define_int_attr conj_op [(UNSPEC_VCMLA180 "")
> + (UNSPEC_VCMLA180_CONJ "_conj")
> + (UNSPEC_VCMLA "")
> + (UNSPEC_VCMLA_CONJ "_conj")
> + (UNSPEC_VCMUL "")
> + (UNSPEC_VCMUL_CONJ "_conj")])
> +
> (define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
> (UNSPEC_VCADD270 "_rot270")
> (UNSPEC_VCMLA "")
> @@ -1200,6 +1227,9 @@ (define_int_attr mve_rot [(UNSPEC_VCADD90
> "_rot90")
> (define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90
> UNSPEC_VCMUL180 UNSPEC_VCMUL270])
>
> +(define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA_CONJ "a")
> + (UNSPEC_VCMLA180 "s")
> (UNSPEC_VCMLA180_CONJ "s")])
> +
> (define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8
> "qsub8")
> (UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8
> "shsub8")
> (UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8
> "uhsub8")
> @@ -1723,3 +1753,13 @@ (define_int_iterator VADCQ_M [VADCQ_M_U
> VADCQ_M_S])
> (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
> (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
> (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
> +
> +;; Define iterators for VCMLA operations
> +(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
> + UNSPEC_VCMLA_CONJ
> + UNSPEC_VCMLA180
> + UNSPEC_VCMLA180_CONJ])
> +
> +;; Define iterators for VCMLA operations as MUL
> +(define_int_iterator VCMUL_OP [UNSPEC_VCMUL
> + UNSPEC_VCMUL_CONJ])
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index
> 62ff12365ab3f92f177704927d230fefc415f1cb..465f71c4eee5f77e4d5904e8
> 508c4134d1c9573f 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -4101,15 +4101,16 @@ (define_insn "mve_vaddlvaq_p_<supf>v4si"
> (define_insn "mve_vcmlaq<mve_rot><mode>"
> [
> (set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
> - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand"
> "0,Dz")
> - (match_operand:MVE_0 2 "s_register_operand" "w,w")
> - (match_operand:MVE_0 3 "s_register_operand" "w,w")]
> - VCMLA))
> + (plus:MVE_0 (match_operand:MVE_0 1 "reg_or_zero_operand"
> "Dz,0")
> + (unspec:MVE_0
> + [(match_operand:MVE_0 2 "s_register_operand" "w,w")
> + (match_operand:MVE_0 3 "s_register_operand" "w,w")]
> + VCMLA)))
> ]
> "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> "@
> - vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
> - vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
> + vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
> + vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
> [(set_attr "type" "mve_move")
> ])
>
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index
> e904db97ea7bd4cb0f32199038ace3d334ffb8f9..fec2cc91d24b6eff7b6fc8fdd
> 54f39b3d646c468 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -2952,6 +2952,25 @@ (define_insn "neon_vcmlaq_lane<rot><mode>"
> [(set_attr "type" "neon_fcmla")]
> )
>
> +;; The complex mul operations always need to expand to two instructions.
> +;; The first operation does half the computation and the second does the
> +;; remainder. Because of this, expand early.
> +(define_expand "cmul<conj_op><mode>3"
> + [(set (match_operand:VDF 0 "register_operand")
> + (unspec:VDF [(match_operand:VDF 1 "register_operand")
> + (match_operand:VDF 2 "register_operand")]
> + VCMUL_OP))]
> + "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
> +{
> + rtx res1 = gen_reg_rtx (<MODE>mode);
> + rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
> + emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp,
> + operands[2], operands[1]));
> + emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1,
> + operands[2], operands[1]));
> + DONE;
> +})
> +
>
> ;; These instructions map to the __builtins for the Dot Product operations.
> (define_insn "neon_<sup>dot<vsi2qi>"
> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> index
> 97a803e8da50c0119d15bcd4af47c298d3758c47..c6ebb6fc2b6a8d9e46f126d
> d857222a892c84093 100644
> --- a/gcc/config/arm/unspecs.md
> +++ b/gcc/config/arm/unspecs.md
> @@ -510,10 +510,13 @@ (define_c_enum "unspec" [
> UNSPEC_VCMLA90
> UNSPEC_VCMLA180
> UNSPEC_VCMLA270
> + UNSPEC_VCMLA_CONJ
> + UNSPEC_VCMLA180_CONJ
> UNSPEC_VCMUL
> UNSPEC_VCMUL90
> UNSPEC_VCMUL180
> UNSPEC_VCMUL270
> + UNSPEC_VCMUL_CONJ
> UNSPEC_MATMUL_S
> UNSPEC_MATMUL_U
> UNSPEC_MATMUL_US
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> common.md
> index
> ff448da126b2250605d772ad423c70c16b753338..692b28ea8ccb18abac016a
> 0c1b45ac7d0bf073d4 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -215,6 +215,63 @@ (define_expand "cadd<rot><mode>3"
> && ARM_HAVE_<MODE>_ARITH))
> && !BYTES_BIG_ENDIAN"
> )
>
> +;; The complex mul operations always need to expand to two instructions.
> +;; The first operation does half the computation and the second does the
> +;; remainder. Because of this, expand early.
> +(define_expand "cmul<conj_op><mode>3"
> + [(set (match_operand:VQ_HSF 0 "register_operand")
> + (unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
> + (match_operand:VQ_HSF 2 "register_operand")]
> + VCMUL_OP))]
> + "(TARGET_COMPLEX || (TARGET_HAVE_MVE &&
> TARGET_HAVE_MVE_FLOAT))
> + && !BYTES_BIG_ENDIAN"
> +{
> + rtx res1 = gen_reg_rtx (<MODE>mode);
> + if (TARGET_COMPLEX)
> + {
> + rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
> + emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, tmp,
> + operands[2], operands[1]));
> + }
> + else
> + emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, CONST0_RTX
> (<MODE>mode),
> + operands[2], operands[1]));
> +
> + emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], res1,
> + operands[2], operands[1]));
> + DONE;
> +})
> +
> +(define_expand "arm_vcmla<rot><mode>"
> + [(set (match_operand:VF 0 "register_operand")
> + (plus:VF (match_operand:VF 1 "register_operand")
> + (unspec:VF [(match_operand:VF 2 "register_operand")
> + (match_operand:VF 3 "register_operand")]
> + VCMLA)))]
> + "(TARGET_COMPLEX || (TARGET_HAVE_MVE &&
> TARGET_HAVE_MVE_FLOAT
> + && ARM_HAVE_<MODE>_ARITH))
> && !BYTES_BIG_ENDIAN"
> +)
> +
> +;; The complex mla/mls operations always need to expand to two
> instructions.
> +;; The first operation does half the computation and the second does the
> +;; remainder. Because of this, expand early.
> +(define_expand "cml<fcmac1><conj_op><mode>4"
> + [(set (match_operand:VF 0 "register_operand")
> + (plus:VF (match_operand:VF 1 "register_operand")
> + (unspec:VF [(match_operand:VF 2 "register_operand")
> + (match_operand:VF 3 "register_operand")]
> + VCMLA_OP)))]
> + "(TARGET_COMPLEX || (TARGET_HAVE_MVE &&
> TARGET_HAVE_MVE_FLOAT
> + && ARM_HAVE_<MODE>_ARITH))
> && !BYTES_BIG_ENDIAN"
> +{
> + rtx tmp = gen_reg_rtx (<MODE>mode);
> + emit_insn (gen_arm_vcmla<rotsplit1><mode> (tmp, operands[1],
> + operands[3], operands[2]));
> + emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], tmp,
> + operands[3], operands[2]));
> + DONE;
> +})
> +
> (define_expand "movmisalign<mode>"
> [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> (unspec:VDQX [(match_operand:VDQX 1
> "neon_perm_struct_or_reg_operand")]
>
>
> --
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2021-01-22 9:40 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-21 18:54 [PATCH]Arm: Add NEON and MVE complex mul, mla and mls patterns Tamar Christina
2021-01-22 9:40 ` Kyrylo Tkachov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).