* [PATCH][1/2][committed] aarch64: Reimplement (R){ADD,SUB}HN intrinsics with RTL codes
@ 2023-05-04 14:24 Kyrylo Tkachov
0 siblings, 0 replies; only message in thread
From: Kyrylo Tkachov @ 2023-05-04 14:24 UTC (permalink / raw)
To: gcc-patches
[-- Attachment #1: Type: text/plain, Size: 1440 bytes --]
Hi all,
We can implement the halving-narrowing add/sub patterns with standard RTL codes as well rather than relying on unspecs.
This patch handles the low-part ones and the second patch does the high-part ones and removes the unspecs themselves.
The operation ADDHN on V4SI, for example, is represented as (truncate:V4HI ((src1:V4SI + src2:V4SI) >> 16))
and RADDHN as (truncate:V4HI ((src1:V4SI + src2:V4SI + (1 << 15)) >> 16)).
Taking this opportunity I specified the patterns returning the narrow mode and annotated them with the
<vczle><vczbe> define_subst rules to get the vec_concat-zero meta-patterns too. This allows us to simplify
the expanders somewhat too. Tests are added to check that the combinations work.
Bootstrapped and tested on aarch64-none-linux-gnu. Also tested on aarch64_be-none-elf.
Pushing to trunk.
Thanks,
Kyrill
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (aarch64_<sur><addsub>hn<mode>_insn_le):
Delete.
(aarch64_<optab>hn<mode>_insn<vczle><vczbe>): New define_insn.
(aarch64_<sur><addsub>hn<mode>_insn_be): Delete.
(aarch64_r<optab>hn<mode>_insn<vczle><vczbe>): New define_insn.
(aarch64_<sur><addsub>hn<mode>): Delete.
(aarch64_<optab>hn<mode>): New define_expand.
(aarch64_r<optab>hn<mode>): Likewise.
* config/aarch64/predicates.md (aarch64_simd_raddsubhn_imm_vec):
New predicate.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/pr99195_4.c: New test.
[-- Attachment #2: addhn.patch --]
[-- Type: application/octet-stream, Size: 6547 bytes --]
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 705c4b0b4b404355adb5f738be936dad46488a79..421173e7079da3e00e3464c3c5676f8050cd75c2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4955,49 +4955,61 @@ (define_insn "aarch64_<sur>h<addsub><mode><vczle><vczbe>"
;; <r><addsub>hn<q>.
-(define_insn "aarch64_<sur><addsub>hn<mode>_insn_le"
- [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
- (vec_concat:<VNARROWQ2>
- (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
- (match_operand:VQN 2 "register_operand" "w")]
- ADDSUBHN)
- (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")))]
- "TARGET_SIMD && !BYTES_BIG_ENDIAN"
- "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
- [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
+(define_insn "aarch64_<optab>hn<mode>_insn<vczle><vczbe>"
+ [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+ (truncate:<VNARROWQ>
+ (ashiftrt:VQN
+ (ADDSUB:VQN (match_operand:VQN 1 "register_operand" "w")
+ (match_operand:VQN 2 "register_operand" "w"))
+ (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top"))))]
+ "TARGET_SIMD"
+ "<optab>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
+ [(set_attr "type" "neon_<optab>_halve_narrow_q")]
)
-(define_insn "aarch64_<sur><addsub>hn<mode>_insn_be"
- [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
- (vec_concat:<VNARROWQ2>
- (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")
- (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
- (match_operand:VQN 2 "register_operand" "w")]
- ADDSUBHN)))]
- "TARGET_SIMD && BYTES_BIG_ENDIAN"
- "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
- [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
+(define_insn "aarch64_r<optab>hn<mode>_insn<vczle><vczbe>"
+ [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+ (truncate:<VNARROWQ>
+ (ashiftrt:VQN
+ (plus:VQN
+ (ADDSUB:VQN (match_operand:VQN 1 "register_operand" "w")
+ (match_operand:VQN 2 "register_operand" "w"))
+ (match_operand:VQN 3 "aarch64_simd_raddsubhn_imm_vec"))
+ (match_operand:VQN 4 "aarch64_simd_shift_imm_vec_exact_top"))))]
+ "TARGET_SIMD"
+ "r<optab>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
+ [(set_attr "type" "neon_<optab>_halve_narrow_q")]
)
-(define_expand "aarch64_<sur><addsub>hn<mode>"
+(define_expand "aarch64_<optab>hn<mode>"
[(set (match_operand:<VNARROWQ> 0 "register_operand")
- (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")
- (match_operand:VQN 2 "register_operand")]
- ADDSUBHN))]
+ (ADDSUB:VQN (match_operand:VQN 1 "register_operand")
+ (match_operand:VQN 2 "register_operand")))]
"TARGET_SIMD"
{
- rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
- if (BYTES_BIG_ENDIAN)
- emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_be (tmp, operands[1],
- operands[2], CONST0_RTX (<VNARROWQ>mode)));
- else
- emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_le (tmp, operands[1],
- operands[2], CONST0_RTX (<VNARROWQ>mode)));
+ rtx shft
+ = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+ GET_MODE_UNIT_BITSIZE (<MODE>mode) / 2);
+ emit_insn (gen_aarch64_<optab>hn<mode>_insn (operands[0], operands[1],
+ operands[2], shft));
+ DONE;
+ }
+)
- /* The intrinsic expects a narrow result, so emit a subreg that will get
- optimized away as appropriate. */
- emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
- <VNARROWQ2>mode));
+(define_expand "aarch64_r<optab>hn<mode>"
+ [(set (match_operand:<VNARROWQ> 0 "register_operand")
+ (ADDSUB:VQN (match_operand:VQN 1 "register_operand")
+ (match_operand:VQN 2 "register_operand")))]
+ "TARGET_SIMD"
+ {
+ rtx shft
+ = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+ GET_MODE_UNIT_BITSIZE (<MODE>mode) / 2);
+ rtx rnd
+ = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+ HOST_WIDE_INT_1U << (GET_MODE_UNIT_BITSIZE (<MODE>mode) / 2 - 1));
+ emit_insn (gen_aarch64_r<optab>hn<mode>_insn (operands[0], operands[1],
+ operands[2], rnd, shft));
DONE;
}
)
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 242f10aea1f483cc2e54435701d62df36301ad39..73f7ade87074cd05c2538f73c806503c5dafd364 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -593,6 +593,12 @@ (define_predicate "aarch64_simd_rshrn_imm_vec"
HOST_WIDE_INT_1U
<< (GET_MODE_UNIT_BITSIZE (mode) - 1))")))
+(define_predicate "aarch64_simd_raddsubhn_imm_vec"
+ (and (match_code "const_vector")
+ (match_test "aarch64_const_vec_all_same_in_range_p (op, 1,
+ HOST_WIDE_INT_1U
+ << (GET_MODE_UNIT_BITSIZE (mode) / 2 - 1))")))
+
(define_predicate "aarch64_simd_shift_imm_bitsize_qi"
(and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 0, 8)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..b6ef15b6a972366979125252c60cc5d6996151ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c
@@ -0,0 +1,35 @@
+/* PR target/99195. */
+/* Check that we take advantage of 64-bit Advanced SIMD operations clearing
+ the top half of the vector register and no explicit zeroing instructions
+ are emitted. */
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+#define MYOP(OT,IT,IMT,OP,IS,OS) \
+OT \
+foo_##OP##_##OS (IT a, IT b) \
+{ \
+ IMT zeros = vcreate_##OS (0); \
+ return vcombine_##OS (v##OP##_##IS (a, b), zeros); \
+}
+
+
+#define FUNC(OT,IT,IMT,IS,OS) \
+MYOP (OT, IT, IMT, addhn, IS, OS) \
+MYOP (OT, IT, IMT, subhn, IS, OS) \
+MYOP (OT, IT, IMT, raddhn, IS, OS) \
+MYOP (OT, IT, IMT, rsubhn, IS, OS)
+
+FUNC (int8x16_t, int16x8_t, int8x8_t, s16, s8)
+FUNC (int16x8_t, int32x4_t, int16x4_t, s32, s16)
+FUNC (int32x4_t, int64x2_t, int32x2_t, s64, s32)
+
+FUNC (uint8x16_t, uint16x8_t, uint8x8_t, u16, u8)
+FUNC (uint16x8_t, uint32x4_t, uint16x4_t, u32, u16)
+FUNC (uint32x4_t, uint64x2_t, uint32x2_t, u64, u32)
+
+/* { dg-final { scan-assembler-not {\tfmov\t} } } */
+/* { dg-final { scan-assembler-not {\tmov\t} } } */
+
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-05-04 14:24 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-04 14:24 [PATCH][1/2][committed] aarch64: Reimplement (R){ADD,SUB}HN intrinsics with RTL codes Kyrylo Tkachov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).