> -----Original Message----- > From: Richard Sandiford > Sent: Monday, November 14, 2022 9:59 PM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw > ; Marcus Shawcroft > ; Kyrylo Tkachov > Subject: Re: [PATCH 2/2]AArch64 Perform more late folding of reg moves > and shifts which arrive after expand > > (Sorry, immediately following up to myself for a second time recently.) > > Richard Sandiford writes: > > Tamar Christina writes: > >>> > >>> The same thing ought to work for smov, so it would be good to do both. > >>> That would also make the split between the original and new patterns > >>> more > >>> obvious: left shift for the old pattern, right shift for the new pattern. > >>> > >> > >> Done, though because umov can do multilevel extensions I couldn't > >> combine them Into a single pattern. > > > > Hmm, but the pattern is: > > > > (define_insn "*si3_insn2_uxtw" > > [(set (match_operand:GPI 0 "register_operand" "=r,r,r") > > (zero_extend:GPI (LSHIFTRT_ONLY:SI > > (match_operand:SI 1 "register_operand" "w,r,r") > > (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" > "Usl,Uss,r"))))] > > > > GPI is just SI or DI, so in the SI case we're zero-extending SI to SI, > > which isn't a valid operation. The original patch was just for > > extending to DI, which seems correct. The choice between printing %x > > for smov and %w for umov can then depend on the code. You're right, GPI made no sense here. Fixed. > > My original comment quoted above was about using smov in the zero- > extend pattern. I.e. 
the original: > > (define_insn "*si3_insn2_uxtw" > [(set (match_operand:DI 0 "register_operand" "=r,?r,r") > (zero_extend:DI (LSHIFTRT:SI > (match_operand:SI 1 "register_operand" "w,r,r") > (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" > "Usl,Uss,r"))))] > > could instead be: > > (define_insn "*si3_insn2_uxtw" > [(set (match_operand:DI 0 "register_operand" "=r,?r,r") > (zero_extend:DI (SHIFTRT:SI > (match_operand:SI 1 "register_operand" "w,r,r") > (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" > "Usl,Uss,r"))))] > > with the pattern using "smov %w0, ..." for ashiftft case. Almost, except the non-immediate cases don't work with shifts. i.e. a right shift can't be used to sign extend from 32 to 64 bits. I've merged the cases but added a guard for this. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64.md (*si3_insn_uxtw): Split SHIFT into left and right ones. (*aarch64_ashr_sisd_or_int_3): Support smov. (*si3_insn2_xtw): New. * config/aarch64/constraints.md (Usl): New. * config/aarch64/iterators.md (is_zeroE, extend_op): New. gcc/testsuite/ChangeLog: * gcc.target/aarch64/shift-read_1.c: New test. * gcc.target/aarch64/shift-read_2.c: New test. 
--- inline copy of patch --- diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 39e65979528fb7f748ed456399ca38f929dba1d4..4c181a96e555c2a58c59fc991000b2a2fa9bd244 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -5425,20 +5425,42 @@ (define_split ;; Arithmetic right shift using SISD or Integer instruction (define_insn "*aarch64_ashr_sisd_or_int_3" - [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w") + [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w") (ashiftrt:GPI - (match_operand:GPI 1 "register_operand" "r,r,w,w,w") + (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w") (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" - "Us,r,Us,w,0")))] + "Us,r,Us,Usl,w,0")))] "" - "@ - asr\t%0, %1, %2 - asr\t%0, %1, %2 - sshr\t%0, %1, %2 - # - #" - [(set_attr "type" "bfx,shift_reg,neon_shift_imm,neon_shift_reg,neon_shift_reg") - (set_attr "arch" "*,*,simd,simd,simd")] + { + switch (which_alternative) + { + case 0: + return "asr\t%0, %1, %2"; + case 1: + return "asr\t%0, %1, %2"; + case 2: + return "sshr\t%0, %1, %2"; + case 3: + { + int val = INTVAL (operands[2]); + int size = 32 - val; + + if (size == 16) + return "smov\\t%0, %1.h[1]"; + if (size == 8) + return "smov\\t%0, %1.b[3]"; + gcc_unreachable (); + } + case 4: + return "#"; + case 5: + return "#"; + default: + gcc_unreachable (); + } + } + [(set_attr "type" "bfx,shift_reg,neon_shift_imm,neon_to_gp, neon_shift_reg,neon_shift_reg") + (set_attr "arch" "*,*,simd,simd,simd,simd")] ) (define_split @@ -5548,7 +5570,7 @@ (define_insn "*rol3_insn" ;; zero_extend version of shifts (define_insn "*si3_insn_uxtw" [(set (match_operand:DI 0 "register_operand" "=r,r") - (zero_extend:DI (SHIFT_no_rotate:SI + (zero_extend:DI (SHIFT_arith:SI (match_operand:SI 1 "register_operand" "r,r") (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))] "" @@ -5583,6 +5605,37 @@ (define_insn "*rolsi3_insn_uxtw" [(set_attr "type" "rotate_imm")] ) 
+(define_insn "*si3_insn2_xtw" + [(set (match_operand:DI 0 "register_operand" "=r,r,r") + (:DI (SHIFTRT:SI + (match_operand:SI 1 "register_operand" "w,r,r") + (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))] + " || satisfies_constraint_Usl (operands[2])" + { + switch (which_alternative) + { + case 0: + { + int val = INTVAL (operands[2]); + int size = 32 - val; + + if (size == 16) + return "mov\\t%x0, %1.h[1]"; + if (size == 8) + return "mov\\t%x0, %1.b[3]"; + gcc_unreachable (); + } + case 1: + return "\\t%w0, %w1, %2"; + case 2: + return "\\t%w0, %w1, %w2"; + default: + gcc_unreachable (); + } + } + [(set_attr "type" "neon_to_gp,bfx,shift_reg")] +) + (define_insn "*3_insn" [(set (match_operand:SHORT 0 "register_operand" "=r") (ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r") diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index 29efb6c0cff7574c9b239ef358acaca96dd75d03..c2a696cb77f49cae23239b0ed8a8aa5168f8898c 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -171,6 +171,14 @@ (define_constraint "Uss" (and (match_code "const_int") (match_test "(unsigned HOST_WIDE_INT) ival < 32"))) +(define_constraint "Usl" + "@internal + A constraint that matches an immediate shift constant in SImode that has an + exact mode available to use." + (and (match_code "const_int") + (and (match_test "satisfies_constraint_Uss (op)") + (match_test "(32 - ival == 8) || (32 - ival == 16)")))) + (define_constraint "Usn" "A constant that can be used with a CCMN operation (once negated)." 
(and (match_code "const_int") diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 7c69b124f076b4fb2540241f287c6999c32123c1..df72c079f218db9727a96924cab496e91ce6df59 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -2149,8 +2149,8 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")]) ;; This code iterator allows the various shifts supported on the core (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate]) -;; This code iterator allows all shifts except for rotates. -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt]) +;; This code iterator allows arithmetic shifts +(define_code_iterator SHIFT_arith [ashift ashiftrt]) ;; This code iterator allows the shifts supported in arithmetic instructions (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt]) @@ -2378,9 +2378,18 @@ (define_code_attr shift [(ashift "lsl") (ashiftrt "asr") (define_code_attr is_rotl [(ashift "0") (ashiftrt "0") (lshiftrt "0") (rotatert "0") (rotate "1")]) +;; True if zero extending operation or not +(define_code_attr is_zeroE [(ashift "false") (ashiftrt "false") + (lshiftrt "true")]) + + ;; Op prefix for shift right and accumulate. (define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")]) +;; Extensions that can be performed with Op +(define_code_attr extend_op [(ashiftrt "sign_extend") + (lshiftrt "zero_extend")]) + ;; op prefix for shift right and narrow. 
(define_code_attr srn_op [(ashiftrt "r") (lshiftrt "")]) diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c new file mode 100644 index 0000000000000000000000000000000000000000..864cfcb1650ae6553a18e753c8d8d0e85cd0ba7b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c @@ -0,0 +1,73 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +#include + +/* +** foor: +** umov w0, v0.h\[3\] +** ret +*/ +unsigned int foor (uint32x4_t x) +{ + return x[1] >> 16; +} + +/* +** fool: +** umov w0, v0.s\[1\] +** lsl w0, w0, 16 +** ret +*/ +unsigned int fool (uint32x4_t x) +{ + return x[1] << 16; +} + +/* +** foor2: +** umov w0, v0.h\[7\] +** ret +*/ +unsigned short foor2 (uint32x4_t x) +{ + return x[3] >> 16; +} + +/* +** fool2: +** fmov w0, s0 +** lsl w0, w0, 16 +** ret +*/ +unsigned int fool2 (uint32x4_t x) +{ + return x[0] << 16; +} + +typedef int v4si __attribute__ ((vector_size (16))); + +/* +** bar: +** addv s0, v0.4s +** fmov w0, s0 +** lsr w1, w0, 16 +** add w0, w1, w0, uxth +** ret +*/ +int bar (v4si x) +{ + unsigned int sum = vaddvq_s32 (x); + return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16)); +} + +/* +** foo: +** lsr w0, w0, 16 +** ret +*/ +unsigned short foo (unsigned x) +{ + return x >> 16; +} diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c new file mode 100644 index 0000000000000000000000000000000000000000..bdc214d1941807ce5aa21c369fcfe23c1927e98b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c @@ -0,0 +1,84 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +#include + +/* +** foor_1: +** smov w0, v0.h\[3\] +** ret +*/ +int32_t foor_1 (int32x4_t x) +{ + return x[1] >> 16; +} + +/* +** foor_2: +** smov x0, v0.h\[3\] +** 
ret +*/ +int64_t foor_2 (int32x4_t x) +{ + return x[1] >> 16; +} + + +/* +** fool: +** [su]mov w0, v0.s\[1\] +** lsl w0, w0, 16 +** ret +*/ +int fool (int32x4_t x) +{ + return x[1] << 16; +} + +/* +** foor2: +** umov w0, v0.h\[7\] +** ret +*/ +short foor2 (int32x4_t x) +{ + return x[3] >> 16; +} + +/* +** fool2: +** fmov w0, s0 +** lsl w0, w0, 16 +** ret +*/ +int fool2 (int32x4_t x) +{ + return x[0] << 16; +} + +typedef int v4si __attribute__ ((vector_size (16))); + +/* +** bar: +** addv s0, v0.4s +** fmov w0, s0 +** lsr w1, w0, 16 +** add w0, w1, w0, uxth +** ret +*/ +int bar (v4si x) +{ + unsigned int sum = vaddvq_s32 (x); + return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16)); +} + +/* +** foo: +** lsr w0, w0, 16 +** ret +*/ +short foo (int x) +{ + return x >> 16; +}