* [PATCH]AArch64[RFC] Force complicated constant to memory when beneficial
@ 2021-08-31 13:26 Tamar Christina
2021-10-08 16:12 ` Richard Sandiford
0 siblings, 1 reply; 2+ messages in thread
From: Tamar Christina @ 2021-08-31 13:26 UTC (permalink / raw)
To: gcc-patches
Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
richard.sandiford
[-- Attachment #1: Type: text/plain, Size: 5468 bytes --]
Hi All,
Consider the following case
#include <arm_neon.h>
uint64_t
test4 (uint8x16_t input)
{
uint8x16_t bool_input = vshrq_n_u8(input, 7);
poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
vgetq_lane_p64(mask, 0));
poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
return vget_lane_u16((uint16x4_t)res, 3);
}
which generates (after my CSE patches):
test4:
ushr v0.16b, v0.16b, 7
mov x0, 16512
movk x0, 0x1020, lsl 16
movk x0, 0x408, lsl 32
movk x0, 0x102, lsl 48
fmov d1, x0
pmull v2.1q, v0.1d, v1.1d
dup v1.2d, v1.d[0]
pmull2 v0.1q, v0.2d, v1.2d
trn2 v2.8b, v2.8b, v0.8b
umov w0, v2.h[3]
ret
which is suboptimal since the constant is never needed on the genreg side and
should have been materialized on the SIMD side since the constant is so big
that it requires 5 instructions to create otherwise: 4 mov/movk and one fmov.
The problem is that the choice of on which side to materialize the constant can
only be done during reload. We may need an extra register (to hold the
addressing) and so can't be done after reload.
I have tried to support this with a pattern during reload, but the problem is I
can't seem to find a way to tell reload it should spill a constant under
condition x. Instead I tried with a split which reload selects when the
condition holds.
This has a couple of issues:
1. The pattern can be expanded late (could be fixed with !reload_completed).
2. Because it's split so late we can't seem to be able to share the anchors for
the ADRP.
3. Because it's split so late and basically reload doesn't know about the spill
and so the ADD lo12 isn't pushed into the addressing mode of the LDR.
I don't know how to properly fix these since I think the only way is for reload
to do the spill properly itself, but in this case not having the pattern makes it
avoid the mem pattern and pick r <- n instead followed by r -> w.
Thanks,
Tamar
gcc/ChangeLog:
* config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> w.
* config/aarch64/constraints.md (Dx): New.
--- inline copy of patch --
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
)
(define_insn_and_split "*movdi_aarch64"
- [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m, r, r, w,r,w, w")
- (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w ,r ,r,w, m,m, r, r, w,r,w,w")
+ (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
"(register_operand (operands[0], DImode)
|| aarch64_reg_or_zero (operands[1], DImode))"
"@
@@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
mov\\t%x0, %1
mov\\t%w0, %1
#
+ #
* return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
ldr\\t%x0, %1
ldr\\t%d0, %1
@@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
fmov\\t%d0, %d1
* return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
"(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
- && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+ && REG_P (operands[0])
+ && (GP_REGNUM_P (REGNO (operands[0]))
+ || (can_create_pseudo_p ()
+ && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
[(const_int 0)]
"{
- aarch64_expand_mov_immediate (operands[0], operands[1]);
+ if (GP_REGNUM_P (REGNO (operands[0])))
+ aarch64_expand_mov_immediate (operands[0], operands[1]);
+ else
+ {
+ rtx mem = force_const_mem (DImode, operands[1]);
+ gcc_assert (mem);
+ emit_move_insn (operands[0], mem);
+ }
DONE;
}"
;; The "mov_imm" type for CNTD is just a placeholder.
- [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
+ [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
neon_move")
- (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
+ (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
)
(define_insn "insv_imm<mode>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -474,6 +474,14 @@ (define_address_constraint "Dp"
An address valid for a prefetch instruction."
(match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
+(define_constraint "Dx"
+ "@internal
+ A constraint that matches an integer immediate operand not valid\
+ for AdvSIMD scalar operations in DImode."
+ (and (match_code "const_int")
+ (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
+ (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
+
(define_constraint "vgb"
"@internal
A constraint that matches an immediate offset valid for SVE LD1B
--
[-- Attachment #2: rb14775.patch --]
[-- Type: text/x-diff, Size: 3245 bytes --]
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
)
(define_insn_and_split "*movdi_aarch64"
- [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m, r, r, w,r,w, w")
- (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w ,r ,r,w, m,m, r, r, w,r,w,w")
+ (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
"(register_operand (operands[0], DImode)
|| aarch64_reg_or_zero (operands[1], DImode))"
"@
@@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
mov\\t%x0, %1
mov\\t%w0, %1
#
+ #
* return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
ldr\\t%x0, %1
ldr\\t%d0, %1
@@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
fmov\\t%d0, %d1
* return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
"(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
- && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+ && REG_P (operands[0])
+ && (GP_REGNUM_P (REGNO (operands[0]))
+ || (can_create_pseudo_p ()
+ && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
[(const_int 0)]
"{
- aarch64_expand_mov_immediate (operands[0], operands[1]);
+ if (GP_REGNUM_P (REGNO (operands[0])))
+ aarch64_expand_mov_immediate (operands[0], operands[1]);
+ else
+ {
+ rtx mem = force_const_mem (DImode, operands[1]);
+ gcc_assert (mem);
+ emit_move_insn (operands[0], mem);
+ }
DONE;
}"
;; The "mov_imm" type for CNTD is just a placeholder.
- [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
+ [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
neon_move")
- (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
+ (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
)
(define_insn "insv_imm<mode>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -474,6 +474,14 @@ (define_address_constraint "Dp"
An address valid for a prefetch instruction."
(match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
+(define_constraint "Dx"
+ "@internal
+ A constraint that matches an integer immediate operand not valid\
+ for AdvSIMD scalar operations in DImode."
+ (and (match_code "const_int")
+ (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
+ (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
+
(define_constraint "vgb"
"@internal
A constraint that matches an immediate offset valid for SVE LD1B
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH]AArch64[RFC] Force complicated constant to memory when beneficial
2021-08-31 13:26 [PATCH]AArch64[RFC] Force complicated constant to memory when beneficial Tamar Christina
@ 2021-10-08 16:12 ` Richard Sandiford
0 siblings, 0 replies; 2+ messages in thread
From: Richard Sandiford @ 2021-10-08 16:12 UTC (permalink / raw)
To: Tamar Christina
Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov
Catching up on backlog, sorry for the very late response:
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> Consider the following case
>
> #include <arm_neon.h>
>
> uint64_t
> test4 (uint8x16_t input)
> {
> uint8x16_t bool_input = vshrq_n_u8(input, 7);
> poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
> poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> vgetq_lane_p64(mask, 0));
> poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
> uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
> return vget_lane_u16((uint16x4_t)res, 3);
> }
>
> which generates (after my CSE patches):
>
> test4:
> ushr v0.16b, v0.16b, 7
> mov x0, 16512
> movk x0, 0x1020, lsl 16
> movk x0, 0x408, lsl 32
> movk x0, 0x102, lsl 48
> fmov d1, x0
> pmull v2.1q, v0.1d, v1.1d
> dup v1.2d, v1.d[0]
> pmull2 v0.1q, v0.2d, v1.2d
> trn2 v2.8b, v2.8b, v0.8b
> umov w0, v2.h[3]
> ret
>
> which is suboptimal since the constant is never needed on the genreg side and
> should have been materialized on the SIMD side since the constant is so big
> that it requires 5 instructions to create otherwise: 4 mov/movk and one fmov.
>
> The problem is that the choice of on which side to materialize the constant can
> only be done during reload. We may need an extra register (to hold the
> addressing) and so can't be done after reload.
>
> I have tried to support this with a pattern during reload, but the problem is I
> can't seem to find a way to tell reload it should spill a constant under
> condition x. Instead I tried with a split which reload selects when the
> condition holds.
If this is still an issue, one thing to try would be to put a "$" before
the "r" in the GPR alternative. If that doesn't work then yeah,
I think we're out of luck describing this directly. If "$" does work,
it'd be interesting to see whether "^" does too.
Thanks,
Richard
>
> This has a couple of issues:
>
> 1. The pattern can be expanded late (could be fixed with !reload_completed).
> 2. Because it's split so late we can't seem to be able to share the anchors for
> the ADRP.
> 3. Because it's split so late and basically reload doesn't know about the spill
> and so the ADD lo12 isn't pushed into the addressing mode of the LDR.
>
> I don't know how to properly fix these since I think the only way is for reload
> to do the spill properly itself, but in this case not having the pattern makes it
> avoid the mem pattern and pick r <- n instead followed by r -> w.
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> w.
> * config/aarch64/constraints.md (Dx): New.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
> )
>
> (define_insn_and_split "*movdi_aarch64"
> - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m, r, r, w,r,w, w")
> - (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
> + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w ,r ,r,w, m,m, r, r, w,r,w,w")
> + (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
> "(register_operand (operands[0], DImode)
> || aarch64_reg_or_zero (operands[1], DImode))"
> "@
> @@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
> mov\\t%x0, %1
> mov\\t%w0, %1
> #
> + #
> * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
> ldr\\t%x0, %1
> ldr\\t%d0, %1
> @@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
> fmov\\t%d0, %d1
> * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
> "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
> - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> + && REG_P (operands[0])
> + && (GP_REGNUM_P (REGNO (operands[0]))
> + || (can_create_pseudo_p ()
> + && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
> [(const_int 0)]
> "{
> - aarch64_expand_mov_immediate (operands[0], operands[1]);
> + if (GP_REGNUM_P (REGNO (operands[0])))
> + aarch64_expand_mov_immediate (operands[0], operands[1]);
> + else
> + {
> + rtx mem = force_const_mem (DImode, operands[1]);
> + gcc_assert (mem);
> + emit_move_insn (operands[0], mem);
> + }
> DONE;
> }"
> ;; The "mov_imm" type for CNTD is just a placeholder.
> - [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
> + [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
> load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
> neon_move")
> - (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
> + (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
> )
>
> (define_insn "insv_imm<mode>"
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -474,6 +474,14 @@ (define_address_constraint "Dp"
> An address valid for a prefetch instruction."
> (match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
>
> +(define_constraint "Dx"
> + "@internal
> + A constraint that matches an integer immediate operand not valid\
> + for AdvSIMD scalar operations in DImode."
> + (and (match_code "const_int")
> + (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
> + (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
> +
> (define_constraint "vgb"
> "@internal
> A constraint that matches an immediate offset valid for SVE LD1B
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2021-10-08 16:12 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-31 13:26 [PATCH]AArch64[RFC] Force complicated constant to memory when beneficial Tamar Christina
2021-10-08 16:12 ` Richard Sandiford
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).