* [PATCH]AArch64[RFC] Force complicated constant to memory when beneficial
@ 2021-08-31 13:26 Tamar Christina
  2021-10-08 16:12 ` Richard Sandiford
  0 siblings, 1 reply; 2+ messages in thread
From: Tamar Christina @ 2021-08-31 13:26 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

Hi All,

Consider the following case

#include <arm_neon.h>

uint64_t
test4 (uint8x16_t input)
{
    uint8x16_t bool_input = vshrq_n_u8(input, 7);
    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
				vgetq_lane_p64(mask, 0));
    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
    return vget_lane_u16((uint16x4_t)res, 3);
}

which generates (after my CSE patches):

test4:
	ushr	v0.16b, v0.16b, 7
	mov	x0, 16512
	movk	x0, 0x1020, lsl 16
	movk	x0, 0x408, lsl 32
	movk	x0, 0x102, lsl 48
	fmov	d1, x0
	pmull	v2.1q, v0.1d, v1.1d
	dup	v1.2d, v1.d[0]
	pmull2	v0.1q, v0.2d, v1.2d
	trn2	v2.8b, v2.8b, v0.8b
	umov	w0, v2.h[3]
	ret

which is suboptimal since the constant is never needed on the genreg side and
should have been materialized on the SIMD side instead: the constant is so big
that creating it any other way takes 5 instructions, 4 mov/movk and one fmov.
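For reference, the ideal code would materialize the constant directly on the
SIMD side from the constant pool, something like (a hand-written sketch, with
.LC0 standing in for a made-up literal-pool label):

	adrp	x0, .LC0
	ldr	d1, [x0, #:lo12:.LC0]

i.e. two instructions instead of the five-instruction mov/movk/fmov sequence
above.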

The problem is that the choice of which side to materialize the constant on
can only be made during reload.  We may need an extra register (to hold the
address), so it can't be done after reload.

I have tried to support this with a pattern during reload, but the problem is
that I can't find a way to tell reload it should spill a constant under
condition x.  Instead I tried a split which reload selects when the condition
holds.

This has a couple of issues:

1. The pattern can be expanded late (could be fixed with !reload_completed).
2. Because it's split so late we can't share the anchors for the ADRP.
3. Because it's split so late, reload doesn't know about the spill, and so the
   ADD lo12 isn't pushed into the addressing mode of the LDR (sketched below).

I don't know how to properly fix these, since I think the only way is for
reload to do the spill properly itself; but without this pattern reload avoids
the mem alternative and picks r <- n followed by r -> w instead.

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> w.
	* config/aarch64/constraints.md (Dx): New.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,  r,  r, w,r,w, w")
-	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w  ,r  ,r,w, m,m,  r,  r, w,r,w,w")
+	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
   "(register_operand (operands[0], DImode)
     || aarch64_reg_or_zero (operands[1], DImode))"
   "@
@@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
    mov\\t%x0, %1
    mov\\t%w0, %1
    #
+   #
    * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
    ldr\\t%x0, %1
    ldr\\t%d0, %1
@@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
    fmov\\t%d0, %d1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
    "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+    && REG_P (operands[0])
+    && (GP_REGNUM_P (REGNO (operands[0]))
+	|| (can_create_pseudo_p ()
+	    && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
    [(const_int 0)]
    "{
-       aarch64_expand_mov_immediate (operands[0], operands[1]);
+       if (GP_REGNUM_P (REGNO (operands[0])))
+	 aarch64_expand_mov_immediate (operands[0], operands[1]);
+       else
+	 {
+	   rtx mem = force_const_mem (DImode, operands[1]);
+	   gcc_assert (mem);
+	   emit_move_insn (operands[0], mem);
+	 }
        DONE;
     }"
   ;; The "mov_imm" type for CNTD is just a placeholder.
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
 		     load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
 		     neon_move")
-   (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
+   (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
 )
 
 (define_insn "insv_imm<mode>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -474,6 +474,14 @@ (define_address_constraint "Dp"
  An address valid for a prefetch instruction."
  (match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
 
+(define_constraint "Dx"
+  "@internal
+ A constraint that matches an integer immediate operand not valid\
+ for AdvSIMD scalar operations in DImode."
+ (and (match_code "const_int")
+      (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
+      (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
+
 (define_constraint "vgb"
   "@internal
    A constraint that matches an immediate offset valid for SVE LD1B



* Re: [PATCH]AArch64[RFC] Force complicated constant to memory when beneficial
  2021-08-31 13:26 [PATCH]AArch64[RFC] Force complicated constant to memory when beneficial Tamar Christina
@ 2021-10-08 16:12 ` Richard Sandiford
  0 siblings, 0 replies; 2+ messages in thread
From: Richard Sandiford @ 2021-10-08 16:12 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Catching up on backlog, sorry for the very late response:

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> Consider the following case
>
> #include <arm_neon.h>
>
> uint64_t
> test4 (uint8x16_t input)
> {
>     uint8x16_t bool_input = vshrq_n_u8(input, 7);
>     poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
>     poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> 				vgetq_lane_p64(mask, 0));
>     poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
>     uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
>     return vget_lane_u16((uint16x4_t)res, 3);
> }
>
> which generates (after my CSE patches):
>
> test4:
> 	ushr	v0.16b, v0.16b, 7
> 	mov	x0, 16512
> 	movk	x0, 0x1020, lsl 16
> 	movk	x0, 0x408, lsl 32
> 	movk	x0, 0x102, lsl 48
> 	fmov	d1, x0
> 	pmull	v2.1q, v0.1d, v1.1d
> 	dup	v1.2d, v1.d[0]
> 	pmull2	v0.1q, v0.2d, v1.2d
> 	trn2	v2.8b, v2.8b, v0.8b
> 	umov	w0, v2.h[3]
> 	ret
>
> which is suboptimal since the constant is never needed on the genreg side and
> should have been materialized on the SIMD side instead: the constant is so big
> that creating it any other way takes 5 instructions, 4 mov/movk and one fmov.
>
> The problem is that the choice of which side to materialize the constant on
> can only be made during reload.  We may need an extra register (to hold the
> address), so it can't be done after reload.
>
> I have tried to support this with a pattern during reload, but the problem is
> that I can't find a way to tell reload it should spill a constant under
> condition x.  Instead I tried a split which reload selects when the condition
> holds.

If this is still an issue, one thing to try would be to put a "$" before
the "r" in the GPR alternative.  If that doesn't work then yeah,
I think we're out of luck describing this directly.  If "$" does work,
it'd be interesting to see whether "^" does too.

Thanks,
Richard

>
> This has a couple of issues:
>
> 1. The pattern can be expanded late (could be fixed with !reload_completed).
> 2. Because it's split so late we can't share the anchors for the ADRP.
> 3. Because it's split so late, reload doesn't know about the spill, and so the
>    ADD lo12 isn't pushed into the addressing mode of the LDR.
>
> I don't know how to properly fix these, since I think the only way is for
> reload to do the spill properly itself; but without this pattern reload avoids
> the mem alternative and picks r <- n followed by r -> w instead.
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> w.
> 	* config/aarch64/constraints.md (Dx): New.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
>  )
>  
>  (define_insn_and_split "*movdi_aarch64"
> -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,  r,  r, w,r,w, w")
> -	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w  ,r  ,r,w, m,m,  r,  r, w,r,w,w")
> +	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
>    "(register_operand (operands[0], DImode)
>      || aarch64_reg_or_zero (operands[1], DImode))"
>    "@
> @@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
>     mov\\t%x0, %1
>     mov\\t%w0, %1
>     #
> +   #
>     * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
>     ldr\\t%x0, %1
>     ldr\\t%d0, %1
> @@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
>     fmov\\t%d0, %d1
>     * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
>     "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
> -    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> +    && REG_P (operands[0])
> +    && (GP_REGNUM_P (REGNO (operands[0]))
> +	|| (can_create_pseudo_p ()
> +	    && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
>     [(const_int 0)]
>     "{
> -       aarch64_expand_mov_immediate (operands[0], operands[1]);
> +       if (GP_REGNUM_P (REGNO (operands[0])))
> +	 aarch64_expand_mov_immediate (operands[0], operands[1]);
> +       else
> +	 {
> +	   rtx mem = force_const_mem (DImode, operands[1]);
> +	   gcc_assert (mem);
> +	   emit_move_insn (operands[0], mem);
> +	 }
>         DONE;
>      }"
>    ;; The "mov_imm" type for CNTD is just a placeholder.
> -  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
> +  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
>  		     load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
>  		     neon_move")
> -   (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
> +   (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
>  )
>  
>  (define_insn "insv_imm<mode>"
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -474,6 +474,14 @@ (define_address_constraint "Dp"
>   An address valid for a prefetch instruction."
>   (match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
>  
> +(define_constraint "Dx"
> +  "@internal
> + A constraint that matches an integer immediate operand not valid\
> + for AdvSIMD scalar operations in DImode."
> + (and (match_code "const_int")
> +      (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
> +      (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
> +
>  (define_constraint "vgb"
>    "@internal
>     A constraint that matches an immediate offset valid for SVE LD1B
