public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
@ 2017-06-07 11:38 Tamar Christina
  2017-06-12  7:31 ` Tamar Christina
  2017-06-14  8:43 ` James Greenhalgh
  0 siblings, 2 replies; 16+ messages in thread
From: Tamar Christina @ 2017-06-07 11:38 UTC (permalink / raw)
  To: GCC Patches; +Cc: nd, James Greenhalgh, Marcus Shawcroft, Richard Earnshaw

[-- Attachment #1: Type: text/plain, Size: 868 bytes --]

Hi All, 


This patch adds support for creating floating point constants
using mov immediate instructions.  The movi SIMD instruction can
be used for HFmode and SFmode constants, eg. for -0.0f we generate:

        movi     v0.2s, 0x80, lsl 24

More complex constants can be generated using an integer MOV or
MOV+MOVK:

     mov   w0, 48128
     movk  w0, 0x47f0, lsl 16
     fmov  s0, w0

We allow up to 3 instructions as this allows all HF, SF and most DF
constants to be generated without a literal load, and is overall best
for codesize.


Regression tested on aarch64-none-linux-gnu and no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  <tamar.christina@arm.com>

	* config/aarch64/aarch64.md (mov<mode>): Generalize.
	(*movhf_aarch64, *movsf_aarch64, *movdf_aarch64):
	Add integer and movi cases.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: float-move-2.patch --]
[-- Type: text/x-patch; name="float-move-2.patch", Size: 5625 bytes --]

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5adc5edb8dde9c30450b04932a37c41f84cc5ed1..7f107672882b13809be01355ffafbc2807cc5adb 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1167,66 +1167,120 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,m,w,m,rY,r"))]
+(define_insn_and_split "*movhf_aarch64"
+  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r"))]
   "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+    || aarch64_reg_or_fp_float (operands[1], HFmode))"
   "@
    movi\\t%0.4h, #0
-   mov\\t%0.h[0], %w1
+   fmov\\t%s0, %w1
    umov\\t%w0, %1.h[0]
    mov\\t%0.h[0], %1.h[0]
+   fmov\\t%s0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
    ldr\\t%h0, %1
    str\\t%h1, %0
    ldrh\\t%w0, %1
    strh\\t%w1, %0
    mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,neon_from_gp,neon_to_gp,neon_move,\
-                     f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
+  "&& can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  "{
+    unsigned HOST_WIDE_INT ival;
+    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+      FAIL;
+
+    rtx tmp = gen_reg_rtx (SImode);
+    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
+    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
+    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
+    DONE;
+  }"
+  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts, \
+		     neon_move,f_loads,f_stores,load1,store1,mov_reg")
+   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
 )
 
-(define_insn "*movsf_aarch64"
-  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+(define_insn_and_split "*movsf_aarch64"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
   "TARGET_FLOAT && (register_operand (operands[0], SFmode)
-    || aarch64_reg_or_fp_zero (operands[1], SFmode))"
+    || aarch64_reg_or_fp_float (operands[1], SFmode))"
   "@
    movi\\t%0.2s, #0
    fmov\\t%s0, %w1
    fmov\\t%w0, %s1
    fmov\\t%s0, %s1
    fmov\\t%s0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
    ldr\\t%s0, %1
    str\\t%s1, %0
    ldr\\t%w0, %1
    str\\t%w1, %0
-   mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,\
-                     f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+   mov\\t%w0, %w1
+   mov\\t%w0, %1"
+  "&& can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], SFmode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  "{
+    unsigned HOST_WIDE_INT ival;
+    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+      FAIL;
+
+    rtx tmp = gen_reg_rtx (SImode);
+    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
+    emit_move_insn (operands[0], gen_lowpart (SFmode, tmp));
+    DONE;
+  }"
+  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,neon_move,\
+		     f_loads,f_stores,load1,store1,mov_reg,\
+		     fconsts")
+   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
 )
 
-(define_insn "*movdf_aarch64"
-  [(set (match_operand:DF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:DF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+(define_insn_and_split "*movdf_aarch64"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:DF 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
   "TARGET_FLOAT && (register_operand (operands[0], DFmode)
-    || aarch64_reg_or_fp_zero (operands[1], DFmode))"
+    || aarch64_reg_or_fp_float (operands[1], DFmode))"
   "@
    movi\\t%d0, #0
    fmov\\t%d0, %x1
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
    fmov\\t%d0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);
    ldr\\t%d0, %1
    str\\t%d1, %0
    ldr\\t%x0, %1
    str\\t%x1, %0
-   mov\\t%x0, %x1"
-  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,\
-                     f_loadd,f_stored,load1,store1,mov_reg")
-   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+   mov\\t%x0, %x1
+   mov\\t%x0, %1"
+  "&& can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], DFmode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  "{
+    unsigned HOST_WIDE_INT ival;
+    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+      FAIL;
+
+    rtx tmp = gen_reg_rtx (DImode);
+    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
+    emit_move_insn (operands[0], gen_lowpart (DFmode, tmp));
+    DONE;
+  }"
+  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
+		     f_loadd,f_stored,load1,store1,mov_reg,\
+		     fconstd")
+   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
 )
 
 (define_insn "*movtf_aarch64"

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-07 11:38 [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode Tamar Christina
@ 2017-06-12  7:31 ` Tamar Christina
  2017-06-14 10:06   ` Richard Sandiford
  2017-06-14  8:43 ` James Greenhalgh
  1 sibling, 1 reply; 16+ messages in thread
From: Tamar Christina @ 2017-06-12  7:31 UTC (permalink / raw)
  To: GCC Patches; +Cc: nd, James Greenhalgh, Marcus Shawcroft, Richard Earnshaw

[-- Attachment #1: Type: text/plain, Size: 1365 bytes --]

Hi All,

Updating this patch with the feedback I've received from patch 1/4.

Thanks,
Tamar
________________________________________
From: gcc-patches-owner@gcc.gnu.org <gcc-patches-owner@gcc.gnu.org> on behalf of Tamar Christina <Tamar.Christina@arm.com>
Sent: Wednesday, June 7, 2017 12:38:37 PM
To: GCC Patches
Cc: nd; James Greenhalgh; Marcus Shawcroft; Richard Earnshaw
Subject: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.

Hi All,


This patch adds support for creating floating point constants
using mov immediate instructions.  The movi SIMD instruction can
be used for HFmode and SFmode constants, eg. for -0.0f we generate:

        movi     v0.2s, 0x80, lsl 24

More complex constants can be generated using an integer MOV or
MOV+MOVK:

     mov   w0, 48128
     movk  w0, 0x47f0, lsl 16
     fmov  s0, w0

We allow up to 3 instructions as this allows all HF, SF and most DF
constants to be generated without a literal load, and is overall best
for codesize.


Regression tested on aarch64-none-linux-gnu and no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  <tamar.christina@arm.com>

        * config/aarch64/aarch64.md (mov<mode>): Generalize.
        (*movhf_aarch64, *movsf_aarch64, *movdf_aarch64):
        Add integer and movi cases.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: float-move-2-r2.patch --]
[-- Type: text/x-patch; name="float-move-2-r2.patch", Size: 5664 bytes --]

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5adc5edb8dde9c30450b04932a37c41f84cc5ed1..62ad76731d2c5b4b3d02def8c2b1457c713c2d8e 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1167,66 +1167,120 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,m,w,m,rY,r"))]
+(define_insn_and_split "*movhf_aarch64"
+  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r"))]
   "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+    || aarch64_reg_or_fp_float (operands[1], HFmode))"
   "@
    movi\\t%0.4h, #0
-   mov\\t%0.h[0], %w1
+   fmov\\t%s0, %w1
    umov\\t%w0, %1.h[0]
    mov\\t%0.h[0], %1.h[0]
+   fmov\\t%s0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
    ldr\\t%h0, %1
    str\\t%h1, %0
    ldrh\\t%w0, %1
    strh\\t%w1, %0
    mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,neon_from_gp,neon_to_gp,neon_move,\
-                     f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
+  "&& can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  "{
+    unsigned HOST_WIDE_INT ival;
+    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+      FAIL;
+
+    rtx tmp = gen_reg_rtx (SImode);
+    aarch64_expand_mov_immediate (tmp, gen_int_mode (ival, SImode));
+    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
+    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
+    DONE;
+  }"
+  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts, \
+		     neon_move,f_loads,f_stores,load1,store1,mov_reg")
+   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
 )
 
-(define_insn "*movsf_aarch64"
-  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+(define_insn_and_split "*movsf_aarch64"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
   "TARGET_FLOAT && (register_operand (operands[0], SFmode)
-    || aarch64_reg_or_fp_zero (operands[1], SFmode))"
+    || aarch64_reg_or_fp_float (operands[1], SFmode))"
   "@
    movi\\t%0.2s, #0
    fmov\\t%s0, %w1
    fmov\\t%w0, %s1
    fmov\\t%s0, %s1
    fmov\\t%s0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
    ldr\\t%s0, %1
    str\\t%s1, %0
    ldr\\t%w0, %1
    str\\t%w1, %0
-   mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,\
-                     f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+   mov\\t%w0, %w1
+   mov\\t%w0, %1"
+  "&& can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], SFmode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  "{
+    unsigned HOST_WIDE_INT ival;
+    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+      FAIL;
+
+    rtx tmp = gen_reg_rtx (SImode);
+    aarch64_expand_mov_immediate (tmp, gen_int_mode (ival, SImode));
+    emit_move_insn (operands[0], gen_lowpart (SFmode, tmp));
+    DONE;
+  }"
+  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,neon_move,\
+		     f_loads,f_stores,load1,store1,mov_reg,\
+		     fconsts")
+   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
 )
 
-(define_insn "*movdf_aarch64"
-  [(set (match_operand:DF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:DF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+(define_insn_and_split "*movdf_aarch64"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:DF 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
   "TARGET_FLOAT && (register_operand (operands[0], DFmode)
-    || aarch64_reg_or_fp_zero (operands[1], DFmode))"
+    || aarch64_reg_or_fp_float (operands[1], DFmode))"
   "@
    movi\\t%d0, #0
    fmov\\t%d0, %x1
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
    fmov\\t%d0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);
    ldr\\t%d0, %1
    str\\t%d1, %0
    ldr\\t%x0, %1
    str\\t%x1, %0
-   mov\\t%x0, %x1"
-  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,\
-                     f_loadd,f_stored,load1,store1,mov_reg")
-   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+   mov\\t%x0, %x1
+   mov\\t%x0, %1"
+  "&& can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], DFmode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  "{
+    unsigned HOST_WIDE_INT ival;
+    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+      FAIL;
+
+    rtx tmp = gen_reg_rtx (DImode);
+    aarch64_expand_mov_immediate (tmp, gen_int_mode (ival, DImode));
+    emit_move_insn (operands[0], gen_lowpart (DFmode, tmp));
+    DONE;
+  }"
+  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
+		     f_loadd,f_stored,load1,store1,mov_reg,\
+		     fconstd")
+   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
 )
 
 (define_insn "*movtf_aarch64"

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-07 11:38 [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode Tamar Christina
  2017-06-12  7:31 ` Tamar Christina
@ 2017-06-14  8:43 ` James Greenhalgh
  2017-06-21 10:48   ` Tamar Christina
  1 sibling, 1 reply; 16+ messages in thread
From: James Greenhalgh @ 2017-06-14  8:43 UTC (permalink / raw)
  To: Tamar Christina; +Cc: GCC Patches, nd, Marcus Shawcroft, Richard Earnshaw

On Wed, Jun 07, 2017 at 12:38:37PM +0100, Tamar Christina wrote:
> Hi All, 
> 
> 
> This patch adds support for creating floating point constants
> using mov immediate instructions.  The movi SIMD instruction can
> be used for HFmode and SFmode constants, eg. for -0.0f we generate:
> 
>         movi     v0.2s, 0x80, lsl 24
> 
> More complex constants can be generated using an integer MOV or
> MOV+MOVK:
> 
>      mov   w0, 48128
>      movk  w0, 0x47f0, lsl 16
>      fmov  s0, w0
> 
> We allow up to 3 instructions as this allows all HF, SF and most DF
> constants to be generated without a literal load, and is overall best
> for codesize.
> 
> 
> Regression tested on aarch64-none-linux-gnu and no regressions.
> 
> OK for trunk?
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 5adc5edb8dde9c30450b04932a37c41f84cc5ed1..7f107672882b13809be01355ffafbc2807cc5adb 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1167,66 +1167,120 @@
>    }
>  )
>  
> -(define_insn "*movhf_aarch64"
> -  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w,m,r,m ,r")
> -	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,m,w,m,rY,r"))]
> +(define_insn_and_split "*movhf_aarch64"
> +  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r")
> +	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r"))]
>    "TARGET_FLOAT && (register_operand (operands[0], HFmode)
> -    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
> +    || aarch64_reg_or_fp_float (operands[1], HFmode))"
>    "@
>     movi\\t%0.4h, #0
> -   mov\\t%0.h[0], %w1
> +   fmov\\t%s0, %w1

Should this not be %h0?

>     umov\\t%w0, %1.h[0]
>     mov\\t%0.h[0], %1.h[0]
> +   fmov\\t%s0, %1

Likewise, and much more important for correctness as it changes the way
the bit pattern ends up in the register (see table C2-1 in release B.a of
the ARM Architecture Reference Manual for ARMv8-A), here.

> +   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
>     ldr\\t%h0, %1
>     str\\t%h1, %0
>     ldrh\\t%w0, %1
>     strh\\t%w1, %0
>     mov\\t%w0, %w1"
> -  [(set_attr "type" "neon_move,neon_from_gp,neon_to_gp,neon_move,\
> -                     f_loads,f_stores,load1,store1,mov_reg")
> -   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
> +  "&& can_create_pseudo_p ()
> +   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
> +   && !aarch64_float_const_representable_p (operands[1])
> +   &&  aarch64_float_const_rtx_p (operands[1])"
> +  [(const_int 0)]
> +  "{
> +    unsigned HOST_WIDE_INT ival;
> +    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> +      FAIL;
> +
> +    rtx tmp = gen_reg_rtx (SImode);
> +    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
> +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
> +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
> +    DONE;
> +  }"
> +  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts, \
> +		     neon_move,f_loads,f_stores,load1,store1,mov_reg")
> +   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
>  )

Thanks,
James

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-12  7:31 ` Tamar Christina
@ 2017-06-14 10:06   ` Richard Sandiford
  2017-06-15 13:25     ` Tamar Christina
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Sandiford @ 2017-06-14 10:06 UTC (permalink / raw)
  To: Tamar Christina
  Cc: GCC Patches, nd, James Greenhalgh, Marcus Shawcroft, Richard Earnshaw

In addition to James's comments:

Tamar Christina <Tamar.Christina@arm.com> writes:
> +  [(const_int 0)]
> +  "{

"{ ... }" isn't necessary, we can just use { ... }

> +    unsigned HOST_WIDE_INT ival;
> +    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> +      FAIL;
> +
> +    rtx tmp = gen_reg_rtx (SImode);
> +    aarch64_expand_mov_immediate (tmp, gen_int_mode (ival, SImode));
> +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);

This looks wrong for big-endian, and...

> +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));

...either it should be OK to go directly from tmp to the HFmode lowpart,
or we should move the HImode temporary into a fresh REG.  Current
validate_subreg seems to suggest that we need the latter.

Isn't it possible to use a HImode move immediate instead of an SImode one?

Thanks,
Richard

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-14 10:06   ` Richard Sandiford
@ 2017-06-15 13:25     ` Tamar Christina
  2017-06-15 14:28       ` Tamar Christina
  0 siblings, 1 reply; 16+ messages in thread
From: Tamar Christina @ 2017-06-15 13:25 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: GCC Patches, nd, James Greenhalgh, Marcus Shawcroft, Richard Earnshaw

Hi Richard,

> > +    rtx tmp = gen_reg_rtx (SImode);
> > +    aarch64_expand_mov_immediate (tmp, gen_int_mode (ival, SImode));
> > +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
> 
> This looks wrong for big-endian, and...
> 
> > +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
> 
> ...either it should be OK to go directly from tmp to the HFmode lowpart, or
> we should move the HImode temporary into a fresh REG.  Current
> validate_subreg seems to suggest that we need the latter.
> 
> Isn't it possible to use a HImode move immediate instead of an SImode one?

We don't really have a movhi pattern; currently a movhi would end up in the general
mov<mode>_aarch64 pattern, which would then end up using a w register as well.

I'll take a look at big-endian. 

> 
> Thanks,
> Richard

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-15 13:25     ` Tamar Christina
@ 2017-06-15 14:28       ` Tamar Christina
  2017-06-16  7:53         ` Richard Sandiford
  0 siblings, 1 reply; 16+ messages in thread
From: Tamar Christina @ 2017-06-15 14:28 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: GCC Patches, nd, James Greenhalgh, Marcus Shawcroft, Richard Earnshaw

Hi Richard,
 
> > > +    rtx tmp = gen_reg_rtx (SImode);
> > > +    aarch64_expand_mov_immediate (tmp, gen_int_mode (ival,
> SImode));
> > > +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
> >
> > This looks wrong for big-endian, and...
> >
> > > +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
> >
> > ...either it should be OK to go directly from tmp to the HFmode
> > lowpart, or we should move the HImode temporary into a fresh REG.
> > Current validate_subreg seems to suggest that we need the latter.
> >
> > Isn't it possible to use a HImode move immediate instead of an SImode
> one?
> 
> We don't really have a movhi pattern; currently a movhi would end up in
> the general
> mov<mode>_aarch64 pattern, which would then end up using a w
> register as well.

Also aarch64_expand_mov_immediate doesn't allow HImode moves, only SI and DI.

> 
> I'll take a look at big-endian.
> 
> >
> > Thanks,
> > Richard

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-15 14:28       ` Tamar Christina
@ 2017-06-16  7:53         ` Richard Sandiford
  2017-06-16  8:42           ` Tamar Christina
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Sandiford @ 2017-06-16  7:53 UTC (permalink / raw)
  To: Tamar Christina
  Cc: GCC Patches, nd, James Greenhalgh, Marcus Shawcroft, Richard Earnshaw

Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi Richard,
>> > > +    rtx tmp = gen_reg_rtx (SImode);
>> > > +    aarch64_expand_mov_immediate (tmp, gen_int_mode (ival,
>> SImode));
>> > > +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
>> >
>> > This looks wrong for big-endian, and...
>> >
>> > > +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
>> >
>> > ...either it should be OK to go directly from tmp to the HFmode
>> > lowpart, or we should move the HImode temporary into a fresh REG.
>> > Current validate_subreg seems to suggest that we need the latter.
>> >
>> > Isn't it possible to use a HImode move immediate instead of an SImode
>> one?
>> 
>> We don't really have a movhi pattern; currently a movhi would end up
>> in the general mov<mode>_aarch64 pattern

movqi and movhi patterns are defined from the same mov<mode> template,
but they're still "proper" move patterns.

>> which would then use end up using a w register as well.

Isn't that what you want though?  f16_mov_immediate_1.c is testing for:

/* { dg-final { scan-assembler-times "mov\tw\[0-9\]+, #?19520"           3 } } */

> Also aarch64_expand_mov_immediate doesn't allow HImode moves, only SI and DI.

It doesn't need to, because all HImode CONST_INTs are already legitimate.
You can just use emit_move_insn instead.

FWIW, the following seems to pass the same tests and avoids the subreg
dance.  Just a proof of concept, and I'm not attached to the new
iterator name.

Thanks,
Richard


Index: gcc/gcc/config/aarch64/aarch64.md
===================================================================
--- gcc.orig/gcc/config/aarch64/aarch64.md
+++ gcc/gcc/config/aarch64/aarch64.md
@@ -1063,7 +1063,28 @@
   }
 )
 
-(define_insn_and_split "*movhf_aarch64"
+(define_split
+  [(set (match_operand:GPF_MOV_F16 0 "nonimmediate_operand")
+        (match_operand:GPF_MOV_F16 1 "immediate_operand"))]
+  "TARGET_FLOAT
+   && can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
+   && !aarch64_float_const_representable_p (operands[1])
+   && aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  {
+    unsigned HOST_WIDE_INT ival;
+    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+      FAIL;
+
+    rtx tmp = gen_reg_rtx (<FCVT_TARGET>mode);
+    emit_move_insn (tmp, gen_int_mode (ival, <FCVT_TARGET>mode));
+    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
+    DONE;
+  }
+)
+
+(define_insn "*movhf_aarch64"
   [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r")
 	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r"))]
   "TARGET_FLOAT && (register_operand (operands[0], HFmode)
@@ -1080,28 +1101,12 @@
    ldrh\\t%w0, %1
    strh\\t%w1, %0
    mov\\t%w0, %w1"
-  "&& can_create_pseudo_p ()
-   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
-   && !aarch64_float_const_representable_p (operands[1])
-   &&  aarch64_float_const_rtx_p (operands[1])"
-  [(const_int 0)]
-  "{
-    unsigned HOST_WIDE_INT ival;
-    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
-      FAIL;
-
-    rtx tmp = gen_reg_rtx (SImode);
-    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
-    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
-    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
-    DONE;
-  }"
   [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts, \
 		     neon_move,f_loads,f_stores,load1,store1,mov_reg")
    (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
 )
 
-(define_insn_and_split "*movsf_aarch64"
+(define_insn "*movsf_aarch64"
   [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
 	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
   "TARGET_FLOAT && (register_operand (operands[0], SFmode)
@@ -1119,28 +1124,13 @@
    str\\t%w1, %0
    mov\\t%w0, %w1
    mov\\t%w0, %1"
-  "&& can_create_pseudo_p ()
-   && !aarch64_can_const_movi_rtx_p (operands[1], SFmode)
-   && !aarch64_float_const_representable_p (operands[1])
-   &&  aarch64_float_const_rtx_p (operands[1])"
-  [(const_int 0)]
-  "{
-    unsigned HOST_WIDE_INT ival;
-    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
-      FAIL;
-
-    rtx tmp = gen_reg_rtx (SImode);
-    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
-    emit_move_insn (operands[0], gen_lowpart (SFmode, tmp));
-    DONE;
-  }"
   [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,neon_move,\
 		     f_loads,f_stores,load1,store1,mov_reg,\
 		     fconsts")
    (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
 )
 
-(define_insn_and_split "*movdf_aarch64"
+(define_insn "*movdf_aarch64"
   [(set (match_operand:DF 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
 	(match_operand:DF 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
   "TARGET_FLOAT && (register_operand (operands[0], DFmode)
@@ -1158,21 +1148,6 @@
    str\\t%x1, %0
    mov\\t%x0, %x1
    mov\\t%x0, %1"
-  "&& can_create_pseudo_p ()
-   && !aarch64_can_const_movi_rtx_p (operands[1], DFmode)
-   && !aarch64_float_const_representable_p (operands[1])
-   &&  aarch64_float_const_rtx_p (operands[1])"
-  [(const_int 0)]
-  "{
-    unsigned HOST_WIDE_INT ival;
-    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
-      FAIL;
-
-    rtx tmp = gen_reg_rtx (DImode);
-    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
-    emit_move_insn (operands[0], gen_lowpart (DFmode, tmp));
-    DONE;
-  }"
   [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
 		     f_loadd,f_stored,load1,store1,mov_reg,\
 		     fconstd")
Index: gcc/gcc/config/aarch64/iterators.md
===================================================================
--- gcc.orig/gcc/config/aarch64/iterators.md
+++ gcc/gcc/config/aarch64/iterators.md
@@ -44,6 +44,10 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_F16 [(HF "AARCH64_ISA_F16") SF DF])
 
+;; Iterator for all scalar floating point modes (HF, SF, DF), without
+;; requiring AARCH64_ISA_F16 for HF.
+(define_mode_iterator GPF_MOV_F16 [HF SF DF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-16  7:53         ` Richard Sandiford
@ 2017-06-16  8:42           ` Tamar Christina
  0 siblings, 0 replies; 16+ messages in thread
From: Tamar Christina @ 2017-06-16  8:42 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: GCC Patches, nd, James Greenhalgh, Marcus Shawcroft, Richard Earnshaw

> 
> It doesn't need to, because all HImode CONST_INTs are already legitimate.
> You can just use emit_move_insn instead.
> 

Ah right, that's true.

> FWIW, the following seems to pass the same tests and avoids the subreg
> dance.  Just a proof of concept, and I'm not attached to the new iterator
> name.

Ah thanks! that is a bit simpler. I'll take a similar approach. 

> Thanks,
> Richard
> 
> 
> Index: gcc/gcc/config/aarch64/aarch64.md
> ==========================================================
> =========
> --- gcc.orig/gcc/config/aarch64/aarch64.md
> +++ gcc/gcc/config/aarch64/aarch64.md
> @@ -1063,7 +1063,28 @@
>    }
>  )
> 
> -(define_insn_and_split "*movhf_aarch64"
> +(define_split
> +  [(set (match_operand:GPF_MOV_F16 0 "nonimmediate_operand")
> +        (match_operand:GPF_MOV_F16 1 "immediate_operand"))]
> +  "TARGET_FLOAT
> +   && can_create_pseudo_p ()
> +   && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
> +   && !aarch64_float_const_representable_p (operands[1])
> +   && aarch64_float_const_rtx_p (operands[1])"
> +  [(const_int 0)]
> +  {
> +    unsigned HOST_WIDE_INT ival;
> +    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> +      FAIL;
> +
> +    rtx tmp = gen_reg_rtx (<FCVT_TARGET>mode);
> +    emit_move_insn (tmp, gen_int_mode (ival, <FCVT_TARGET>mode));
> +    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
> +    DONE;
> +  }
> +)
> +
> +(define_insn "*movhf_aarch64"
>    [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w
> ,w,m,r,m ,r")
>  	(match_operand:HF 1 "general_operand"      "Y ,?rY,
> w,w,Ufc,Uvi,m,w,m,rY,r"))]
>    "TARGET_FLOAT && (register_operand (operands[0], HFmode) @@ -
> 1080,28 +1101,12 @@
>     ldrh\\t%w0, %1
>     strh\\t%w1, %0
>     mov\\t%w0, %w1"
> -  "&& can_create_pseudo_p ()
> -   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
> -   && !aarch64_float_const_representable_p (operands[1])
> -   &&  aarch64_float_const_rtx_p (operands[1])"
> -  [(const_int 0)]
> -  "{
> -    unsigned HOST_WIDE_INT ival;
> -    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> -      FAIL;
> -
> -    rtx tmp = gen_reg_rtx (SImode);
> -    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
> -    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
> -    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
> -    DONE;
> -  }"
>    [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts, \
>  		     neon_move,f_loads,f_stores,load1,store1,mov_reg")
>     (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
>  )
> 
> -(define_insn_and_split "*movsf_aarch64"
> +(define_insn "*movsf_aarch64"
>    [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w
> ,w,m,r,m ,r,r")
>  	(match_operand:SF 1 "general_operand"      "Y ,?rY,
> w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
>    "TARGET_FLOAT && (register_operand (operands[0], SFmode) @@ -
> 1119,28 +1124,13 @@
>     str\\t%w1, %0
>     mov\\t%w0, %w1
>     mov\\t%w0, %1"
> -  "&& can_create_pseudo_p ()
> -   && !aarch64_can_const_movi_rtx_p (operands[1], SFmode)
> -   && !aarch64_float_const_representable_p (operands[1])
> -   &&  aarch64_float_const_rtx_p (operands[1])"
> -  [(const_int 0)]
> -  "{
> -    unsigned HOST_WIDE_INT ival;
> -    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> -      FAIL;
> -
> -    rtx tmp = gen_reg_rtx (SImode);
> -    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
> -    emit_move_insn (operands[0], gen_lowpart (SFmode, tmp));
> -    DONE;
> -  }"
>    [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,neon_move,\
>  		     f_loads,f_stores,load1,store1,mov_reg,\
>  		     fconsts")
>     (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
>  )
> 
> -(define_insn_and_split "*movdf_aarch64"
> +(define_insn "*movdf_aarch64"
>    [(set (match_operand:DF 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w
> ,w,m,r,m ,r,r")
>  	(match_operand:DF 1 "general_operand"      "Y , ?rY,
> w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
>    "TARGET_FLOAT && (register_operand (operands[0], DFmode) @@ -
> 1158,21 +1148,6 @@
>     str\\t%x1, %0
>     mov\\t%x0, %x1
>     mov\\t%x0, %1"
> -  "&& can_create_pseudo_p ()
> -   && !aarch64_can_const_movi_rtx_p (operands[1], DFmode)
> -   && !aarch64_float_const_representable_p (operands[1])
> -   &&  aarch64_float_const_rtx_p (operands[1])"
> -  [(const_int 0)]
> -  "{
> -    unsigned HOST_WIDE_INT ival;
> -    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> -      FAIL;
> -
> -    rtx tmp = gen_reg_rtx (DImode);
> -    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
> -    emit_move_insn (operands[0], gen_lowpart (DFmode, tmp));
> -    DONE;
> -  }"
>    [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
>  		     f_loadd,f_stored,load1,store1,mov_reg,\
>  		     fconstd")
> Index: gcc/gcc/config/aarch64/iterators.md
> ==========================================================
> =========
> --- gcc.orig/gcc/config/aarch64/iterators.md
> +++ gcc/gcc/config/aarch64/iterators.md
> @@ -44,6 +44,10 @@
>  ;; Iterator for all scalar floating point modes (HF, SF, DF)
> (define_mode_iterator GPF_F16 [(HF "AARCH64_ISA_F16") SF DF])
> 
> +;; Iterator for all scalar floating point modes (HF, SF, DF), without
> +;; requiring AARCH64_ISA_F16 for HF.
> +(define_mode_iterator GPF_MOV_F16 [HF SF DF])
> +
>  ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
> (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-14  8:43 ` James Greenhalgh
@ 2017-06-21 10:48   ` Tamar Christina
  2017-06-26 10:50     ` Tamar Christina
  0 siblings, 1 reply; 16+ messages in thread
From: Tamar Christina @ 2017-06-21 10:48 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches, nd, Marcus Shawcroft, Richard Earnshaw

> >     movi\\t%0.4h, #0
> > -   mov\\t%0.h[0], %w1
> > +   fmov\\t%s0, %w1
> 
> Should this not be %h0?

The problem is that H registers are only available in ARMv8.2+,
and I'm not sure what to do about ARMv8.1 given your other feedback
pointing out that the bit patterns differ between how the value is
stored in s vs. h registers.

> 
> >     umov\\t%w0, %1.h[0]
> >     mov\\t%0.h[0], %1.h[0]
> > +   fmov\\t%s0, %1
> 
> Likewise, and much more important for correctness as it changes the way the
> bit pattern ends up in the register (see table C2-1 in release B.a of the ARM
> Architecture Reference Manual for ARMv8-A), here.
> 
> > +   * return aarch64_output_scalar_simd_mov_immediate (operands[1],
> > + SImode);
> >     ldr\\t%h0, %1
> >     str\\t%h1, %0
> >     ldrh\\t%w0, %1
> >     strh\\t%w1, %0
> >     mov\\t%w0, %w1"
> > -  [(set_attr "type"
> "neon_move,neon_from_gp,neon_to_gp,neon_move,\
> > -                     f_loads,f_stores,load1,store1,mov_reg")
> > -   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
> > +  "&& can_create_pseudo_p ()
> > +   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
> > +   && !aarch64_float_const_representable_p (operands[1])
> > +   &&  aarch64_float_const_rtx_p (operands[1])"
> > +  [(const_int 0)]
> > +  "{
> > +    unsigned HOST_WIDE_INT ival;
> > +    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> > +      FAIL;
> > +
> > +    rtx tmp = gen_reg_rtx (SImode);
> > +    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
> > +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
> > +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
> > +    DONE;
> > +  }"
> > +  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts,
> \
> > +		     neon_move,f_loads,f_stores,load1,store1,mov_reg")
> > +   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
> >  )
> 
> Thanks,
> James

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-21 10:48   ` Tamar Christina
@ 2017-06-26 10:50     ` Tamar Christina
  2017-07-03  6:12       ` Tamar Christina
                         ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Tamar Christina @ 2017-06-26 10:50 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches, nd, Marcus Shawcroft, Richard Earnshaw

[-- Attachment #1: Type: text/plain, Size: 2939 bytes --]

Hi all,

Here's the re-spun patch.
Aside from the grouping of the split patterns it now also uses h register for the fmov for HF when available,
otherwise it forces a literal load.

Regression tested on aarch64-none-linux-gnu and no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-26  Tamar Christina  <tamar.christina@arm.com>
            Richard Sandiford <richard.sandiford@linaro.org>

        * config/aarch64/aarch64.md (mov<mode>): Generalize.
        (*movhf_aarch64, *movsf_aarch64, *movdf_aarch64):
        Add integer and movi cases.
        (movi-split-hf-df-sf split, fp16): New.
        (enabled): Added TARGET_FP_F16INST.
        * config/aarch64/iterators.md (GPF_HF): New.
________________________________________
From: Tamar Christina
Sent: Wednesday, June 21, 2017 11:48:33 AM
To: James Greenhalgh
Cc: GCC Patches; nd; Marcus Shawcroft; Richard Earnshaw
Subject: RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.

> >     movi\\t%0.4h, #0
> > -   mov\\t%0.h[0], %w1
> > +   fmov\\t%s0, %w1
>
> Should this not be %h0?

The problem is that H registers are only available in ARMv8.2+,
and I'm not sure what to do about ARMv8.1 given your other feedback
pointing out that the bit patterns differ between how the value is
stored in s vs. h registers.

>
> >     umov\\t%w0, %1.h[0]
> >     mov\\t%0.h[0], %1.h[0]
> > +   fmov\\t%s0, %1
>
> Likewise, and much more important for correctness as it changes the way the
> bit pattern ends up in the register (see table C2-1 in release B.a of the ARM
> Architecture Reference Manual for ARMv8-A), here.
>
> > +   * return aarch64_output_scalar_simd_mov_immediate (operands[1],
> > + SImode);
> >     ldr\\t%h0, %1
> >     str\\t%h1, %0
> >     ldrh\\t%w0, %1
> >     strh\\t%w1, %0
> >     mov\\t%w0, %w1"
> > -  [(set_attr "type"
> "neon_move,neon_from_gp,neon_to_gp,neon_move,\
> > -                     f_loads,f_stores,load1,store1,mov_reg")
> > -   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
> > +  "&& can_create_pseudo_p ()
> > +   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
> > +   && !aarch64_float_const_representable_p (operands[1])
> > +   &&  aarch64_float_const_rtx_p (operands[1])"
> > +  [(const_int 0)]
> > +  "{
> > +    unsigned HOST_WIDE_INT ival;
> > +    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> > +      FAIL;
> > +
> > +    rtx tmp = gen_reg_rtx (SImode);
> > +    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
> > +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
> > +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
> > +    DONE;
> > +  }"
> > +  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts,
> \
> > +                neon_move,f_loads,f_stores,load1,store1,mov_reg")
> > +   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
> >  )
>
> Thanks,
> James


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: float-mov_modes-2.patch --]
[-- Type: text/x-patch; name="float-mov_modes-2.patch", Size: 6529 bytes --]

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1a721bfbe42270ec75268b6e2366290aa6ad2134..c951efc383c17ea81800e482e39760eb17830c0a 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -181,6 +181,11 @@
 ;; will be disabled when !TARGET_FLOAT.
 (define_attr "fp" "no,yes" (const_string "no"))
 
+;; Attribute that specifies whether or not the instruction touches half
+;; precision fp registers.  When this is set to yes for an alternative,
+;; that alternative will be disabled when !TARGET_FP_F16INST.
+(define_attr "fp16" "no,yes" (const_string "no"))
+
 ;; Attribute that specifies whether or not the instruction touches simd
 ;; registers.  When this is set to yes for an alternative, that alternative
 ;; will be disabled when !TARGET_SIMD.
@@ -194,11 +199,14 @@
 ;; registers when -mgeneral-regs-only is specified.
 (define_attr "enabled" "no,yes"
   (cond [(ior
-	(and (eq_attr "fp" "yes")
-	     (eq (symbol_ref "TARGET_FLOAT") (const_int 0)))
-	(and (eq_attr "simd" "yes")
-	     (eq (symbol_ref "TARGET_SIMD") (const_int 0))))
-	     (const_string "no")
+	    (ior
+		(and (eq_attr "fp" "yes")
+		     (eq (symbol_ref "TARGET_FLOAT") (const_int 0)))
+		(and (eq_attr "simd" "yes")
+		     (eq (symbol_ref "TARGET_SIMD") (const_int 0))))
+	    (and (eq_attr "fp16" "yes")
+		 (eq (symbol_ref "TARGET_FP_F16INST") (const_int 0))))
+	    (const_string "no")
 	] (const_string "yes")))
 
 ;; Attribute that specifies whether we are dealing with a branch to a
@@ -1062,65 +1070,94 @@
 )
 
 (define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,m,w,m,rY,r"))]
+  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r"))]
   "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+    || aarch64_reg_or_fp_float (operands[1], HFmode))"
   "@
    movi\\t%0.4h, #0
-   mov\\t%0.h[0], %w1
+   fmov\\t%h0, %w1
    umov\\t%w0, %1.h[0]
    mov\\t%0.h[0], %1.h[0]
+   fmov\\t%h0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
    ldr\\t%h0, %1
    str\\t%h1, %0
    ldrh\\t%w0, %1
    strh\\t%w1, %0
    mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,neon_from_gp,neon_to_gp,neon_move,\
-                     f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
+  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts, \
+		     neon_move,f_loads,f_stores,load1,store1,mov_reg")
+   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")
+   (set_attr "fp16"   "*,yes,*,*,yes,*,*,*,*,*,*")]
 )
 
 (define_insn "*movsf_aarch64"
-  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
   "TARGET_FLOAT && (register_operand (operands[0], SFmode)
-    || aarch64_reg_or_fp_zero (operands[1], SFmode))"
+    || aarch64_reg_or_fp_float (operands[1], SFmode))"
   "@
    movi\\t%0.2s, #0
    fmov\\t%s0, %w1
    fmov\\t%w0, %s1
    fmov\\t%s0, %s1
    fmov\\t%s0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
    ldr\\t%s0, %1
    str\\t%s1, %0
    ldr\\t%w0, %1
    str\\t%w1, %0
-   mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,\
-                     f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+   mov\\t%w0, %w1
+   mov\\t%w0, %1"
+  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,neon_move,\
+		     f_loads,f_stores,load1,store1,mov_reg,\
+		     fconsts")
+   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
 )
 
 (define_insn "*movdf_aarch64"
-  [(set (match_operand:DF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:DF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:DF 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
   "TARGET_FLOAT && (register_operand (operands[0], DFmode)
-    || aarch64_reg_or_fp_zero (operands[1], DFmode))"
+    || aarch64_reg_or_fp_float (operands[1], DFmode))"
   "@
    movi\\t%d0, #0
    fmov\\t%d0, %x1
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
    fmov\\t%d0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);
    ldr\\t%d0, %1
    str\\t%d1, %0
    ldr\\t%x0, %1
    str\\t%x1, %0
-   mov\\t%x0, %x1"
-  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,\
-                     f_loadd,f_stored,load1,store1,mov_reg")
-   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+   mov\\t%x0, %x1
+   mov\\t%x0, %1"
+  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
+		     f_loadd,f_stored,load1,store1,mov_reg,\
+		     fconstd")
+   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
+)
+
+(define_split
+  [(set (match_operand:GPF_HF 0 "nonimmediate_operand")
+	(match_operand:GPF_HF 1 "general_operand"))]
+  "can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  {
+    unsigned HOST_WIDE_INT ival;
+    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+      FAIL;
+
+    rtx tmp = gen_reg_rtx (<FCVT_TARGET>mode);
+    emit_move_insn (tmp, gen_int_mode (ival, <FCVT_TARGET>mode));
+    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
+    DONE;
+  }
 )
 
 (define_insn "*movtf_aarch64"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 43be7fd361116d305b0300a814ffca82a9c44a88..067cef785331a243632715e0d0fe35a314eeebff 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -44,6 +44,9 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_F16 [(HF "AARCH64_ISA_F16") SF DF])
 
+;; Iterator for all scalar floating point modes (HF, SF, DF)
+(define_mode_iterator GPF_HF [HF SF DF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-26 10:50     ` Tamar Christina
@ 2017-07-03  6:12       ` Tamar Christina
  2017-07-10  7:35         ` Tamar Christina
  2017-07-27 16:09       ` James Greenhalgh
  2017-08-01 11:47       ` Bin.Cheng
  2 siblings, 1 reply; 16+ messages in thread
From: Tamar Christina @ 2017-07-03  6:12 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches, nd, Marcus Shawcroft, Richard Earnshaw

Ping
________________________________________
From: gcc-patches-owner@gcc.gnu.org <gcc-patches-owner@gcc.gnu.org> on behalf of Tamar Christina <Tamar.Christina@arm.com>
Sent: Monday, June 26, 2017 11:50:51 AM
To: James Greenhalgh
Cc: GCC Patches; nd; Marcus Shawcroft; Richard Earnshaw
Subject: Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.

Hi all,

Here's the re-spun patch.
Aside from the grouping of the split patterns it now also uses h register for the fmov for HF when available,
otherwise it forces a literal load.

Regression tested on aarch64-none-linux-gnu and no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-26  Tamar Christina  <tamar.christina@arm.com>
            Richard Sandiford <richard.sandiford@linaro.org>

        * config/aarch64/aarch64.md (mov<mode>): Generalize.
        (*movhf_aarch64, *movsf_aarch64, *movdf_aarch64):
        Add integer and movi cases.
        (movi-split-hf-df-sf split, fp16): New.
        (enabled): Added TARGET_FP_F16INST.
        * config/aarch64/iterators.md (GPF_HF): New.
________________________________________
From: Tamar Christina
Sent: Wednesday, June 21, 2017 11:48:33 AM
To: James Greenhalgh
Cc: GCC Patches; nd; Marcus Shawcroft; Richard Earnshaw
Subject: RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.

> >     movi\\t%0.4h, #0
> > -   mov\\t%0.h[0], %w1
> > +   fmov\\t%s0, %w1
>
> Should this not be %h0?

The problem is that H registers are only available in ARMv8.2+,
I'm not sure what to do about ARMv8.1 given your other feedback
Pointing out that the bit patterns between how it's stored in s vs h registers
differ.

>
> >     umov\\t%w0, %1.h[0]
> >     mov\\t%0.h[0], %1.h[0]
> > +   fmov\\t%s0, %1
>
> Likewise, and much more important for correctness as it changes the way the
> bit pattern ends up in the register (see table C2-1 in release B.a of the ARM
> Architecture Reference Manual for ARMv8-A), here.
>
> > +   * return aarch64_output_scalar_simd_mov_immediate (operands[1],
> > + SImode);
> >     ldr\\t%h0, %1
> >     str\\t%h1, %0
> >     ldrh\\t%w0, %1
> >     strh\\t%w1, %0
> >     mov\\t%w0, %w1"
> > -  [(set_attr "type"
> "neon_move,neon_from_gp,neon_to_gp,neon_move,\
> > -                     f_loads,f_stores,load1,store1,mov_reg")
> > -   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
> > +  "&& can_create_pseudo_p ()
> > +   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
> > +   && !aarch64_float_const_representable_p (operands[1])
> > +   &&  aarch64_float_const_rtx_p (operands[1])"
> > +  [(const_int 0)]
> > +  "{
> > +    unsigned HOST_WIDE_INT ival;
> > +    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> > +      FAIL;
> > +
> > +    rtx tmp = gen_reg_rtx (SImode);
> > +    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
> > +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
> > +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
> > +    DONE;
> > +  }"
> > +  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts,
> \
> > +                neon_move,f_loads,f_stores,load1,store1,mov_reg")
> > +   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
> >  )
>
> Thanks,
> James

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-07-03  6:12       ` Tamar Christina
@ 2017-07-10  7:35         ` Tamar Christina
  0 siblings, 0 replies; 16+ messages in thread
From: Tamar Christina @ 2017-07-10  7:35 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: GCC Patches, nd, Marcus Shawcroft, Richard Earnshaw, Richard Sandiford

Ping
________________________________________
From: Tamar Christina
Sent: Monday, July 3, 2017 7:12:05 AM
To: James Greenhalgh
Cc: GCC Patches; nd; Marcus Shawcroft; Richard Earnshaw
Subject: Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.

Ping
________________________________________
From: gcc-patches-owner@gcc.gnu.org <gcc-patches-owner@gcc.gnu.org> on behalf of Tamar Christina <Tamar.Christina@arm.com>
Sent: Monday, June 26, 2017 11:50:51 AM
To: James Greenhalgh
Cc: GCC Patches; nd; Marcus Shawcroft; Richard Earnshaw
Subject: Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.

Hi all,

Here's the re-spun patch.
Aside from the grouping of the split patterns it now also uses h register for the fmov for HF when available,
otherwise it forces a literal load.

Regression tested on aarch64-none-linux-gnu and no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-26  Tamar Christina  <tamar.christina@arm.com>
            Richard Sandiford <richard.sandiford@linaro.org>

        * config/aarch64/aarch64.md (mov<mode>): Generalize.
        (*movhf_aarch64, *movsf_aarch64, *movdf_aarch64):
        Add integer and movi cases.
        (movi-split-hf-df-sf split, fp16): New.
        (enabled): Added TARGET_FP_F16INST.
        * config/aarch64/iterators.md (GPF_HF): New.
________________________________________
From: Tamar Christina
Sent: Wednesday, June 21, 2017 11:48:33 AM
To: James Greenhalgh
Cc: GCC Patches; nd; Marcus Shawcroft; Richard Earnshaw
Subject: RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.

> >     movi\\t%0.4h, #0
> > -   mov\\t%0.h[0], %w1
> > +   fmov\\t%s0, %w1
>
> Should this not be %h0?

The problem is that H registers are only available in ARMv8.2+,
I'm not sure what to do about ARMv8.1 given your other feedback
Pointing out that the bit patterns between how it's stored in s vs h registers
differ.

>
> >     umov\\t%w0, %1.h[0]
> >     mov\\t%0.h[0], %1.h[0]
> > +   fmov\\t%s0, %1
>
> Likewise, and much more important for correctness as it changes the way the
> bit pattern ends up in the register (see table C2-1 in release B.a of the ARM
> Architecture Reference Manual for ARMv8-A), here.
>
> > +   * return aarch64_output_scalar_simd_mov_immediate (operands[1],
> > + SImode);
> >     ldr\\t%h0, %1
> >     str\\t%h1, %0
> >     ldrh\\t%w0, %1
> >     strh\\t%w1, %0
> >     mov\\t%w0, %w1"
> > -  [(set_attr "type"
> "neon_move,neon_from_gp,neon_to_gp,neon_move,\
> > -                     f_loads,f_stores,load1,store1,mov_reg")
> > -   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
> > +  "&& can_create_pseudo_p ()
> > +   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
> > +   && !aarch64_float_const_representable_p (operands[1])
> > +   &&  aarch64_float_const_rtx_p (operands[1])"
> > +  [(const_int 0)]
> > +  "{
> > +    unsigned HOST_WIDE_INT ival;
> > +    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> > +      FAIL;
> > +
> > +    rtx tmp = gen_reg_rtx (SImode);
> > +    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
> > +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
> > +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
> > +    DONE;
> > +  }"
> > +  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts,
> \
> > +                neon_move,f_loads,f_stores,load1,store1,mov_reg")
> > +   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
> >  )
>
> Thanks,
> James

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-26 10:50     ` Tamar Christina
  2017-07-03  6:12       ` Tamar Christina
@ 2017-07-27 16:09       ` James Greenhalgh
  2017-08-01 11:47       ` Bin.Cheng
  2 siblings, 0 replies; 16+ messages in thread
From: James Greenhalgh @ 2017-07-27 16:09 UTC (permalink / raw)
  To: Tamar Christina; +Cc: GCC Patches, nd, Marcus Shawcroft, Richard Earnshaw

On Mon, Jun 26, 2017 at 11:50:51AM +0100, Tamar Christina wrote:
> Hi all,
> 
> Here's the re-spun patch.
> Aside from the grouping of the split patterns it now also uses h register for the fmov for HF when available,
> otherwise it forces a literal load.
> 
> Regression tested on aarch64-none-linux-gnu and no regressions.
> 
> OK for trunk?

OK.

Thanks,
James

> 
> Thanks,
> Tamar
> 
> 
> gcc/
> 2017-06-26  Tamar Christina  <tamar.christina@arm.com>
>             Richard Sandiford <richard.sandiford@linaro.org>
> 
>         * config/aarch64/aarch64.md (mov<mode>): Generalize.
>         (*movhf_aarch64, *movsf_aarch64, *movdf_aarch64):
>         Add integer and movi cases.
>         (movi-split-hf-df-sf split, fp16): New.
>         (enabled): Added TARGET_FP_F16INST.
>         * config/aarch64/iterators.md (GPF_HF): New.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-06-26 10:50     ` Tamar Christina
  2017-07-03  6:12       ` Tamar Christina
  2017-07-27 16:09       ` James Greenhalgh
@ 2017-08-01 11:47       ` Bin.Cheng
  2017-08-01 11:51         ` Tamar Christina
  2 siblings, 1 reply; 16+ messages in thread
From: Bin.Cheng @ 2017-08-01 11:47 UTC (permalink / raw)
  To: Tamar Christina
  Cc: James Greenhalgh, GCC Patches, nd, Marcus Shawcroft, Richard Earnshaw

On Mon, Jun 26, 2017 at 11:50 AM, Tamar Christina
<Tamar.Christina@arm.com> wrote:
> Hi all,
>
> Here's the re-spun patch.
> Aside from the grouping of the split patterns it now also uses h register for the fmov for HF when available,
> otherwise it forces a literal load.
>
> Regression tested on aarch64-none-linux-gnu and no regressions.
Hi,
There are lots of test failures on aarch64_be-none-elf; I verified two:
gcc.dg/vect/pr61680.c execution test
gcc.dg/vect/pr63148.c execution test

are caused by svn+ssh://gcc.gnu.org/svn/gcc/trunk@250673

Given that the review comments already pointed out a big-endian issue
and the patch was updated to address it, I would have expected a
regression test on a big-endian target before the patch was applied, right?

Thanks,
bin
>
> OK for trunk?
>
> Thanks,
> Tamar
>
>
> gcc/
> 2017-06-26  Tamar Christina  <tamar.christina@arm.com>
>             Richard Sandiford <richard.sandiford@linaro.org>
>
>         * config/aarch64/aarch64.md (mov<mode>): Generalize.
>         (*movhf_aarch64, *movsf_aarch64, *movdf_aarch64):
>         Add integer and movi cases.
>         (movi-split-hf-df-sf split, fp16): New.
>         (enabled): Added TARGET_FP_F16INST.
>         * config/aarch64/iterators.md (GPF_HF): New.
> ________________________________________
> From: Tamar Christina
> Sent: Wednesday, June 21, 2017 11:48:33 AM
> To: James Greenhalgh
> Cc: GCC Patches; nd; Marcus Shawcroft; Richard Earnshaw
> Subject: RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
>
>> >     movi\\t%0.4h, #0
>> > -   mov\\t%0.h[0], %w1
>> > +   fmov\\t%s0, %w1
>>
>> Should this not be %h0?
>
> The problem is that H registers are only available in ARMv8.2+,
> I'm not sure what to do about ARMv8.1 given your other feedback
> Pointing out that the bit patterns between how it's stored in s vs h registers
> differ.
>
>>
>> >     umov\\t%w0, %1.h[0]
>> >     mov\\t%0.h[0], %1.h[0]
>> > +   fmov\\t%s0, %1
>>
>> Likewise, and much more important for correctness as it changes the way the
>> bit pattern ends up in the register (see table C2-1 in release B.a of the ARM
>> Architecture Reference Manual for ARMv8-A), here.
>>
>> > +   * return aarch64_output_scalar_simd_mov_immediate (operands[1],
>> > + SImode);
>> >     ldr\\t%h0, %1
>> >     str\\t%h1, %0
>> >     ldrh\\t%w0, %1
>> >     strh\\t%w1, %0
>> >     mov\\t%w0, %w1"
>> > -  [(set_attr "type"
>> "neon_move,neon_from_gp,neon_to_gp,neon_move,\
>> > -                     f_loads,f_stores,load1,store1,mov_reg")
>> > -   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
>> > +  "&& can_create_pseudo_p ()
>> > +   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
>> > +   && !aarch64_float_const_representable_p (operands[1])
>> > +   &&  aarch64_float_const_rtx_p (operands[1])"
>> > +  [(const_int 0)]
>> > +  "{
>> > +    unsigned HOST_WIDE_INT ival;
>> > +    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
>> > +      FAIL;
>> > +
>> > +    rtx tmp = gen_reg_rtx (SImode);
>> > +    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
>> > +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
>> > +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
>> > +    DONE;
>> > +  }"
>> > +  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts,
>> \
>> > +                neon_move,f_loads,f_stores,load1,store1,mov_reg")
>> > +   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
>> >  )
>>
>> Thanks,
>> James
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-08-01 11:47       ` Bin.Cheng
@ 2017-08-01 11:51         ` Tamar Christina
  2017-08-01 12:04           ` Bin.Cheng
  0 siblings, 1 reply; 16+ messages in thread
From: Tamar Christina @ 2017-08-01 11:51 UTC (permalink / raw)
  To: Bin.Cheng
  Cc: James Greenhalgh, GCC Patches, nd, Marcus Shawcroft, Richard Earnshaw

> 
> Given review comment already pointed out big-endian issue and patch was
> updated to address it, I would expect reg-test on a big-endian target before
> applying patch, right?

The patch spent 6 months in external review.
Given that, I simply forgot to rerun the big-endian tests before the commit, as I did for the rest.

The failing tests were all added after the submission of this patch. I'll have a look.

> Thanks,
> bin
> >
> > OK for trunk?
> >
> > Thanks,
> > Tamar
> >
> >
> > gcc/
> > 2017-06-26  Tamar Christina  <tamar.christina@arm.com>
> >             Richard Sandiford <richard.sandiford@linaro.org>
> >
> >         * config/aarch64/aarch64.md (mov<mode>): Generalize.
> >         (*movhf_aarch64, *movsf_aarch64, *movdf_aarch64):
> >         Add integer and movi cases.
> >         (movi-split-hf-df-sf split, fp16): New.
> >         (enabled): Added TARGET_FP_F16INST.
> >         * config/aarch64/iterators.md (GPF_HF): New.
> > ________________________________________
> > From: Tamar Christina
> > Sent: Wednesday, June 21, 2017 11:48:33 AM
> > To: James Greenhalgh
> > Cc: GCC Patches; nd; Marcus Shawcroft; Richard Earnshaw
> > Subject: RE: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4)
> - HF/DF/SF mode.
> >
> >> >     movi\\t%0.4h, #0
> >> > -   mov\\t%0.h[0], %w1
> >> > +   fmov\\t%s0, %w1
> >>
> >> Should this not be %h0?
> >
> > The problem is that H registers are only available in ARMv8.2+. I'm
> > not sure what to do about ARMv8.1 given your other feedback pointing
> > out that the bit patterns differ between how the value is stored in
> > s vs. h registers.
> >
> >>
> >> >     umov\\t%w0, %1.h[0]
> >> >     mov\\t%0.h[0], %1.h[0]
> >> > +   fmov\\t%s0, %1
> >>
> >> Likewise, and much more important for correctness as it changes the
> >> way the bit pattern ends up in the register (see table C2-1 in
> >> release B.a of the ARM Architecture Reference Manual for ARMv8-A),
> here.
> >>
> >> > +   * return aarch64_output_scalar_simd_mov_immediate
> (operands[1],
> >> > + SImode);
> >> >     ldr\\t%h0, %1
> >> >     str\\t%h1, %0
> >> >     ldrh\\t%w0, %1
> >> >     strh\\t%w1, %0
> >> >     mov\\t%w0, %w1"
> >> > -  [(set_attr "type"
> >> "neon_move,neon_from_gp,neon_to_gp,neon_move,\
> >> > -                     f_loads,f_stores,load1,store1,mov_reg")
> >> > -   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
> >> > +  "&& can_create_pseudo_p ()
> >> > +   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
> >> > +   && !aarch64_float_const_representable_p (operands[1])
> >> > +   &&  aarch64_float_const_rtx_p (operands[1])"
> >> > +  [(const_int 0)]
> >> > +  "{
> >> > +    unsigned HOST_WIDE_INT ival;
> >> > +    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> >> > +      FAIL;
> >> > +
> >> > +    rtx tmp = gen_reg_rtx (SImode);
> >> > +    aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
> >> > +    tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
> >> > +    emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
> >> > +    DONE;
> >> > +  }"
> >> > +  [(set_attr "type"
> "neon_move,f_mcr,neon_to_gp,neon_move,fconsts,
> >> \
> >> > +                neon_move,f_loads,f_stores,load1,store1,mov_reg")
> >> > +   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
> >> >  )
> >>
> >> Thanks,
> >> James
> >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode.
  2017-08-01 11:51         ` Tamar Christina
@ 2017-08-01 12:04           ` Bin.Cheng
  0 siblings, 0 replies; 16+ messages in thread
From: Bin.Cheng @ 2017-08-01 12:04 UTC (permalink / raw)
  To: Tamar Christina
  Cc: James Greenhalgh, GCC Patches, nd, Marcus Shawcroft, Richard Earnshaw

On Tue, Aug 1, 2017 at 12:51 PM, Tamar Christina
<Tamar.Christina@arm.com> wrote:
>>
>> Given review comment already pointed out big-endian issue and patch was
>> updated to address it, I would expect reg-test on a big-endian target before
>> applying patch, right?
>
> The patch spent 6 months in external review.
> Given that, I simply forgot to rerun the big-endian tests before the commit, as I did for the rest.
>
> The failing tests were all added after the submission of this patch. I'll have a look.
Did I perhaps bisect to the wrong commit?  The patch was committed three days ago,
so how could all the failing tests have been added after it?

Thanks,
bin

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2017-08-01 12:04 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-07 11:38 [PATCH][GCC][AArch64] optimize float immediate moves (2 /4) - HF/DF/SF mode Tamar Christina
2017-06-12  7:31 ` Tamar Christina
2017-06-14 10:06   ` Richard Sandiford
2017-06-15 13:25     ` Tamar Christina
2017-06-15 14:28       ` Tamar Christina
2017-06-16  7:53         ` Richard Sandiford
2017-06-16  8:42           ` Tamar Christina
2017-06-14  8:43 ` James Greenhalgh
2017-06-21 10:48   ` Tamar Christina
2017-06-26 10:50     ` Tamar Christina
2017-07-03  6:12       ` Tamar Christina
2017-07-10  7:35         ` Tamar Christina
2017-07-27 16:09       ` James Greenhalgh
2017-08-01 11:47       ` Bin.Cheng
2017-08-01 11:51         ` Tamar Christina
2017-08-01 12:04           ` Bin.Cheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).