public inbox for gcc-patches@gcc.gnu.org
* [PATCH v4] LoongArch: Optimize immediate load.
@ 2022-11-17  9:59 Lulu Cheng
  2022-11-22 14:03 ` Xi Ruoyao
  0 siblings, 1 reply; 5+ messages in thread
From: Lulu Cheng @ 2022-11-17  9:59 UTC (permalink / raw)
  To: gcc-patches; +Cc: xry111, i, xuchenghua, Lulu Cheng

v1 -> v2:
1. Change the code format.
2. Fix bugs in the code.

v2 -> v3:
Rework a piece of code whose previous implementation relied on undefined
behavior.

v3 -> v4:
Move the immediate decomposition from the expand pass to the split pass.

Both regression tests and spec2006 passed.

The problem mentioned in the link below, where the four immediate load
instructions were not hoisted out of the loop, has been fixed.  Now, as in the
test case, the four immediate load instructions are generated outside the loop.
(https://sourceware.org/pipermail/libc-alpha/2022-September/142202.html)

--------------------------------------------------------------------
The loop2_invariant pass hoists instructions whose results do not change
inside a loop out of the loop.  If the target decomposes immediates already
during the expand pass, some of the resulting instructions no longer satisfy
the hoisting conditions, so the immediate decomposition is moved to the split
pass.
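
To make the mechanism concrete, here is a small standalone sketch (not part of
the patch, and simplified: unlike loongarch_build_integer it always emits all
four steps and never skips parts that are already correct by sign extension;
the register name $r12 is purely illustrative) that computes the
per-instruction immediates for the 64-bit constant used in imm-load.c:

  /* Simplified sketch of the lu12i.w/ori/lu32i.d/lu52i.d decomposition.  */
  #include <stdio.h>

  int main (void)
  {
    unsigned long long value = 0x1234567890abcdefULL;

    unsigned long long lu12i = (value >> 12) & 0xfffff; /* bits 12..31 */
    unsigned long long ori   = value & 0xfff;           /* bits 0..11  */
    unsigned long long lu32i = (value >> 32) & 0xfffff; /* bits 32..51 */
    unsigned long long lu52i = (value >> 52) & 0xfff;   /* bits 52..63 */

    printf ("lu12i.w  $r12, 0x%llx\n", lu12i);
    printf ("ori      $r12, $r12, 0x%llx\n", ori);
    printf ("lu32i.d  $r12, 0x%llx\n", lu32i);
    printf ("lu52i.d  $r12, $r12, 0x%llx\n", lu52i);
    return 0;
  }

Because the constant is kept as a single CONST_INT move until the split pass,
loop2_invariant sees one loop-invariant set and hoists it; the four
instructions produced by the later split therefore end up in front of the
loop, which is what imm-load1.c checks.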

gcc/ChangeLog:

	* config/loongarch/loongarch.cc (enum loongarch_load_imm_method):
	Remove the member METHOD_INSV, which is not currently used.
	(struct loongarch_integer_op): Add a new member curr_value that
	records the value held in the destination register immediately
	after the current instruction has executed.
	(loongarch_build_integer): Set the new curr_value member for each
	step of the load sequence.
	(loongarch_move_integer): Add a REG_EQUAL note recording the
	intermediate value produced by each immediate load instruction.
	* config/loongarch/loongarch.md (*movdi_32bit): Redefine as
	define_insn_and_split.
	(*movdi_64bit): Likewise.
	(*movsi_internal): Likewise.
	(*movhi_internal): Likewise.
	* config/loongarch/predicates.md (move_operand): Return true for
	any CONST_INT, so that immediates are not decomposed during the
	expand pass and are instead split later.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/imm-load.c: New test.
	* gcc.target/loongarch/imm-load1.c: New test.
---
 gcc/config/loongarch/loongarch.cc             | 62 ++++++++++---------
 gcc/config/loongarch/loongarch.md             | 44 +++++++++++--
 gcc/config/loongarch/predicates.md            |  2 +-
 gcc/testsuite/gcc.target/loongarch/imm-load.c | 10 +++
 .../gcc.target/loongarch/imm-load1.c          | 26 ++++++++
 5 files changed, 110 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load1.c

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 8ee32c90573..9e0d6c7c3ea 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -139,22 +139,21 @@ struct loongarch_address_info
 
    METHOD_LU52I:
      Load 52-63 bit of the immediate number.
-
-   METHOD_INSV:
-     immediate like 0xfff00000fffffxxx
-   */
+*/
 enum loongarch_load_imm_method
 {
   METHOD_NORMAL,
   METHOD_LU32I,
-  METHOD_LU52I,
-  METHOD_INSV
+  METHOD_LU52I
 };
 
 struct loongarch_integer_op
 {
   enum rtx_code code;
   HOST_WIDE_INT value;
+  /* Represent the result of the immediate count of the load instruction at
+     each step.  */
+  HOST_WIDE_INT curr_value;
   enum loongarch_load_imm_method method;
 };
 
@@ -1475,24 +1474,27 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
     {
       /* The value of the lower 32 bit be loaded with one instruction.
 	 lu12i.w.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part;
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part;
+      codes[cost].curr_value = low_part;
       cost++;
     }
   else
     {
       /* lu12i.w + ior.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].curr_value = codes[cost].value;
       cost++;
       HOST_WIDE_INT iorv = low_part & (IMM_REACH - 1);
       if (iorv != 0)
 	{
-	  codes[1].code = IOR;
-	  codes[1].method = METHOD_NORMAL;
-	  codes[1].value = iorv;
+	  codes[cost].code = IOR;
+	  codes[cost].method = METHOD_NORMAL;
+	  codes[cost].value = iorv;
+	  codes[cost].curr_value = low_part;
 	  cost++;
 	}
     }
@@ -1515,11 +1517,14 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
 	{
 	  codes[cost].method = METHOD_LU52I;
 	  codes[cost].value = value & LU52I_B;
+	  codes[cost].curr_value = value;
 	  return cost + 1;
 	}
 
       codes[cost].method = METHOD_LU32I;
       codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
+      codes[cost].curr_value = (value & 0xfffffffffffff)
+	| (sign51 ? LU52I_B : 0);
       cost++;
 
       /* Determine whether the 52-61 bits are sign-extended from the low order,
@@ -1528,6 +1533,7 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
 	{
 	  codes[cost].method = METHOD_LU52I;
 	  codes[cost].value = value & LU52I_B;
+	  codes[cost].curr_value = value;
 	  cost++;
 	}
     }
@@ -2911,6 +2917,9 @@ loongarch_move_integer (rtx temp, rtx dest, unsigned HOST_WIDE_INT value)
       else
 	x = force_reg (mode, x);
 
+      set_unique_reg_note (get_last_insn (), REG_EQUAL,
+			   GEN_INT (codes[i-1].curr_value));
+
       switch (codes[i].method)
 	{
 	case METHOD_NORMAL:
@@ -2918,22 +2927,17 @@ loongarch_move_integer (rtx temp, rtx dest, unsigned HOST_WIDE_INT value)
 			      GEN_INT (codes[i].value));
 	  break;
 	case METHOD_LU32I:
-	  emit_insn (
-	    gen_rtx_SET (x,
-			 gen_rtx_IOR (DImode,
-				      gen_rtx_ZERO_EXTEND (
-					DImode, gen_rtx_SUBREG (SImode, x, 0)),
-				      GEN_INT (codes[i].value))));
+	  gcc_assert (mode == DImode);
+	  x = gen_rtx_IOR (DImode,
+			   gen_rtx_ZERO_EXTEND (DImode,
+						gen_rtx_SUBREG (SImode, x, 0)),
+			   GEN_INT (codes[i].value));
 	  break;
 	case METHOD_LU52I:
-	  emit_insn (gen_lu52i_d (x, x, GEN_INT (0xfffffffffffff),
-				  GEN_INT (codes[i].value)));
-	  break;
-	case METHOD_INSV:
-	  emit_insn (
-	    gen_rtx_SET (gen_rtx_ZERO_EXTRACT (DImode, x, GEN_INT (20),
-					       GEN_INT (32)),
-			 gen_rtx_REG (DImode, 0)));
+	  gcc_assert (mode == DImode);
+	  x = gen_rtx_IOR (DImode,
+			   gen_rtx_AND (DImode, x, GEN_INT (0xfffffffffffff)),
+			   GEN_INT (codes[i].value));
 	  break;
 	default:
 	  gcc_unreachable ();
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 2fda5381904..f61db66d535 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -1718,23 +1718,41 @@ (define_expand "movdi"
     DONE;
 })
 
-(define_insn "*movdi_32bit"
+(define_insn_and_split "*movdi_32bit"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r,w,*f,*f,*r,*m")
        (match_operand:DI 1 "move_operand" "r,i,w,r,*J*r,*m,*f,*f"))]
   "!TARGET_64BIT
    && (register_operand (operands[0], DImode)
        || reg_or_0_operand (operands[1], DImode))"
   { return loongarch_output_move (operands[0], operands[1]); }
+  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P (REGNO
+  (operands[0]))"
+  [(const_int 0)]
+  "
+{
+  loongarch_move_integer (operands[0], operands[0], INTVAL (operands[1]));
+  DONE;
+}
+  "
   [(set_attr "move_type" "move,const,load,store,mgtf,fpload,mftg,fpstore")
    (set_attr "mode" "DI")])
 
-(define_insn "*movdi_64bit"
+(define_insn_and_split "*movdi_64bit"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r,w,*f,*f,*r,*m")
 	(match_operand:DI 1 "move_operand" "r,Yd,w,rJ,*r*J,*m,*f,*f"))]
   "TARGET_64BIT
    && (register_operand (operands[0], DImode)
        || reg_or_0_operand (operands[1], DImode))"
   { return loongarch_output_move (operands[0], operands[1]); }
+  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P (REGNO
+  (operands[0]))"
+  [(const_int 0)]
+  "
+{
+  loongarch_move_integer (operands[0], operands[0], INTVAL (operands[1]));
+  DONE;
+}
+  "
   [(set_attr "move_type" "move,const,load,store,mgtf,fpload,mftg,fpstore")
    (set_attr "mode" "DI")])
 
@@ -1749,12 +1767,21 @@ (define_expand "movsi"
     DONE;
 })
 
-(define_insn "*movsi_internal"
+(define_insn_and_split "*movsi_internal"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,w,*f,*f,*r,*m,*r,*z")
 	(match_operand:SI 1 "move_operand" "r,Yd,w,rJ,*r*J,*m,*f,*f,*z,*r"))]
   "(register_operand (operands[0], SImode)
     || reg_or_0_operand (operands[1], SImode))"
   { return loongarch_output_move (operands[0], operands[1]); }
+  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P (REGNO
+  (operands[0]))"
+  [(const_int 0)]
+  "
+{
+  loongarch_move_integer (operands[0], operands[0], INTVAL (operands[1]));
+  DONE;
+}
+  "
   [(set_attr "move_type" "move,const,load,store,mgtf,fpload,mftg,fpstore,mftg,mgtf")
    (set_attr "mode" "SI")])
 
@@ -1774,12 +1801,21 @@ (define_expand "movhi"
     DONE;
 })
 
-(define_insn "*movhi_internal"
+(define_insn_and_split "*movhi_internal"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,r,m,r,k")
 	(match_operand:HI 1 "move_operand" "r,Yd,I,m,rJ,k,rJ"))]
   "(register_operand (operands[0], HImode)
        || reg_or_0_operand (operands[1], HImode))"
   { return loongarch_output_move (operands[0], operands[1]); }
+  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P (REGNO
+  (operands[0]))"
+  [(const_int 0)]
+  "
+{
+  loongarch_move_integer (operands[0], operands[0], INTVAL (operands[1]));
+  DONE;
+}
+  "
   [(set_attr "move_type" "move,const,const,load,store,load,store")
    (set_attr "mode" "HI")])
 
diff --git a/gcc/config/loongarch/predicates.md b/gcc/config/loongarch/predicates.md
index 8bd0c1376c9..58c3dc2261c 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -226,7 +226,7 @@ (define_predicate "move_operand"
   switch (GET_CODE (op))
     {
     case CONST_INT:
-      return !splittable_const_int_operand (op, mode);
+      return true;
 
     case CONST:
     case SYMBOL_REF:
diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load.c b/gcc/testsuite/gcc.target/loongarch/imm-load.c
new file mode 100644
index 00000000000..c04ca33996f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/imm-load.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -O2 -fdump-rtl-split1" } */
+
+long int
+test (void)
+{
+  return 0x1234567890abcdef;
+}
+/* { dg-final { scan-rtl-dump-times "scanning new insn with uid" 6 "split1" } } */
+
diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load1.c b/gcc/testsuite/gcc.target/loongarch/imm-load1.c
new file mode 100644
index 00000000000..2ff02971239
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/imm-load1.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -O2" } */
+/* { dg-final { scan-assembler "test:.*lu52i\.d.*\n\taddi\.w.*\n\.L2:" } } */
+
+
+extern long long b[10];
+static inline long long
+repeat_bytes (void)
+{
+  long long r = 0x0101010101010101;
+
+  return r;
+}
+
+static inline long long
+highbit_mask (long long m)
+{
+  return m & repeat_bytes ();
+}
+
+void test(long long *a)
+{
+  for (int i = 0; i < 10; i++)
+    b[i] = highbit_mask (a[i]);
+
+}
-- 
2.31.1



Thread overview: 5+ messages
2022-11-17  9:59 [PATCH v4] LoongArch: Optimize immediate load Lulu Cheng
2022-11-22 14:03 ` Xi Ruoyao
2022-11-22 16:44   ` Xi Ruoyao
2022-11-23  2:12     ` chenglulu
2022-11-28  2:46     ` [pushed][PATCH " Lulu Cheng
