public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH v1 0/2] Optimize immediate load. Add prefetch insns.
@ 2022-10-29  7:05 Lulu Cheng
  2022-10-29  7:05 ` [PATCH v1 1/2] LoongArch: Optimize immediate load Lulu Cheng
  2022-10-29  7:05 ` [PATCH v1 2/2] LoongArch: Add prefetch insns Lulu Cheng
  0 siblings, 2 replies; 4+ messages in thread
From: Lulu Cheng @ 2022-10-29  7:05 UTC (permalink / raw)
  To: gcc-patches; +Cc: xry111, i, xuchenghua, Lulu Cheng

1. The problem reported at the link below is that the four immediate load
   instructions are not moved out of the loop. This has been fixed: as the test
   case shows, the four immediate load instructions are now generated outside
   the loop.
   (https://sourceware.org/pipermail/libc-alpha/2022-September/142202.html)

2. Add prefetch insns.

*** BLURB HERE ***

Lulu Cheng (2):
  LoongArch: Optimize immediate load.
  LoongArch: Add prefetch insns.

 gcc/config/loongarch/constraints.md           |   7 +-
 gcc/config/loongarch/loongarch-def.c          |   2 +
 gcc/config/loongarch/loongarch-protos.h       |   1 +
 gcc/config/loongarch/loongarch-tune.h         |   1 +
 gcc/config/loongarch/loongarch.cc             | 143 ++++++++++++++----
 gcc/config/loongarch/loongarch.h              |   6 +
 gcc/config/loongarch/loongarch.md             |  49 ++++++
 gcc/config/loongarch/predicates.md            |   8 +
 gcc/testsuite/gcc.target/loongarch/imm-load.c |  25 +++
 9 files changed, 208 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c

-- 
2.31.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH v1 1/2] LoongArch: Optimize immediate load.
  2022-10-29  7:05 [PATCH v1 0/2] Optimize immediate load. Add prefetch insns Lulu Cheng
@ 2022-10-29  7:05 ` Lulu Cheng
  2022-10-29  7:05 ` [PATCH v1 2/2] LoongArch: Add prefetch insns Lulu Cheng
  1 sibling, 0 replies; 4+ messages in thread
From: Lulu Cheng @ 2022-10-29  7:05 UTC (permalink / raw)
  To: gcc-patches; +Cc: xry111, i, xuchenghua, Lulu Cheng

Fix an issue where the compiler did not move the four instructions that load
a 64-bit immediate out of the loop.

gcc/ChangeLog:

	* config/loongarch/constraints.md (x): New constraint.
	* config/loongarch/loongarch.cc (struct loongarch_integer_op):
	Define a new member curr_value, that records the value of
	the number stored in the destination register immediately
	after the current instruction has run.
	(loongarch_build_integer): Add a method to load bits 32-63 of the
	immediate.
	(loongarch_move_integer): Same as above.
	* config/loongarch/loongarch.h (HWIT_UC_0xFFFFFFFF):
	(HI32_OPERAND): New macro.
	* config/loongarch/loongarch.md (load_hi32): New template.
	* config/loongarch/predicates.md (const_hi32_operand): Determine
	whether the value is an immediate whose low 32 bits are all zero.
	(hi32_mask_operand): Match the mask constant 0xffffffff, which
	selects the low 32 bits.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/imm-load.c: New test.
---
 gcc/config/loongarch/constraints.md           |  7 +-
 gcc/config/loongarch/loongarch.cc             | 95 ++++++++++++-------
 gcc/config/loongarch/loongarch.h              |  6 ++
 gcc/config/loongarch/loongarch.md             | 26 +++++
 gcc/config/loongarch/predicates.md            |  8 ++
 gcc/testsuite/gcc.target/loongarch/imm-load.c | 25 +++++
 6 files changed, 133 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c

diff --git a/gcc/config/loongarch/constraints.md b/gcc/config/loongarch/constraints.md
index 43cb7b5f0f5..1dcf09ce5eb 100644
--- a/gcc/config/loongarch/constraints.md
+++ b/gcc/config/loongarch/constraints.md
@@ -46,7 +46,7 @@
 ;; "u" "A signed 52bit constant and low 32-bit is zero (for logic instructions)"
 ;; "v" "A signed 64-bit constant and low 44-bit is zero (for logic instructions)."
 ;; "w" "Matches any valid memory."
-;; "x" <-----unused
+;; "x" "A signed 64-bit constant and low 32-bit is zero (for logic instructions)."
 ;; "y" <-----unused
 ;; "z" FCC_REGS
 ;; "A" <-----unused
@@ -139,6 +139,11 @@ (define_constraint "v"
   (and (match_code "const_int")
        (match_test "LU52I_OPERAND (ival)")))
 
+(define_constraint "x"
+  "A signed 64-bit constant and low 32-bit is zero (for logic instructions)."
+  (and (match_code "const_int")
+       (match_test "HI32_OPERAND (ival)")))
+
 (define_register_constraint "z" "FCC_REGS"
   "A floating-point condition code register.")
 
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index f54c233f90c..5e8cd293645 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -139,6 +139,9 @@ struct loongarch_address_info
    METHOD_LU52I:
      Load 52-63 bit of the immediate number.
 
+   METHOD_LD_HI32:
+     Load 32-63 bit of the immediate number.
+
    METHOD_INSV:
      immediate like 0xfff00000fffffxxx
    */
@@ -147,13 +150,18 @@ enum loongarch_load_imm_method
   METHOD_NORMAL,
   METHOD_LU32I,
   METHOD_LU52I,
+  METHOD_LD_HI32,
   METHOD_INSV
 };
 
 struct loongarch_integer_op
 {
   enum rtx_code code;
+  /* The immediate operand of the current load instruction.  */
   HOST_WIDE_INT value;
+  /* The value held in the destination register after the instruction
+     at this step has run.  */
+  HOST_WIDE_INT curr_value;
   enum loongarch_load_imm_method method;
 };
 
@@ -1474,24 +1482,27 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
     {
       /* The value of the lower 32 bit be loaded with one instruction.
 	 lu12i.w.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part;
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part;
+      codes[cost].curr_value = low_part;
       cost++;
     }
   else
     {
       /* lu12i.w + ior.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].curr_value = codes[cost].value;
       cost++;
       HOST_WIDE_INT iorv = low_part & (IMM_REACH - 1);
       if (iorv != 0)
 	{
-	  codes[1].code = IOR;
-	  codes[1].method = METHOD_NORMAL;
-	  codes[1].value = iorv;
+	  codes[cost].code = IOR;
+	  codes[cost].method = METHOD_NORMAL;
+	  codes[cost].value = iorv;
+	  codes[cost].curr_value = low_part;
 	  cost++;
 	}
     }
@@ -1514,23 +1525,34 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
 	{
 	  codes[cost].method = METHOD_LU52I;
 	  codes[cost].value = value & LU52I_B;
-	  return cost + 1;
+	  codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+							0xfffffffffffff);
+	  return cost++;
 	}
 
-      codes[cost].method = METHOD_LU32I;
-      codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
-      cost++;
-
-      /* Determine whether the 52-61 bits are sign-extended from the low order,
-	 and if not, load the 52-61 bits.  */
-      if (!lu52i[(value & (HOST_WIDE_INT_1U << 51)) >> 51])
+      if (lu52i[sign51])
 	{
-	  codes[cost].method = METHOD_LU52I;
-	  codes[cost].value = value & LU52I_B;
+	  /* Determine whether bits 52-63 are sign-extended from the low order.
+	     If so, bits 52-63 of the immediate number do not need to be loaded.
+	  */
+	  codes[cost].method = METHOD_LU32I;
+	  codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
+	  codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+							0xffffffff);
+	  cost++;
+	}
+      else
+	{
+	  /* If the higher 32 bits of the 64bit immediate need to be loaded
+	     separately by two instructions, a false immediate load instruction
+	     load_hi32 is used to load them.  */
+	  codes[cost].method = METHOD_LD_HI32;
+	  codes[cost].value = value & 0xffffffff00000000;
+	  codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+							0xffffffff);
 	  cost++;
 	}
     }
-
   gcc_assert (cost <= LARCH_MAX_INTEGER_OPS);
 
   return cost;
@@ -2910,30 +2932,37 @@ loongarch_move_integer (rtx temp, rtx dest, unsigned HOST_WIDE_INT value)
       else
 	x = force_reg (mode, x);
 
+      set_unique_reg_note (get_last_insn (), REG_EQUAL, GEN_INT (codes[i-1].curr_value));
+
       switch (codes[i].method)
 	{
 	case METHOD_NORMAL:
+	  /* mov or ior.  */
 	  x = gen_rtx_fmt_ee (codes[i].code, mode, x,
 			      GEN_INT (codes[i].value));
 	  break;
 	case METHOD_LU32I:
-	  emit_insn (
-	    gen_rtx_SET (x,
-			 gen_rtx_IOR (DImode,
-				      gen_rtx_ZERO_EXTEND (
-					DImode, gen_rtx_SUBREG (SImode, x, 0)),
-				      GEN_INT (codes[i].value))));
+	  gcc_assert (mode == DImode);
+	  /* lu32i_d */
+	  x = gen_rtx_IOR (mode, gen_rtx_ZERO_EXTEND (mode,
+						gen_rtx_SUBREG (SImode, x, 0)),
+			   GEN_INT (codes[i].value));
 	  break;
 	case METHOD_LU52I:
-	  emit_insn (gen_lu52i_d (x, x, GEN_INT (0xfffffffffffff),
-				  GEN_INT (codes[i].value)));
+	  gcc_assert (mode == DImode);
+	  /* lu52i_d */
+	  x = gen_rtx_IOR (mode, gen_rtx_AND (mode, x, GEN_INT (0xfffffffffffff)),
+			   GEN_INT (codes[i].value));
 	  break;
-	case METHOD_INSV:
-	  emit_insn (
-	    gen_rtx_SET (gen_rtx_ZERO_EXTRACT (DImode, x, GEN_INT (20),
-					       GEN_INT (32)),
-			 gen_rtx_REG (DImode, 0)));
+	case METHOD_LD_HI32:
+	  /* Load the high 32 bits of the immediate number.  */
+	  gcc_assert (mode == DImode);
+	  /* load_hi32 */
+	  x = gen_rtx_IOR (mode, gen_rtx_AND (mode, x, GEN_INT (0xffffffff)),
+			   GEN_INT (codes[i].value));
 	  break;
+	case METHOD_INSV:
+	  /* It is not currently implemented.  */
 	default:
 	  gcc_unreachable ();
 	}
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index f4a9c329fef..cfc046f546e 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -605,6 +605,12 @@ enum reg_class
 #define LU52I_OPERAND(VALUE) \
   (((VALUE) | (HWIT_UC_0xFFF << 52)) == (HWIT_UC_0xFFF << 52))
 
+/* True if VALUE can be loaded into a register using load_hi32.  */
+
+#define HWIT_UC_0xFFFFFFFF HOST_WIDE_INT_UC(0xffffffff)
+#define HI32_OPERAND(VALUE) \
+  (((VALUE) | (HWIT_UC_0xFFFFFFFF << 32)) == (HWIT_UC_0xFFFFFFFF << 32))
+
 /* Return a value X with the low 12 bits clear, and such that
    VALUE - X is a signed 12-bit value.  */
 
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 214b14bddd3..7eaa9ab66e3 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -1882,6 +1882,32 @@ (define_expand "mov<mode>cc"
   DONE;
 })
 
+(define_insn_and_split "load_hi32"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(ior:DI
+	  (and:DI (match_operand:DI 1 "register_operand" "0")
+		  (match_operand 2 "hi32_mask_operand"))
+	(match_operand 3 "const_hi32_operand" "x")))]
+  "TARGET_64BIT"
+  "#"
+  ""
+  [(set (match_dup 0)
+        (ior:DI
+          (zero_extend:DI
+            (subreg:SI (match_dup 1) 0))
+          (match_dup 4)))
+   (set (match_dup 0)
+        (ior:DI
+          (and:DI (match_dup 0)
+                  (match_dup 6))
+          (match_dup 5)))]
+{
+  operands[4] = GEN_INT (INTVAL (operands[3]) << 12 >> 12);
+  operands[5] = GEN_INT (INTVAL (operands[3]) & 0xfff0000000000000);
+  operands[6] = GEN_INT (0xfffffffffffff);
+}
+  [(set_attr "insn_count" "2")])
+
 (define_insn "lu32i_d"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(ior:DI
diff --git a/gcc/config/loongarch/predicates.md b/gcc/config/loongarch/predicates.md
index 8bd0c1376c9..29d81ff0250 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -35,6 +35,10 @@ (define_predicate "const_lu52i_operand"
   (and (match_code "const_int")
        (match_test "LU52I_OPERAND (INTVAL (op))")))
 
+(define_predicate "const_hi32_operand"
+  (and (match_code "const_int")
+       (match_test "HI32_OPERAND (INTVAL (op))")))
+
 (define_predicate "const_arith_operand"
   (and (match_code "const_int")
        (match_test "IMM12_OPERAND (INTVAL (op))")))
@@ -103,6 +107,10 @@ (define_predicate "lu52i_mask_operand"
   (and (match_code "const_int")
        (match_test "UINTVAL (op) == 0xfffffffffffff")))
 
+(define_predicate "hi32_mask_operand"
+  (and (match_code "const_int")
+       (match_test "UINTVAL (op) == 0xffffffff")))
+
 (define_predicate "low_bitmask_operand"
   (and (match_code "const_int")
        (match_test "low_bitmask_len (mode, INTVAL (op)) > 12")))
diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load.c b/gcc/testsuite/gcc.target/loongarch/imm-load.c
new file mode 100644
index 00000000000..91ceb33d058
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/imm-load.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -O2 -fdump-rtl-loop2_invariant" } */
+
+extern long long b[10];
+static inline long long
+repeat_bytes (void)
+{
+  long long r = 0x0101010101010101;
+
+  return r;
+}
+
+static inline long long
+highbit_mask (long long m)
+{
+  return m & repeat_bytes ();
+}
+
+void test(long long *a)
+{
+  for (int i = 0; i < 10; i++)
+    b[i] = highbit_mask (a[i]);
+
+}
+/* { dg-final { scan-rtl-dump-times "moved without introducing a new temporary register" 4 "loop2_invariant" } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH v1 2/2] LoongArch: Add prefetch insns.
  2022-10-29  7:05 [PATCH v1 0/2] Optimize immediate load. Add prefetch insns Lulu Cheng
  2022-10-29  7:05 ` [PATCH v1 1/2] LoongArch: Optimize immediate load Lulu Cheng
@ 2022-10-29  7:05 ` Lulu Cheng
  2022-10-29  7:40   ` Lulu Cheng
  1 sibling, 1 reply; 4+ messages in thread
From: Lulu Cheng @ 2022-10-29  7:05 UTC (permalink / raw)
  To: gcc-patches; +Cc: xry111, i, xuchenghua, Lulu Cheng, xujiahao

Co-Authored-By: xujiahao <xujiahao@loongson.cn>

gcc/ChangeLog:

	* config/loongarch/loongarch-def.c: Initial number of parallel prefetch.
	* config/loongarch/loongarch-protos.h (loongarch_prefetch_cookie):
	Function declaration.
	* config/loongarch/loongarch-tune.h (struct loongarch_cache):
	Define number of parallel prefetch.
	* config/loongarch/loongarch.cc (loongarch_option_override_internal):
	Set up parameters to be used in prefetching algorithm.
	(loongarch_prefetch_cookie): Select load or store based on the value of write.
	* config/loongarch/loongarch.md (prefetch): New template.
	(*prefetch_indexed_<mode>): New template.
---
 gcc/config/loongarch/loongarch-def.c    |  2 ++
 gcc/config/loongarch/loongarch-protos.h |  1 +
 gcc/config/loongarch/loongarch-tune.h   |  1 +
 gcc/config/loongarch/loongarch.cc       | 48 +++++++++++++++++++++++++
 gcc/config/loongarch/loongarch.md       | 23 ++++++++++++
 5 files changed, 75 insertions(+)

diff --git a/gcc/config/loongarch/loongarch-def.c b/gcc/config/loongarch/loongarch-def.c
index cbf995d81b5..80ab10a52a8 100644
--- a/gcc/config/loongarch/loongarch-def.c
+++ b/gcc/config/loongarch/loongarch-def.c
@@ -62,11 +62,13 @@ loongarch_cpu_cache[N_TUNE_TYPES] = {
       .l1d_line_size = 64,
       .l1d_size = 64,
       .l2d_size = 256,
+      .simultaneous_prefetches = 4,
   },
   [CPU_LA464] = {
       .l1d_line_size = 64,
       .l1d_size = 64,
       .l2d_size = 256,
+      .simultaneous_prefetches = 4,
   },
 };
 
diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h
index 77b2217247d..489525b520e 100644
--- a/gcc/config/loongarch/loongarch-protos.h
+++ b/gcc/config/loongarch/loongarch-protos.h
@@ -179,5 +179,6 @@ extern tree loongarch_builtin_decl (unsigned int, bool);
 extern rtx loongarch_expand_builtin (tree, rtx, rtx subtarget ATTRIBUTE_UNUSED,
 				     machine_mode, int);
 extern tree loongarch_build_builtin_va_list (void);
+extern rtx loongarch_prefetch_cookie (rtx, rtx);
 
 #endif /* ! GCC_LOONGARCH_PROTOS_H */
diff --git a/gcc/config/loongarch/loongarch-tune.h b/gcc/config/loongarch/loongarch-tune.h
index 6f3530f5c02..8e3eb29472b 100644
--- a/gcc/config/loongarch/loongarch-tune.h
+++ b/gcc/config/loongarch/loongarch-tune.h
@@ -45,6 +45,7 @@ struct loongarch_cache {
     int l1d_line_size;  /* bytes */
     int l1d_size;       /* KiB */
     int l2d_size;       /* kiB */
+    int simultaneous_prefetches; /* number of parallel prefetch */
 };
 
 #endif /* LOONGARCH_TUNE_H */
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 5e8cd293645..d663afe434d 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -63,6 +63,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "context.h"
 #include "builtins.h"
 #include "rtl-iter.h"
+#include "params.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -6126,6 +6127,33 @@ loongarch_option_override_internal (struct gcc_options *opts)
   if (loongarch_branch_cost == 0)
     loongarch_branch_cost = loongarch_cost->branch_cost;
 
+  /* Set up parameters to be used in prefetching algorithm. */
+  maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
+			 loongarch_cpu_cache[LARCH_ACTUAL_TUNE].simultaneous_prefetches,
+			 opts->x_param_values,
+			 opts_set->x_param_values);
+
+  maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+			 loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l1d_line_size,
+			 opts->x_param_values,
+			 opts_set->x_param_values);
+
+  maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+			 loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l1d_size,
+			 opts->x_param_values,
+			 opts_set->x_param_values);
+
+  maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+			 loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l2d_size,
+			 opts->x_param_values,
+			 opts_set->x_param_values);
+
+  /* Enable sw prefetching at -O3 and higher. */
+  if (opts->x_flag_prefetch_loop_arrays < 0
+      && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
+      && !opts->x_optimize_size)
+    opts->x_flag_prefetch_loop_arrays = 1;
+
   if (TARGET_DIRECT_EXTERN_ACCESS && flag_shlib)
     error ("%qs cannot be used for compiling a shared library",
 	   "-mdirect-extern-access");
@@ -6506,6 +6534,26 @@ loongarch_asan_shadow_offset (void)
   return TARGET_64BIT ? (HOST_WIDE_INT_1 << 46) : 0;
 }
 
+/* LoongArch only implements preld hint=0 (prefetch for load) and hint=8
+   (prefetch for store), other hint just scale to hint = 0 and hint = 1. */
+
+rtx
+loongarch_prefetch_cookie (rtx write, rtx locality)
+{
+  if (INTVAL (locality) == 1 && INTVAL (write) == 0)
+    return GEN_INT (INTVAL (write) + 2);
+
+  /* store.  */
+  if (INTVAL (write) == 1)
+    return GEN_INT (INTVAL (write) + 7);
+
+  /* load.  */
+  if (INTVAL (write) == 0)
+    return GEN_INT (INTVAL (write));
+
+  gcc_unreachable ();
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 7eaa9ab66e3..be247164eb4 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -3201,6 +3201,29 @@ (define_expand "untyped_call"
 ;;  ....................
 ;;
 
+(define_insn "prefetch"
+  [(prefetch (match_operand 0 "address_operand" "p")
+	     (match_operand 1 "const_int_operand" "n")
+	     (match_operand 2 "const_int_operand" "n"))]
+  ""
+{
+  operands[1] = loongarch_prefetch_cookie (operands[1], operands[2]);
+  return "preld\t%1,%a0";
+}
+  [(set_attr "type" "prefetch")])
+
+(define_insn "*prefetch_indexed_<mode>"
+  [(prefetch (plus:P (match_operand 0 "register_operand" "r")
+		     (match_operand 1 "register_operand" "r"))
+	     (match_operand 2 "const_int_operand" "n")
+	     (match_operand 3 "const_int_operand" "n"))]
+  ""
+{
+  operands[2] = loongarch_prefetch_cookie (operands[2], operands[3]);
+  return "preldx\t%2,%1,%0";
+}
+  [(set_attr "type" "prefetchx")])
+
 (define_insn "nop"
   [(const_int 0)]
   ""
-- 
2.31.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v1 2/2] LoongArch: Add prefetch insns.
  2022-10-29  7:05 ` [PATCH v1 2/2] LoongArch: Add prefetch insns Lulu Cheng
@ 2022-10-29  7:40   ` Lulu Cheng
  0 siblings, 0 replies; 4+ messages in thread
From: Lulu Cheng @ 2022-10-29  7:40 UTC (permalink / raw)
  To: gcc-patches; +Cc: xry111, i, xuchenghua, xujiahao

Sorry, there is a problem in this patch. I will resend it after fixing
it.

在 2022/10/29 下午3:05, Lulu Cheng 写道:
> Co-Authored-By: xujiahao <xujiahao@loongson.cn>
>
> gcc/ChangeLog:
>
> 	* config/loongarch/loongarch-def.c: Initial number of parallel prefetch.
> 	* config/loongarch/loongarch-protos.h (loongarch_prefetch_cookie):
> 	Function declaration.
> 	* config/loongarch/loongarch-tune.h (struct loongarch_cache):
> 	Define number of parallel prefetch.
> 	* config/loongarch/loongarch.cc (loongarch_option_override_internal):
> 	Set up parameters to be used in prefetching algorithm.
> 	(loongarch_prefetch_cookie): Select load or store based on the value of write.
> 	* config/loongarch/loongarch.md (prefetch): New template.
> 	(*prefetch_indexed_<mode>): New template.
> ---
>   gcc/config/loongarch/loongarch-def.c    |  2 ++
>   gcc/config/loongarch/loongarch-protos.h |  1 +
>   gcc/config/loongarch/loongarch-tune.h   |  1 +
>   gcc/config/loongarch/loongarch.cc       | 48 +++++++++++++++++++++++++
>   gcc/config/loongarch/loongarch.md       | 23 ++++++++++++
>   5 files changed, 75 insertions(+)
>
> diff --git a/gcc/config/loongarch/loongarch-def.c b/gcc/config/loongarch/loongarch-def.c
> index cbf995d81b5..80ab10a52a8 100644
> --- a/gcc/config/loongarch/loongarch-def.c
> +++ b/gcc/config/loongarch/loongarch-def.c
> @@ -62,11 +62,13 @@ loongarch_cpu_cache[N_TUNE_TYPES] = {
>         .l1d_line_size = 64,
>         .l1d_size = 64,
>         .l2d_size = 256,
> +      .simultaneous_prefetches = 4,
>     },
>     [CPU_LA464] = {
>         .l1d_line_size = 64,
>         .l1d_size = 64,
>         .l2d_size = 256,
> +      .simultaneous_prefetches = 4,
>     },
>   };
>   
> diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h
> index 77b2217247d..489525b520e 100644
> --- a/gcc/config/loongarch/loongarch-protos.h
> +++ b/gcc/config/loongarch/loongarch-protos.h
> @@ -179,5 +179,6 @@ extern tree loongarch_builtin_decl (unsigned int, bool);
>   extern rtx loongarch_expand_builtin (tree, rtx, rtx subtarget ATTRIBUTE_UNUSED,
>   				     machine_mode, int);
>   extern tree loongarch_build_builtin_va_list (void);
> +extern rtx loongarch_prefetch_cookie (rtx, rtx);
>   
>   #endif /* ! GCC_LOONGARCH_PROTOS_H */
> diff --git a/gcc/config/loongarch/loongarch-tune.h b/gcc/config/loongarch/loongarch-tune.h
> index 6f3530f5c02..8e3eb29472b 100644
> --- a/gcc/config/loongarch/loongarch-tune.h
> +++ b/gcc/config/loongarch/loongarch-tune.h
> @@ -45,6 +45,7 @@ struct loongarch_cache {
>       int l1d_line_size;  /* bytes */
>       int l1d_size;       /* KiB */
>       int l2d_size;       /* kiB */
> +    int simultaneous_prefetches; /* number of parallel prefetch */
>   };
>   
>   #endif /* LOONGARCH_TUNE_H */
> diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
> index 5e8cd293645..d663afe434d 100644
> --- a/gcc/config/loongarch/loongarch.cc
> +++ b/gcc/config/loongarch/loongarch.cc
> @@ -63,6 +63,7 @@ along with GCC; see the file COPYING3.  If not see
>   #include "context.h"
>   #include "builtins.h"
>   #include "rtl-iter.h"
> +#include "params.h"
>   
>   /* This file should be included last.  */
>   #include "target-def.h"
> @@ -6126,6 +6127,33 @@ loongarch_option_override_internal (struct gcc_options *opts)
>     if (loongarch_branch_cost == 0)
>       loongarch_branch_cost = loongarch_cost->branch_cost;
>   
> +  /* Set up parameters to be used in prefetching algorithm. */
> +  maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
> +			 loongarch_cpu_cache[LARCH_ACTUAL_TUNE].simultaneous_prefetches,
> +			 opts->x_param_values,
> +			 opts_set->x_param_values);
> +
> +  maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
> +			 loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l1d_line_size,
> +			 opts->x_param_values,
> +			 opts_set->x_param_values);
> +
> +  maybe_set_param_value (PARAM_L1_CACHE_SIZE,
> +			 loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l1d_size,
> +			 opts->x_param_values,
> +			 opts_set->x_param_values);
> +
> +  maybe_set_param_value (PARAM_L2_CACHE_SIZE,
> +			 loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l2d_size,
> +			 opts->x_param_values,
> +			 opts_set->x_param_values);
> +
> +  /* Enable sw prefetching at -O3 and higher. */
> +  if (opts->x_flag_prefetch_loop_arrays < 0
> +      && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
> +      && !opts->x_optimize_size)
> +    opts->x_flag_prefetch_loop_arrays = 1;
> +
>     if (TARGET_DIRECT_EXTERN_ACCESS && flag_shlib)
>       error ("%qs cannot be used for compiling a shared library",
>   	   "-mdirect-extern-access");
> @@ -6506,6 +6534,26 @@ loongarch_asan_shadow_offset (void)
>     return TARGET_64BIT ? (HOST_WIDE_INT_1 << 46) : 0;
>   }
>   
> +/* LoongArch only implements preld hint=0 (prefetch for load) and hint=8
> +   (prefetch for store), other hint just scale to hint = 0 and hint = 1. */
> +
> +rtx
> +loongarch_prefetch_cookie (rtx write, rtx locality)
> +{
> +  if (INTVAL (locality) == 1 && INTVAL (write) == 0)
> +    return GEN_INT (INTVAL (write) + 2);
> +
> +  /* store.  */
> +  if (INTVAL (write) == 1)
> +    return GEN_INT (INTVAL (write) + 7);
> +
> +  /* load.  */
> +  if (INTVAL (write) == 0)
> +    return GEN_INT (INTVAL (write));
> +
> +  gcc_unreachable ();
> +}
> +
>   /* Initialize the GCC target structure.  */
>   #undef TARGET_ASM_ALIGNED_HI_OP
>   #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
> index 7eaa9ab66e3..be247164eb4 100644
> --- a/gcc/config/loongarch/loongarch.md
> +++ b/gcc/config/loongarch/loongarch.md
> @@ -3201,6 +3201,29 @@ (define_expand "untyped_call"
>   ;;  ....................
>   ;;
>   
> +(define_insn "prefetch"
> +  [(prefetch (match_operand 0 "address_operand" "p")
> +	     (match_operand 1 "const_int_operand" "n")
> +	     (match_operand 2 "const_int_operand" "n"))]
> +  ""
> +{
> +  operands[1] = loongarch_prefetch_cookie (operands[1], operands[2]);
> +  return "preld\t%1,%a0";
> +}
> +  [(set_attr "type" "prefetch")])
> +
> +(define_insn "*prefetch_indexed_<mode>"
> +  [(prefetch (plus:P (match_operand 0 "register_operand" "r")
> +		     (match_operand 1 "register_operand" "r"))
> +	     (match_operand 2 "const_int_operand" "n")
> +	     (match_operand 3 "const_int_operand" "n"))]
> +  ""
> +{
> +  operands[2] = loongarch_prefetch_cookie (operands[2], operands[3]);
> +  return "preldx\t%2,%1,%0";
> +}
> +  [(set_attr "type" "prefetchx")])
> +
>   (define_insn "nop"
>     [(const_int 0)]
>     ""


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-10-29  7:40 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-29  7:05 [PATCH v1 0/2] Optimize immediate load. Add prefetch insns Lulu Cheng
2022-10-29  7:05 ` [PATCH v1 1/2] LoongArch: Optimize immediate load Lulu Cheng
2022-10-29  7:05 ` [PATCH v1 2/2] LoongArch: Add prefetch insns Lulu Cheng
2022-10-29  7:40   ` Lulu Cheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).