* [PATCH v1 1/2] LoongArch: Optimize immediate load.
2022-10-29 7:05 [PATCH v1 0/2] Optimize immediate load. Add prefetch insns Lulu Cheng
@ 2022-10-29 7:05 ` Lulu Cheng
2022-10-29 7:05 ` [PATCH v1 2/2] LoongArch: Add prefetch insns Lulu Cheng
1 sibling, 0 replies; 4+ messages in thread
From: Lulu Cheng @ 2022-10-29 7:05 UTC (permalink / raw)
To: gcc-patches; +Cc: xry111, i, xuchenghua, Lulu Cheng
Fix an issue where the four instructions that load a 64-bit immediate
were not hoisted out of loops.
gcc/ChangeLog:
* config/loongarch/constraints.md (x): New constraint.
* config/loongarch/loongarch.cc (struct loongarch_integer_op):
Define a new member curr_value, that records the value of
the number stored in the destination register immediately
after the current instruction has run.
(loongarch_build_integer): Add a method that loads bits 32-63 of
the immediate.
(loongarch_move_integer): Same as above.
* config/loongarch/loongarch.h (HWIT_UC_0xFFFFFFFF): New macro.
(HI32_OPERAND): Likewise.
* config/loongarch/loongarch.md (load_hi32): New template.
* config/loongarch/predicates.md (const_hi32_operand): Determines
whether the value is an immediate in which only the high 32 bits
may be set.
(hi32_mask_operand): Matches the mask 0xffffffff (the low 32 bits).
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/imm-load.c: New test.
---
gcc/config/loongarch/constraints.md | 7 +-
gcc/config/loongarch/loongarch.cc | 95 ++++++++++++-------
gcc/config/loongarch/loongarch.h | 6 ++
gcc/config/loongarch/loongarch.md | 26 +++++
gcc/config/loongarch/predicates.md | 8 ++
gcc/testsuite/gcc.target/loongarch/imm-load.c | 25 +++++
6 files changed, 133 insertions(+), 34 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c
diff --git a/gcc/config/loongarch/constraints.md b/gcc/config/loongarch/constraints.md
index 43cb7b5f0f5..1dcf09ce5eb 100644
--- a/gcc/config/loongarch/constraints.md
+++ b/gcc/config/loongarch/constraints.md
@@ -46,7 +46,7 @@
;; "u" "A signed 52bit constant and low 32-bit is zero (for logic instructions)"
;; "v" "A signed 64-bit constant and low 44-bit is zero (for logic instructions)."
;; "w" "Matches any valid memory."
-;; "x" <-----unused
+;; "x" "A signed 64-bit constant and low 32-bit is zero (for logic instructions)."
;; "y" <-----unused
;; "z" FCC_REGS
;; "A" <-----unused
@@ -139,6 +139,11 @@ (define_constraint "v"
(and (match_code "const_int")
(match_test "LU52I_OPERAND (ival)")))
+(define_constraint "x"
+ "A signed 64-bit constant and low 32-bit is zero (for logic instructions)."
+ (and (match_code "const_int")
+ (match_test "HI32_OPERAND (ival)")))
+
(define_register_constraint "z" "FCC_REGS"
"A floating-point condition code register.")
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index f54c233f90c..5e8cd293645 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -139,6 +139,9 @@ struct loongarch_address_info
METHOD_LU52I:
Load 52-63 bit of the immediate number.
+ METHOD_LD_HI32:
+ Load 32-63 bit of the immediate number.
+
METHOD_INSV:
immediate like 0xfff00000fffffxxx
*/
@@ -147,13 +150,18 @@ enum loongarch_load_imm_method
METHOD_NORMAL,
METHOD_LU32I,
METHOD_LU52I,
+ METHOD_LD_HI32,
METHOD_INSV
};
struct loongarch_integer_op
{
enum rtx_code code;
+ /* The immediate operand of the current load instruction. */
HOST_WIDE_INT value;
+ /* The value held in the destination register immediately after the
+ current instruction has run. */
+ HOST_WIDE_INT curr_value;
enum loongarch_load_imm_method method;
};
@@ -1474,24 +1482,27 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
{
/* The value of the lower 32 bit be loaded with one instruction.
lu12i.w. */
- codes[0].code = UNKNOWN;
- codes[0].method = METHOD_NORMAL;
- codes[0].value = low_part;
+ codes[cost].code = UNKNOWN;
+ codes[cost].method = METHOD_NORMAL;
+ codes[cost].value = low_part;
+ codes[cost].curr_value = low_part;
cost++;
}
else
{
/* lu12i.w + ior. */
- codes[0].code = UNKNOWN;
- codes[0].method = METHOD_NORMAL;
- codes[0].value = low_part & ~(IMM_REACH - 1);
+ codes[cost].code = UNKNOWN;
+ codes[cost].method = METHOD_NORMAL;
+ codes[cost].value = low_part & ~(IMM_REACH - 1);
+ codes[cost].curr_value = codes[cost].value;
cost++;
HOST_WIDE_INT iorv = low_part & (IMM_REACH - 1);
if (iorv != 0)
{
- codes[1].code = IOR;
- codes[1].method = METHOD_NORMAL;
- codes[1].value = iorv;
+ codes[cost].code = IOR;
+ codes[cost].method = METHOD_NORMAL;
+ codes[cost].value = iorv;
+ codes[cost].curr_value = low_part;
cost++;
}
}
@@ -1514,23 +1525,34 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
{
codes[cost].method = METHOD_LU52I;
codes[cost].value = value & LU52I_B;
- return cost + 1;
+ codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+ 0xfffffffffffff);
+ return cost + 1;
}
- codes[cost].method = METHOD_LU32I;
- codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
- cost++;
-
- /* Determine whether the 52-61 bits are sign-extended from the low order,
- and if not, load the 52-61 bits. */
- if (!lu52i[(value & (HOST_WIDE_INT_1U << 51)) >> 51])
+ if (lu52i[sign51])
{
- codes[cost].method = METHOD_LU52I;
- codes[cost].value = value & LU52I_B;
+ /* Determine whether the 52-63 bits are sign-extended from the low order.
+ If so, the 52-63 bits of the immediate number do not need to be loaded.
+ */
+ codes[cost].method = METHOD_LU32I;
+ codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
+ codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+ 0xffffffff);
+ cost++;
+ }
+ else
+ {
+ /* If the higher 32 bits of the 64bit immediate need to be loaded
+ separately by two instructions, a pseudo instruction load_hi32
+ is used to load them; it is split into the real pair later. */
+ codes[cost].method = METHOD_LD_HI32;
+ codes[cost].value = value & 0xffffffff00000000;
+ codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+ 0xffffffff);
cost++;
}
}
-
gcc_assert (cost <= LARCH_MAX_INTEGER_OPS);
return cost;
@@ -2910,30 +2932,37 @@ loongarch_move_integer (rtx temp, rtx dest, unsigned HOST_WIDE_INT value)
else
x = force_reg (mode, x);
+ set_unique_reg_note (get_last_insn (), REG_EQUAL, GEN_INT (codes[i-1].curr_value));
+
switch (codes[i].method)
{
case METHOD_NORMAL:
+ /* mov or ior. */
x = gen_rtx_fmt_ee (codes[i].code, mode, x,
GEN_INT (codes[i].value));
break;
case METHOD_LU32I:
- emit_insn (
- gen_rtx_SET (x,
- gen_rtx_IOR (DImode,
- gen_rtx_ZERO_EXTEND (
- DImode, gen_rtx_SUBREG (SImode, x, 0)),
- GEN_INT (codes[i].value))));
+ gcc_assert (mode == DImode);
+ /* lu32i_d */
+ x = gen_rtx_IOR (mode, gen_rtx_ZERO_EXTEND (mode,
+ gen_rtx_SUBREG (SImode, x, 0)),
+ GEN_INT (codes[i].value));
break;
case METHOD_LU52I:
- emit_insn (gen_lu52i_d (x, x, GEN_INT (0xfffffffffffff),
- GEN_INT (codes[i].value)));
+ gcc_assert (mode == DImode);
+ /* lu52i_d */
+ x = gen_rtx_IOR (mode, gen_rtx_AND (mode, x, GEN_INT (0xfffffffffffff)),
+ GEN_INT (codes[i].value));
break;
- case METHOD_INSV:
- emit_insn (
- gen_rtx_SET (gen_rtx_ZERO_EXTRACT (DImode, x, GEN_INT (20),
- GEN_INT (32)),
- gen_rtx_REG (DImode, 0)));
+ case METHOD_LD_HI32:
+ /* Load the high 32 bits of the immediate number. */
+ gcc_assert (mode == DImode);
+ /* load_hi32 */
+ x = gen_rtx_IOR (mode, gen_rtx_AND (mode, x, GEN_INT (0xffffffff)),
+ GEN_INT (codes[i].value));
break;
+ case METHOD_INSV:
+ /* It is not currently implemented. */
default:
gcc_unreachable ();
}
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index f4a9c329fef..cfc046f546e 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -605,6 +605,12 @@ enum reg_class
#define LU52I_OPERAND(VALUE) \
(((VALUE) | (HWIT_UC_0xFFF << 52)) == (HWIT_UC_0xFFF << 52))
+/* True if VALUE can be loaded into a register using load_hi32. */
+
+#define HWIT_UC_0xFFFFFFFF HOST_WIDE_INT_UC(0xffffffff)
+#define HI32_OPERAND(VALUE) \
+ (((VALUE) | (HWIT_UC_0xFFFFFFFF << 32)) == (HWIT_UC_0xFFFFFFFF << 32))
+
/* Return a value X with the low 12 bits clear, and such that
VALUE - X is a signed 12-bit value. */
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 214b14bddd3..7eaa9ab66e3 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -1882,6 +1882,32 @@ (define_expand "mov<mode>cc"
DONE;
})
+(define_insn_and_split "load_hi32"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (ior:DI
+ (and:DI (match_operand:DI 1 "register_operand" "0")
+ (match_operand 2 "hi32_mask_operand"))
+ (match_operand 3 "const_hi32_operand" "x")))]
+ "TARGET_64BIT"
+ "#"
+ ""
+ [(set (match_dup 0)
+ (ior:DI
+ (zero_extend:DI
+ (subreg:SI (match_dup 1) 0))
+ (match_dup 4)))
+ (set (match_dup 0)
+ (ior:DI
+ (and:DI (match_dup 0)
+ (match_dup 6))
+ (match_dup 5)))]
+{
+ operands[4] = GEN_INT (INTVAL (operands[3]) << 12 >> 12);
+ operands[5] = GEN_INT (INTVAL (operands[3]) & 0xfff0000000000000);
+ operands[6] = GEN_INT (0xfffffffffffff);
+}
+ [(set_attr "insn_count" "2")])
+
(define_insn "lu32i_d"
[(set (match_operand:DI 0 "register_operand" "=r")
(ior:DI
diff --git a/gcc/config/loongarch/predicates.md b/gcc/config/loongarch/predicates.md
index 8bd0c1376c9..29d81ff0250 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -35,6 +35,10 @@ (define_predicate "const_lu52i_operand"
(and (match_code "const_int")
(match_test "LU52I_OPERAND (INTVAL (op))")))
+(define_predicate "const_hi32_operand"
+ (and (match_code "const_int")
+ (match_test "HI32_OPERAND (INTVAL (op))")))
+
(define_predicate "const_arith_operand"
(and (match_code "const_int")
(match_test "IMM12_OPERAND (INTVAL (op))")))
@@ -103,6 +107,10 @@ (define_predicate "lu52i_mask_operand"
(and (match_code "const_int")
(match_test "UINTVAL (op) == 0xfffffffffffff")))
+(define_predicate "hi32_mask_operand"
+ (and (match_code "const_int")
+ (match_test "UINTVAL (op) == 0xffffffff")))
+
(define_predicate "low_bitmask_operand"
(and (match_code "const_int")
(match_test "low_bitmask_len (mode, INTVAL (op)) > 12")))
diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load.c b/gcc/testsuite/gcc.target/loongarch/imm-load.c
new file mode 100644
index 00000000000..91ceb33d058
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/imm-load.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -O2 -fdump-rtl-loop2_invariant" } */
+
+extern long long b[10];
+static inline long long
+repeat_bytes (void)
+{
+ long long r = 0x0101010101010101;
+
+ return r;
+}
+
+static inline long long
+highbit_mask (long long m)
+{
+ return m & repeat_bytes ();
+}
+
+void test(long long *a)
+{
+ for (int i = 0; i < 10; i++)
+ b[i] = highbit_mask (a[i]);
+
+}
+/* { dg-final { scan-rtl-dump-times "moved without introducing a new temporary register" 4 "loop2_invariant" } } */
--
2.31.1
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH v1 2/2] LoongArch: Add prefetch insns.
2022-10-29 7:05 [PATCH v1 0/2] Optimize immediate load. Add prefetch insns Lulu Cheng
2022-10-29 7:05 ` [PATCH v1 1/2] LoongArch: Optimize immediate load Lulu Cheng
@ 2022-10-29 7:05 ` Lulu Cheng
2022-10-29 7:40 ` Lulu Cheng
1 sibling, 1 reply; 4+ messages in thread
From: Lulu Cheng @ 2022-10-29 7:05 UTC (permalink / raw)
To: gcc-patches; +Cc: xry111, i, xuchenghua, Lulu Cheng, xujiahao
Co-Authored-By: xujiahao <xujiahao@loongson.cn>
gcc/ChangeLog:
* config/loongarch/loongarch-def.c: Initial number of parallel prefetch.
* config/loongarch/loongarch-protos.h (loongarch_prefetch_cookie):
Function declaration.
* config/loongarch/loongarch-tune.h (struct loongarch_cache):
Define number of parallel prefetch.
* config/loongarch/loongarch.cc (loongarch_option_override_internal):
Set up parameters to be used in prefetching algorithm.
(loongarch_prefetch_cookie): Select load or store based on the value of write.
* config/loongarch/loongarch.md (prefetch): New template.
(*prefetch_indexed_<mode>): New template.
---
gcc/config/loongarch/loongarch-def.c | 2 ++
gcc/config/loongarch/loongarch-protos.h | 1 +
gcc/config/loongarch/loongarch-tune.h | 1 +
gcc/config/loongarch/loongarch.cc | 48 +++++++++++++++++++++++++
gcc/config/loongarch/loongarch.md | 23 ++++++++++++
5 files changed, 75 insertions(+)
diff --git a/gcc/config/loongarch/loongarch-def.c b/gcc/config/loongarch/loongarch-def.c
index cbf995d81b5..80ab10a52a8 100644
--- a/gcc/config/loongarch/loongarch-def.c
+++ b/gcc/config/loongarch/loongarch-def.c
@@ -62,11 +62,13 @@ loongarch_cpu_cache[N_TUNE_TYPES] = {
.l1d_line_size = 64,
.l1d_size = 64,
.l2d_size = 256,
+ .simultaneous_prefetches = 4,
},
[CPU_LA464] = {
.l1d_line_size = 64,
.l1d_size = 64,
.l2d_size = 256,
+ .simultaneous_prefetches = 4,
},
};
diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h
index 77b2217247d..489525b520e 100644
--- a/gcc/config/loongarch/loongarch-protos.h
+++ b/gcc/config/loongarch/loongarch-protos.h
@@ -179,5 +179,6 @@ extern tree loongarch_builtin_decl (unsigned int, bool);
extern rtx loongarch_expand_builtin (tree, rtx, rtx subtarget ATTRIBUTE_UNUSED,
machine_mode, int);
extern tree loongarch_build_builtin_va_list (void);
+extern rtx loongarch_prefetch_cookie (rtx, rtx);
#endif /* ! GCC_LOONGARCH_PROTOS_H */
diff --git a/gcc/config/loongarch/loongarch-tune.h b/gcc/config/loongarch/loongarch-tune.h
index 6f3530f5c02..8e3eb29472b 100644
--- a/gcc/config/loongarch/loongarch-tune.h
+++ b/gcc/config/loongarch/loongarch-tune.h
@@ -45,6 +45,7 @@ struct loongarch_cache {
int l1d_line_size; /* bytes */
int l1d_size; /* KiB */
int l2d_size; /* kiB */
+ int simultaneous_prefetches; /* number of parallel prefetch */
};
#endif /* LOONGARCH_TUNE_H */
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 5e8cd293645..d663afe434d 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -63,6 +63,7 @@ along with GCC; see the file COPYING3. If not see
#include "context.h"
#include "builtins.h"
#include "rtl-iter.h"
+#include "params.h"
/* This file should be included last. */
#include "target-def.h"
@@ -6126,6 +6127,33 @@ loongarch_option_override_internal (struct gcc_options *opts)
if (loongarch_branch_cost == 0)
loongarch_branch_cost = loongarch_cost->branch_cost;
+ /* Set up parameters to be used in prefetching algorithm. */
+ maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
+ loongarch_cpu_cache[LARCH_ACTUAL_TUNE].simultaneous_prefetches,
+ opts->x_param_values,
+ opts_set->x_param_values);
+
+ maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+ loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l1d_line_size,
+ opts->x_param_values,
+ opts_set->x_param_values);
+
+ maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+ loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l1d_size,
+ opts->x_param_values,
+ opts_set->x_param_values);
+
+ maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+ loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l2d_size,
+ opts->x_param_values,
+ opts_set->x_param_values);
+
+ /* Enable sw prefetching at -O3 and higher. */
+ if (opts->x_flag_prefetch_loop_arrays < 0
+ && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
+ && !opts->x_optimize_size)
+ opts->x_flag_prefetch_loop_arrays = 1;
+
if (TARGET_DIRECT_EXTERN_ACCESS && flag_shlib)
error ("%qs cannot be used for compiling a shared library",
"-mdirect-extern-access");
@@ -6506,6 +6534,26 @@ loongarch_asan_shadow_offset (void)
return TARGET_64BIT ? (HOST_WIDE_INT_1 << 46) : 0;
}
+/* LoongArch only implements preld hint=0 (prefetch for load) and hint=8
+ (prefetch for store), other hint just scale to hint = 0 and hint = 8. */
+
+rtx
+loongarch_prefetch_cookie (rtx write, rtx locality)
+{
+ if (INTVAL (locality) == 1 && INTVAL (write) == 0)
+ return GEN_INT (INTVAL (write) + 2);
+
+ /* store. */
+ if (INTVAL (write) == 1)
+ return GEN_INT (INTVAL (write) + 7);
+
+ /* load. */
+ if (INTVAL (write) == 0)
+ return GEN_INT (INTVAL (write));
+
+ gcc_unreachable ();
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 7eaa9ab66e3..be247164eb4 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -3201,6 +3201,29 @@ (define_expand "untyped_call"
;; ....................
;;
+(define_insn "prefetch"
+ [(prefetch (match_operand 0 "address_operand" "p")
+ (match_operand 1 "const_int_operand" "n")
+ (match_operand 2 "const_int_operand" "n"))]
+ ""
+{
+ operands[1] = loongarch_prefetch_cookie (operands[1], operands[2]);
+ return "preld\t%1,%a0";
+}
+ [(set_attr "type" "prefetch")])
+
+(define_insn "*prefetch_indexed_<mode>"
+ [(prefetch (plus:P (match_operand 0 "register_operand" "r")
+ (match_operand 1 "register_operand" "r"))
+ (match_operand 2 "const_int_operand" "n")
+ (match_operand 3 "const_int_operand" "n"))]
+ ""
+{
+ operands[2] = loongarch_prefetch_cookie (operands[2], operands[3]);
+ return "preldx\t%2,%1,%0";
+}
+ [(set_attr "type" "prefetchx")])
+
(define_insn "nop"
[(const_int 0)]
""
--
2.31.1
^ permalink raw reply [flat|nested] 4+ messages in thread