From: Haochen Jiang <haochen.jiang@intel.com>
To: gcc-patches@gcc.gnu.org
Cc: hongtao.liu@intel.com, ubizjak@gmail.com
Subject: [PATCH 2/2] Align tight&hot loop without considering max skipping bytes.
Date: Wed, 15 May 2024 11:04:29 +0800 [thread overview]
Message-ID: <20240515030429.2575440-3-haochen.jiang@intel.com> (raw)
In-Reply-To: <20240515030429.2575440-1-haochen.jiang@intel.com>
From: liuhongt <hongtao.liu@intel.com>
When a hot loop is small enough to fit into one cacheline, we should align
the loop to ceil_log2 (loop_size) without considering the maximum
skip bytes. It will help code prefetch.
gcc/ChangeLog:
* config/i386/i386.cc (ix86_avoid_jump_mispredicts): Change
gen_pad to gen_max_skip_align.
(ix86_align_loops): New function.
(ix86_reorg): Call ix86_align_loops.
* config/i386/i386.md (pad): Rename to ..
(max_skip_align): .. this, and accept 2 operands for align and
skip.
---
gcc/config/i386/i386.cc | 148 +++++++++++++++++++++++++++++++++++++++-
gcc/config/i386/i386.md | 10 +--
2 files changed, 153 insertions(+), 5 deletions(-)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e67e5f62533..c617091c8e1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23137,7 +23137,7 @@ ix86_avoid_jump_mispredicts (void)
if (dump_file)
fprintf (dump_file, "Padding insn %i by %i bytes!\n",
INSN_UID (insn), padsize);
- emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
+ emit_insn_before (gen_max_skip_align (GEN_INT (4), GEN_INT (padsize)), insn);
}
}
}
@@ -23410,6 +23410,150 @@ ix86_split_stlf_stall_load ()
}
}
+/* When a hot loop fits into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+ basic_block bb;
+
+ /* Don't do this when we don't know cache line size. */
+ if (ix86_cost->prefetch_block == 0)
+ return;
+
+ loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+ profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ rtx_insn *label = BB_HEAD (bb);
+ bool has_fallthru = 0;
+ edge e;
+ edge_iterator ei;
+
+ if (!LABEL_P (label))
+ continue;
+
+ profile_count fallthru_count = profile_count::zero ();
+ profile_count branch_count = profile_count::zero ();
+
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ {
+ if (e->flags & EDGE_FALLTHRU)
+ has_fallthru = 1, fallthru_count += e->count ();
+ else
+ branch_count += e->count ();
+ }
+
+ if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+ continue;
+
+ if (bb->loop_father
+ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+ && (has_fallthru
+ ? (!(single_succ_p (bb)
+ && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+ && optimize_bb_for_speed_p (bb)
+ && branch_count + fallthru_count > count_threshold
+ && (branch_count > fallthru_count * param_align_loop_iterations))
+		    /* In case there's no fallthru for the loop,
+		       nops inserted won't be executed.  */
+ : (branch_count > count_threshold
+ || (bb->count > bb->prev_bb->count * 10
+ && (bb->prev_bb->count
+ <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
+ {
+ rtx_insn* insn, *end_insn;
+ HOST_WIDE_INT size = 0;
+ bool padding_p = true;
+ basic_block tbb = bb;
+ unsigned cond_branch_num = 0;
+ bool detect_tight_loop_p = false;
+
+ for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+ i++, tbb = tbb->next_bb)
+ {
+ /* Only handle continuous cfg layout. */
+ if (bb->loop_father != tbb->loop_father)
+ {
+ padding_p = false;
+ break;
+ }
+
+ FOR_BB_INSNS (tbb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+ size += ix86_min_insn_size (insn);
+
+ /* We don't know size of inline asm.
+ Don't align loop for call. */
+ if (asm_noperands (PATTERN (insn)) >= 0
+ || CALL_P (insn))
+ {
+ size = -1;
+ break;
+ }
+ }
+
+ if (size == -1 || size > ix86_cost->prefetch_block)
+ {
+ padding_p = false;
+ break;
+ }
+
+ FOR_EACH_EDGE (e, ei, tbb->succs)
+ {
+ /* It could be part of the loop. */
+ if (e->dest == bb)
+ {
+ detect_tight_loop_p = true;
+ break;
+ }
+ }
+
+ if (detect_tight_loop_p)
+ break;
+
+ end_insn = BB_END (tbb);
+ if (JUMP_P (end_insn))
+ {
+ /* For decoded icache:
+ 1. Up to two branches are allowed per Way.
+ 2. A non-conditional branch is the last micro-op in a Way.
+ */
+ if (onlyjump_p (end_insn)
+ && (any_uncondjump_p (end_insn)
+ || single_succ_p (tbb)))
+ {
+ padding_p = false;
+ break;
+ }
+ else if (++cond_branch_num >= 2)
+ {
+ padding_p = false;
+ break;
+ }
+ }
+
+ }
+
+ if (padding_p && detect_tight_loop_p)
+ {
+ emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
+ GEN_INT (0)), label);
+ /* End of function. */
+ if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
+ break;
+ /* Skip bb which already fits into one cacheline. */
+ bb = tbb;
+ }
+ }
+ }
+
+ loop_optimizer_finalize ();
+ free_dominance_info (CDI_DOMINATORS);
+}
+
/* Implement machine specific optimizations. We implement padding of returns
for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
static void
@@ -23433,6 +23577,8 @@ ix86_reorg (void)
#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
if (TARGET_FOUR_JUMP_LIMIT)
ix86_avoid_jump_mispredicts ();
+
+ ix86_align_loops ();
#endif
}
}
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 764bfe20ff2..686de0bf2ff 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -19150,16 +19150,18 @@
(set_attr "length_immediate" "0")
(set_attr "modrm" "0")])
-;; Pad to 16-byte boundary, max skip in op0. Used to avoid
+;; Pad to 1 << op0 byte boundary, max skip in op1. Used to avoid
;; branch prediction penalty for the third jump in a 16-byte
;; block on K8.
+;; It is also used to align tight loops which fit into one cacheline.
+;; It can help code prefetch and reduce DSB misses.
-(define_insn "pad"
- [(unspec_volatile [(match_operand 0)] UNSPECV_ALIGN)]
+(define_insn "max_skip_align"
+ [(unspec_volatile [(match_operand 0) (match_operand 1)] UNSPECV_ALIGN)]
""
{
#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
- ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, 4, (int)INTVAL (operands[0]));
+ ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, (int)INTVAL (operands[0]), (int)INTVAL (operands[1]));
#else
/* It is tempting to use ASM_OUTPUT_ALIGN here, but we don't want to do that.
The align insn is used to avoid 3 jump instructions in the row to improve
--
2.31.1
next prev parent reply other threads:[~2024-05-15 3:04 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-05-15 3:04 [PATCH 0/2] Align tight loops to solve cross cacheline issue Haochen Jiang
2024-05-15 3:04 ` [PATCH 1/2] Adjust generic loop alignment from 16:11:8 to 16 for Intel processors Haochen Jiang
2024-05-15 3:04 ` Haochen Jiang [this message]
2024-05-15 3:30 ` [PATCH 0/2] Align tight loops to solve cross cacheline issue Jiang, Haochen
2024-05-20 3:15 ` Hongtao Liu
2024-05-27 1:33 ` Hongtao Liu
2024-05-29 3:30 ` Jiang, Haochen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240515030429.2575440-3-haochen.jiang@intel.com \
--to=haochen.jiang@intel.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=hongtao.liu@intel.com \
--cc=ubizjak@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).