* [PATCH] [APX] Support Intel APX PUSH2POP2
@ 2023-10-10 6:48 Hongyu Wang
2023-10-12 1:49 ` Hongtao Liu
0 siblings, 1 reply; 2+ messages in thread
From: Hongyu Wang @ 2023-10-10 6:48 UTC (permalink / raw)
To: gcc-patches; +Cc: ubizjak, hongtao.liu, Mo, Zewei, Hu Lin1
From: "Mo, Zewei" <zewei.mo@intel.com>
Hi,
The Intel APX PUSH2POP2 feature has been released in [1].
This feature requires the stack to be 16-byte aligned; therefore, in the
prologue/epilogue, a standalone push/pop is emitted before any
push2/pop2 if the stack is not 16-byte aligned.
Also, the current implementation only supports push2/pop2 in the
function prologue/epilogue, for saving and restoring callee-saved registers.
Bootstrapped/regtested on x86-64-pc-linux-gnu{-m32,} and sde.
OK for master?
[1].https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html.
gcc/ChangeLog:
* config/i386/i386.cc (gen_push2): New function to emit push2
and adjust cfa offset.
(ix86_pro_and_epilogue_can_use_push2pop2): New function to determine
whether push2/pop2 can be used.
(ix86_compute_frame_layout): Adjust preferred stack boundary
and stack alignment needed for push2/pop2.
(ix86_emit_save_regs): Emit push2 when available.
(ix86_emit_restore_reg_using_pop2): New function to emit pop2
and adjust cfa info.
(ix86_emit_restore_regs_using_pop2): New function to loop
through the saved regs and call above.
(ix86_expand_epilogue): Call ix86_emit_restore_regs_using_pop2
when push2pop2 available.
* config/i386/i386.md (push2_di): New pattern for push2.
(pop2_di): Likewise for pop2.
gcc/testsuite/ChangeLog:
* gcc.target/i386/apx-push2pop2-1.c: New test.
* gcc.target/i386/apx-push2pop2_force_drap-1.c: Likewise.
* gcc.target/i386/apx-push2pop2_interrupt-1.c: Likewise.
Co-authored-by: Hu Lin1 <lin1.hu@intel.com>
Co-authored-by: Hongyu Wang <hongyu.wang@intel.com>
---
gcc/config/i386/i386.cc | 252 ++++++++++++++++--
gcc/config/i386/i386.md | 26 ++
.../gcc.target/i386/apx-push2pop2-1.c | 45 ++++
.../i386/apx-push2pop2_force_drap-1.c | 29 ++
.../i386/apx-push2pop2_interrupt-1.c | 28 ++
5 files changed, 365 insertions(+), 15 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 6244f64a619..8251b67e2d6 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -6473,6 +6473,26 @@ gen_pop (rtx arg)
stack_pointer_rtx)));
}
+/* Generate a "push2" pattern that pushes REG1 and REG2 to MEM. */
+rtx
+gen_push2 (rtx mem, rtx reg1, rtx reg2)
+{
+ struct machine_function *m = cfun->machine;
+ const int offset = UNITS_PER_WORD * 2;
+
+ if (m->fs.cfa_reg == stack_pointer_rtx)
+ m->fs.cfa_offset += offset;
+ m->fs.sp_offset += offset;
+
+ if (REG_P (reg1) && GET_MODE (reg1) != word_mode)
+ reg1 = gen_rtx_REG (word_mode, REGNO (reg1));
+
+ if (REG_P (reg2) && GET_MODE (reg2) != word_mode)
+ reg2 = gen_rtx_REG (word_mode, REGNO (reg2));
+
+ return gen_push2_di (mem, reg1, reg2);
+}
+
/* Return >= 0 if there is an unused call-clobbered register available
for the entire function. */
@@ -6714,6 +6734,18 @@ get_probe_interval (void)
#define SPLIT_STACK_AVAILABLE 256
+/* Helper function to determine whether push2/pop2 can be used in prologue or
+ epilogue for register save/restore. */
+static bool
+ix86_pro_and_epilogue_can_use_push2pop2 (int nregs)
+{
+ int aligned = cfun->machine->fs.sp_offset % 16 == 0;
+ return TARGET_APX_PUSH2POP2
+ && !cfun->machine->frame.save_regs_using_mov
+ && cfun->machine->func_type == TYPE_NORMAL
+ && (nregs + aligned) >= 3;
+}
+
/* Fill structure ix86_frame about frame of currently computed function. */
static void
@@ -6771,16 +6803,20 @@ ix86_compute_frame_layout (void)
Darwin's ABI specifies 128b alignment for both 32 and 64 bit variants
at call sites, including profile function calls.
- */
- if (((TARGET_64BIT_MS_ABI || TARGET_MACHO)
- && crtl->preferred_stack_boundary < 128)
- && (!crtl->is_leaf || cfun->calls_alloca != 0
- || ix86_current_function_calls_tls_descriptor
- || (TARGET_MACHO && crtl->profile)
- || ix86_incoming_stack_boundary < 128))
+
+ For APX push2/pop2, the stack also requires 128b alignment. */
+ if ((ix86_pro_and_epilogue_can_use_push2pop2 (frame->nregs)
+ && crtl->preferred_stack_boundary < 128)
+ || (((TARGET_64BIT_MS_ABI || TARGET_MACHO)
+ && crtl->preferred_stack_boundary < 128)
+ && (!crtl->is_leaf || cfun->calls_alloca != 0
+ || ix86_current_function_calls_tls_descriptor
+ || (TARGET_MACHO && crtl->profile)
+ || ix86_incoming_stack_boundary < 128)))
{
crtl->preferred_stack_boundary = 128;
- crtl->stack_alignment_needed = 128;
+ if (crtl->stack_alignment_needed < 128)
+ crtl->stack_alignment_needed = 128;
}
stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
@@ -7291,12 +7327,85 @@ ix86_emit_save_regs (void)
int regno;
rtx_insn *insn;
- for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
- {
- insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
- RTX_FRAME_RELATED_P (insn) = 1;
- }
+ if (!TARGET_APX_PUSH2POP2 || cfun->machine->func_type != TYPE_NORMAL)
+ {
+ for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+ }
+ else
+ {
+ int regno_list[2];
+ regno_list[0] = regno_list[1] = -1;
+ int loaded_regnum = 0;
+ bool aligned = cfun->machine->fs.sp_offset % 16 == 0;
+
+ for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ if (aligned)
+ {
+ regno_list[loaded_regnum++] = regno;
+ if (loaded_regnum == 2)
+ {
+ gcc_assert (regno_list[0] != -1
+ && regno_list[1] != -1
+ && regno_list[0] != regno_list[1]);
+ const int offset = UNITS_PER_WORD * 2;
+ rtx mem = gen_rtx_MEM (TImode,
+ gen_rtx_PRE_DEC (Pmode,
+ stack_pointer_rtx));
+ insn = emit_insn (gen_push2 (mem,
+ gen_rtx_REG (word_mode,
+ regno_list[0]),
+ gen_rtx_REG (word_mode,
+ regno_list[1])));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ rtx dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (3));
+
+ for (int i = 0; i < 2; i++)
+ {
+ rtx dwarf_reg = gen_rtx_REG (word_mode,
+ regno_list[i]);
+ rtx sp_offset = plus_constant (Pmode,
+ stack_pointer_rtx,
+ + UNITS_PER_WORD
+ * (1 - i));
+ rtx tmp = gen_rtx_SET (gen_frame_mem (DImode,
+ sp_offset),
+ dwarf_reg);
+ RTX_FRAME_RELATED_P (tmp) = 1;
+ XVECEXP (dwarf, 0, i + 1) = tmp;
+ }
+ rtx sp_tmp = gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode,
+ stack_pointer_rtx,
+ -offset));
+ RTX_FRAME_RELATED_P (sp_tmp) = 1;
+ XVECEXP (dwarf, 0, 0) = sp_tmp;
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
+
+ loaded_regnum = 0;
+ regno_list[0] = regno_list[1] = -1;
+ }
+ }
+ else
+ {
+ insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ aligned = true;
+ }
+ }
+ if (loaded_regnum == 1)
+ {
+ insn = emit_insn (gen_push (gen_rtx_REG (word_mode,
+ regno_list[0])));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+ }
}
/* Emit a single register save at CFA - CFA_OFFSET. */
@@ -9180,6 +9289,74 @@ ix86_emit_restore_reg_using_pop (rtx reg)
}
}
+/* Emit code to restore REG1 and REG2 using a POP2 insn. */
+static void
+ix86_emit_restore_reg_using_pop2 (rtx reg1, rtx reg2)
+{
+ struct machine_function *m = cfun->machine;
+ const int offset = UNITS_PER_WORD * 2;
+
+ rtx mem = gen_rtx_MEM (TImode, gen_rtx_POST_INC (Pmode,
+ stack_pointer_rtx));
+ rtx_insn *insn = emit_insn (gen_pop2_di (reg1, mem, reg2));
+
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ rtx dwarf = NULL_RTX;
+ dwarf = alloc_reg_note (REG_CFA_RESTORE, reg1, dwarf);
+ dwarf = alloc_reg_note (REG_CFA_RESTORE, reg2, dwarf);
+ REG_NOTES (insn) = dwarf;
+ m->fs.sp_offset -= offset;
+
+ if (m->fs.cfa_reg == crtl->drap_reg
+ && (REGNO (reg1) == REGNO (crtl->drap_reg)
+ || REGNO (reg2) == REGNO (crtl->drap_reg)))
+ {
+ /* Previously we'd represented the CFA as an expression
+ like *(%ebp - 8). We've just popped that value from
+ the stack, which means we need to reset the CFA to
+ the drap register. This will remain until we restore
+ the stack pointer. */
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ REGNO (reg1) == REGNO (crtl->drap_reg) ? reg1 : reg2);
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ /* This means that the DRAP register is valid for addressing too. */
+ m->fs.drap_valid = true;
+ return;
+ }
+
+ if (m->fs.cfa_reg == stack_pointer_rtx)
+ {
+ rtx x = plus_constant (Pmode, stack_pointer_rtx, offset);
+ x = gen_rtx_SET (stack_pointer_rtx, x);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ m->fs.cfa_offset -= offset;
+ }
+
+ /* When the frame pointer is the CFA, and we pop it, we are
+ swapping back to the stack pointer as the CFA. This happens
+ for stack frames that don't allocate other data, so we assume
+ the stack pointer is now pointing at the return address, i.e.
+ the function entry state, which makes the offset be 1 word. */
+ if (reg1 == hard_frame_pointer_rtx || reg2 == hard_frame_pointer_rtx)
+ {
+ m->fs.fp_valid = false;
+ if (m->fs.cfa_reg == hard_frame_pointer_rtx)
+ {
+ m->fs.cfa_reg = stack_pointer_rtx;
+ m->fs.cfa_offset -= offset;
+
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ plus_constant (Pmode, stack_pointer_rtx,
+ m->fs.cfa_offset));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+ }
+}
+
/* Emit code to restore saved registers using POP insns. */
static void
@@ -9192,6 +9369,48 @@ ix86_emit_restore_regs_using_pop (void)
ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
}
+/* Emit code to restore saved registers using POP2 insns. */
+
+static void
+ix86_emit_restore_regs_using_pop2 (void)
+{
+ int regno;
+ int regno_list[2];
+ regno_list[0] = regno_list[1] = -1;
+ int loaded_regnum = 0;
+ bool aligned = cfun->machine->fs.sp_offset % 16 == 0;
+
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
+ {
+ if (aligned)
+ {
+ regno_list[loaded_regnum++] = regno;
+ if (loaded_regnum == 2)
+ {
+ gcc_assert (regno_list[0] != -1
+ && regno_list[1] != -1
+ && regno_list[0] != regno_list[1]);
+
+ ix86_emit_restore_reg_using_pop2 (gen_rtx_REG (word_mode,
+ regno_list[0]),
+ gen_rtx_REG (word_mode,
+ regno_list[1]));
+ loaded_regnum = 0;
+ regno_list[0] = regno_list[1] = -1;
+ }
+ }
+ else
+ {
+ ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
+ aligned = true;
+ }
+ }
+
+ if (loaded_regnum == 1)
+ ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno_list[0]));
+}
+
/* Emit code and notes for the LEAVE instruction. If insn is non-null,
omits the emit and only attaches the notes. */
@@ -9731,7 +9950,10 @@ ix86_expand_epilogue (int style)
m->fs.cfa_reg == stack_pointer_rtx);
}
- ix86_emit_restore_regs_using_pop ();
+ if (TARGET_APX_PUSH2POP2 && m->func_type == TYPE_NORMAL)
+ ix86_emit_restore_regs_using_pop2 ();
+ else
+ ix86_emit_restore_regs_using_pop ();
}
/* If we used a stack pointer and haven't already got rid of it,
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index f390fb5692b..22bd5dde2fa 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -208,6 +208,10 @@ (define_c_enum "unspec" [
;; For insn_callee_abi:
UNSPEC_CALLEE_ABI
+ ;; For PUSH2/POP2 support
+ UNSPEC_APXPUSH2
+ UNSPEC_APXPOP2_LOW
+ UNSPEC_APXPOP2_HIGH
])
(define_c_enum "unspecv" [
@@ -3833,6 +3837,28 @@ (define_insn "*push<mode>"
(set_attr "type" "push,multi")
(set_attr "mode" "SI,TI")])
+(define_insn "push2_di"
+ [(set (match_operand:TI 0 "push_operand" "=<")
+ (unspec:TI [(match_operand:DI 1 "register_operand" "r")
+ (match_operand:DI 2 "register_operand" "r")]
+ UNSPEC_APXPUSH2))]
+ "TARGET_APX_PUSH2POP2"
+ "push2\t%1, %2"
+ [(set_attr "mode" "TI")
+ (set_attr "type" "multi")
+ (set_attr "prefix" "evex")])
+
+(define_insn "pop2_di"
+ [(parallel [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:TI 1 "pop_operand" ">")]
+ UNSPEC_APXPOP2_LOW))
+ (set (match_operand:DI 2 "register_operand" "=r")
+ (unspec:DI [(const_int 0)] UNSPEC_APXPOP2_HIGH))])]
+ "TARGET_APX_PUSH2POP2"
+ "pop2\t%0, %2"
+ [(set_attr "mode" "TI")
+ (set_attr "prefix" "evex")])
+
(define_insn "*pushsf_rex64"
[(set (match_operand:SF 0 "push_operand" "=X,X,X")
(match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,v"))]
diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
new file mode 100644
index 00000000000..c7968d674e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
@@ -0,0 +1,45 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mapxf" } */
+
+extern int bar (int);
+
+void foo ()
+{
+ int a,b,c,d,e,f,i;
+ a = bar (5);
+ b = bar (a);
+ c = bar (b);
+ d = bar (c);
+ e = bar (d);
+ f = bar (e);
+ for (i = 1; i < 10; i++)
+ {
+ a += bar (a + i) + bar (b + i) +
+ bar (c + i) + bar (d + i) +
+ bar (e + i) + bar (f + i);
+ }
+}
+
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 16" 2 } } */
+/* { dg-final { scan-assembler-times "pushq\[^\n\r]*%r15(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 15, -16(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "push2\[\\t \]*\[^\n\r]*%r14\[^\n\r]*%r13\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 32" 2 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 14, -24(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 13, -32(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "push2\[\\t \]*\[^\n\r]*%r12\[^\n\r]*%rbp\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 48" 2 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 12, -40(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 6, -48(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "pushq\[^\n\r]*%rbx(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 56" 2 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 3, -56(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "popq\[^\n\r]*rbx(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "pop2\[\\t \]*\[^\n\r]*%rbp\[^\n\r]*%r12\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_restore 12(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_restore 6(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "pop2\[\\t \]*\[^\n\r]*%r13\[^\n\r]*%r14\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_restore 14(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_restore 13(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "popq\[^\n\r]*%r15(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 8(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c b/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c
new file mode 100644
index 00000000000..38787990288
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mapxf -mforce-drap" } */
+
+#include "apx-push2pop2-1.c"
+
+
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 16" 2 } } */
+/* { dg-final { scan-assembler-times "pushq\[^\n\r]*%r15(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 15, -16(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "push2\[\\t \]*\[^\n\r]*%r14\[^\n\r]*%r13\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 32" 2 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 14, -24(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 13, -32(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "push2\[\\t \]*\[^\n\r]*%r12\[^\n\r]*%rbp\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 48" 2 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 12, -40(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 6, -48(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "pushq\[^\n\r]*%rbx(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 56" 2 } } */
+/* { dg-final { scan-assembler-times ".cfi_offset 3, -56(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "popq\[^\n\r]*rbx(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "pop2\[\\t \]*\[^\n\r]*%rbp\[^\n\r]*%r12\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_restore 12(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_restore 6(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "pop2\[\\t \]*\[^\n\r]*%r13\[^\n\r]*%r14\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_restore 14(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_restore 13(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "popq\[^\n\r]*%r15(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 8(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c b/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c
new file mode 100644
index 00000000000..747f7aaf191
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c
@@ -0,0 +1,28 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mapxf -mgeneral-regs-only -mno-cld -mno-push-args -maccumulate-outgoing-args" } */
+
+extern void foo (void *) __attribute__ ((interrupt));
+
+extern int bar (int);
+
+void foo (void *frame)
+{
+ int a,b,c,d,e,f,i;
+ a = bar (5);
+ b = bar (a);
+ c = bar (b);
+ d = bar (c);
+ e = bar (d);
+ f = bar (e);
+ for (i = 1; i < 10; i++)
+ {
+ a += bar (a + i) + bar (b + i) +
+ bar (c + i) + bar (d + i) +
+ bar (e + i) + bar (f + i);
+ }
+}
+
+/* { dg-final { scan-assembler-times "pushq" 31 } } */
+/* { dg-final { scan-assembler-times "popq" 31 } } */
+/* { dg-final { scan-assembler-not "push2\[\\t \]+" } } */
+/* { dg-final { scan-assembler-not "pop2\[\\t \]+" } } */
--
2.31.1
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] [APX] Support Intel APX PUSH2POP2
2023-10-10 6:48 [PATCH] [APX] Support Intel APX PUSH2POP2 Hongyu Wang
@ 2023-10-12 1:49 ` Hongtao Liu
0 siblings, 0 replies; 2+ messages in thread
From: Hongtao Liu @ 2023-10-12 1:49 UTC (permalink / raw)
To: Hongyu Wang; +Cc: gcc-patches, ubizjak, hongtao.liu, Mo, Zewei, Hu Lin1
On Tue, Oct 10, 2023 at 2:51 PM Hongyu Wang <hongyu.wang@intel.com> wrote:
>
> From: "Mo, Zewei" <zewei.mo@intel.com>
>
> Hi,
>
> Intel APX PUSH2POP2 feature has been released in [1].
>
> This feature requires stack to be aligned at 16byte, therefore in
> prologue/epilogue, a standalone push/pop will be emitted before any
> push2/pop2 if the stack was not aligned to 16byte.
> Also for current implementation we only support push2/pop2 usage in
> function prologue/epilogue for those callee-saved registers.
>
> Bootstrapped/regtested on x86-64-pc-linux-gnu{-m32,} and sde.
>
> OK for master?
OK. What remains to be optimized is the save and restore of the
caller-save registers for ipa-ra; let's leave that to GCC 15.
>
> [1].https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html.
>
> gcc/ChangeLog:
>
> * config/i386/i386.cc (gen_push2): New function to emit push2
> and adjust cfa offset.
> (ix86_use_push2_pop2): New function to determine whether
> push2/pop2 can be used.
> (ix86_compute_frame_layout): Adjust preferred stack boundary
> and stack alignment needed for push2/pop2.
> (ix86_emit_save_regs): Emit push2 when available.
> (ix86_emit_restore_reg_using_pop2): New function to emit pop2
> and adjust cfa info.
> (ix86_emit_restore_regs_using_pop2): New function to loop
> through the saved regs and call above.
> (ix86_expand_epilogue): Call ix86_emit_restore_regs_using_pop2
> when push2pop2 available.
> * config/i386/i386.md (push2_di): New pattern for push2.
> (pop2_di): Likewise for pop2.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/apx-push2pop2-1.c: New test.
> * gcc.target/i386/apx-push2pop2_force_drap-1.c: Likewise.
> * gcc.target/i386/apx-push2pop2_interrupt-1.c: Likewise.
>
> Co-authored-by: Hu Lin1 <lin1.hu@intel.com>
> Co-authored-by: Hongyu Wang <hongyu.wang@intel.com>
> ---
> gcc/config/i386/i386.cc | 252 ++++++++++++++++--
> gcc/config/i386/i386.md | 26 ++
> .../gcc.target/i386/apx-push2pop2-1.c | 45 ++++
> .../i386/apx-push2pop2_force_drap-1.c | 29 ++
> .../i386/apx-push2pop2_interrupt-1.c | 28 ++
> 5 files changed, 365 insertions(+), 15 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 6244f64a619..8251b67e2d6 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -6473,6 +6473,26 @@ gen_pop (rtx arg)
> stack_pointer_rtx)));
> }
>
> +/* Generate a "push2" pattern for input ARG. */
> +rtx
> +gen_push2 (rtx mem, rtx reg1, rtx reg2)
> +{
> + struct machine_function *m = cfun->machine;
> + const int offset = UNITS_PER_WORD * 2;
> +
> + if (m->fs.cfa_reg == stack_pointer_rtx)
> + m->fs.cfa_offset += offset;
> + m->fs.sp_offset += offset;
> +
> + if (REG_P (reg1) && GET_MODE (reg1) != word_mode)
> + reg1 = gen_rtx_REG (word_mode, REGNO (reg1));
> +
> + if (REG_P (reg2) && GET_MODE (reg2) != word_mode)
> + reg2 = gen_rtx_REG (word_mode, REGNO (reg2));
> +
> + return gen_push2_di (mem, reg1, reg2);
> +}
> +
> /* Return >= 0 if there is an unused call-clobbered register available
> for the entire function. */
>
> @@ -6714,6 +6734,18 @@ get_probe_interval (void)
>
> #define SPLIT_STACK_AVAILABLE 256
>
> +/* Helper function to determine whether push2/pop2 can be used in prologue or
> + epilogue for register save/restore. */
> +static bool
> +ix86_pro_and_epilogue_can_use_push2pop2 (int nregs)
> +{
> + int aligned = cfun->machine->fs.sp_offset % 16 == 0;
> + return TARGET_APX_PUSH2POP2
> + && !cfun->machine->frame.save_regs_using_mov
> + && cfun->machine->func_type == TYPE_NORMAL
> + && (nregs + aligned) >= 3;
> +}
> +
> /* Fill structure ix86_frame about frame of currently computed function. */
>
> static void
> @@ -6771,16 +6803,20 @@ ix86_compute_frame_layout (void)
>
> Darwin's ABI specifies 128b alignment for both 32 and 64 bit variants
> at call sites, including profile function calls.
> - */
> - if (((TARGET_64BIT_MS_ABI || TARGET_MACHO)
> - && crtl->preferred_stack_boundary < 128)
> - && (!crtl->is_leaf || cfun->calls_alloca != 0
> - || ix86_current_function_calls_tls_descriptor
> - || (TARGET_MACHO && crtl->profile)
> - || ix86_incoming_stack_boundary < 128))
> +
> + For APX push2/pop2, the stack also requires 128b alignment. */
> + if ((ix86_pro_and_epilogue_can_use_push2pop2 (frame->nregs)
> + && crtl->preferred_stack_boundary < 128)
> + || (((TARGET_64BIT_MS_ABI || TARGET_MACHO)
> + && crtl->preferred_stack_boundary < 128)
> + && (!crtl->is_leaf || cfun->calls_alloca != 0
> + || ix86_current_function_calls_tls_descriptor
> + || (TARGET_MACHO && crtl->profile)
> + || ix86_incoming_stack_boundary < 128)))
> {
> crtl->preferred_stack_boundary = 128;
> - crtl->stack_alignment_needed = 128;
> + if (crtl->stack_alignment_needed < 128)
> + crtl->stack_alignment_needed = 128;
> }
>
> stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
> @@ -7291,12 +7327,85 @@ ix86_emit_save_regs (void)
> int regno;
> rtx_insn *insn;
>
> - for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
> - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
> - {
> - insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
> - RTX_FRAME_RELATED_P (insn) = 1;
> - }
> + if (!TARGET_APX_PUSH2POP2 || cfun->machine->func_type != TYPE_NORMAL)
> + {
> + for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
> + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
> + {
> + insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
> + RTX_FRAME_RELATED_P (insn) = 1;
> + }
> + }
> + else
> + {
> + int regno_list[2];
> + regno_list[0] = regno_list[1] = -1;
> + int loaded_regnum = 0;
> + bool aligned = cfun->machine->fs.sp_offset % 16 == 0;
> +
> + for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
> + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
> + {
> + if (aligned)
> + {
> + regno_list[loaded_regnum++] = regno;
> + if (loaded_regnum == 2)
> + {
> + gcc_assert (regno_list[0] != -1
> + && regno_list[1] != -1
> + && regno_list[0] != regno_list[1]);
> + const int offset = UNITS_PER_WORD * 2;
> + rtx mem = gen_rtx_MEM (TImode,
> + gen_rtx_PRE_DEC (Pmode,
> + stack_pointer_rtx));
> + insn = emit_insn (gen_push2 (mem,
> + gen_rtx_REG (word_mode,
> + regno_list[0]),
> + gen_rtx_REG (word_mode,
> + regno_list[1])));
> + RTX_FRAME_RELATED_P (insn) = 1;
> + rtx dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (3));
> +
> + for (int i = 0; i < 2; i++)
> + {
> + rtx dwarf_reg = gen_rtx_REG (word_mode,
> + regno_list[i]);
> + rtx sp_offset = plus_constant (Pmode,
> + stack_pointer_rtx,
> + + UNITS_PER_WORD
> + * (1 - i));
> + rtx tmp = gen_rtx_SET (gen_frame_mem (DImode,
> + sp_offset),
> + dwarf_reg);
> + RTX_FRAME_RELATED_P (tmp) = 1;
> + XVECEXP (dwarf, 0, i + 1) = tmp;
> + }
> + rtx sp_tmp = gen_rtx_SET (stack_pointer_rtx,
> + plus_constant (Pmode,
> + stack_pointer_rtx,
> + -offset));
> + RTX_FRAME_RELATED_P (sp_tmp) = 1;
> + XVECEXP (dwarf, 0, 0) = sp_tmp;
> + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
> +
> + loaded_regnum = 0;
> + regno_list[0] = regno_list[1] = -1;
> + }
> + }
> + else
> + {
> + insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
> + RTX_FRAME_RELATED_P (insn) = 1;
> + aligned = true;
> + }
> + }
> + if (loaded_regnum == 1)
> + {
> + insn = emit_insn (gen_push (gen_rtx_REG (word_mode,
> + regno_list[0])));
> + RTX_FRAME_RELATED_P (insn) = 1;
> + }
> + }
> }
>
> /* Emit a single register save at CFA - CFA_OFFSET. */
> @@ -9180,6 +9289,74 @@ ix86_emit_restore_reg_using_pop (rtx reg)
> }
> }
>
> +/* Emit code to restore REG using a POP2 insn. */
> +static void
> +ix86_emit_restore_reg_using_pop2 (rtx reg1, rtx reg2)
> +{
> + struct machine_function *m = cfun->machine;
> + const int offset = UNITS_PER_WORD * 2;
> +
> + rtx mem = gen_rtx_MEM (TImode, gen_rtx_POST_INC (Pmode,
> + stack_pointer_rtx));
> + rtx_insn *insn = emit_insn (gen_pop2_di (reg1, mem, reg2));
> +
> + RTX_FRAME_RELATED_P (insn) = 1;
> +
> + rtx dwarf = NULL_RTX;
> + dwarf = alloc_reg_note (REG_CFA_RESTORE, reg1, dwarf);
> + dwarf = alloc_reg_note (REG_CFA_RESTORE, reg2, dwarf);
> + REG_NOTES (insn) = dwarf;
> + m->fs.sp_offset -= offset;
> +
> + if (m->fs.cfa_reg == crtl->drap_reg
> + && (REGNO (reg1) == REGNO (crtl->drap_reg)
> + || REGNO (reg2) == REGNO (crtl->drap_reg)))
> + {
> + /* Previously we'd represented the CFA as an expression
> + like *(%ebp - 8). We've just popped that value from
> + the stack, which means we need to reset the CFA to
> + the drap register. This will remain until we restore
> + the stack pointer. */
> + add_reg_note (insn, REG_CFA_DEF_CFA,
> + REGNO (reg1) == REGNO (crtl->drap_reg) ? reg1 : reg2);
> + RTX_FRAME_RELATED_P (insn) = 1;
> +
> + /* This means that the DRAP register is valid for addressing too. */
> + m->fs.drap_valid = true;
> + return;
> + }
> +
> + if (m->fs.cfa_reg == stack_pointer_rtx)
> + {
> + rtx x = plus_constant (Pmode, stack_pointer_rtx, offset);
> + x = gen_rtx_SET (stack_pointer_rtx, x);
> + add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
> + RTX_FRAME_RELATED_P (insn) = 1;
> +
> + m->fs.cfa_offset -= offset;
> + }
> +
> + /* When the frame pointer is the CFA, and we pop it, we are
> + swapping back to the stack pointer as the CFA. This happens
> + for stack frames that don't allocate other data, so we assume
> + the stack pointer is now pointing at the return address, i.e.
> + the function entry state, which makes the offset be 1 word. */
> + if (reg1 == hard_frame_pointer_rtx || reg2 == hard_frame_pointer_rtx)
> + {
> + m->fs.fp_valid = false;
> + if (m->fs.cfa_reg == hard_frame_pointer_rtx)
> + {
> + m->fs.cfa_reg = stack_pointer_rtx;
> + m->fs.cfa_offset -= offset;
> +
> + add_reg_note (insn, REG_CFA_DEF_CFA,
> + plus_constant (Pmode, stack_pointer_rtx,
> + m->fs.cfa_offset));
> + RTX_FRAME_RELATED_P (insn) = 1;
> + }
> + }
> +}
> +
> /* Emit code to restore saved registers using POP insns. */
>
> static void
> @@ -9192,6 +9369,48 @@ ix86_emit_restore_regs_using_pop (void)
> ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
> }
>
> +/* Emit code to restore saved registers using POP2 insns. */
> +
> +static void
> +ix86_emit_restore_regs_using_pop2 (void)
> +{
> + int regno;
> + int regno_list[2];
> + regno_list[0] = regno_list[1] = -1;
> + int loaded_regnum = 0;
> + bool aligned = cfun->machine->fs.sp_offset % 16 == 0;
> +
> + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
> + {
> + if (aligned)
> + {
> + regno_list[loaded_regnum++] = regno;
> + if (loaded_regnum == 2)
> + {
> + gcc_assert (regno_list[0] != -1
> + && regno_list[1] != -1
> + && regno_list[0] != regno_list[1]);
> +
> + ix86_emit_restore_reg_using_pop2 (gen_rtx_REG (word_mode,
> + regno_list[0]),
> + gen_rtx_REG (word_mode,
> + regno_list[1]));
> + loaded_regnum = 0;
> + regno_list[0] = regno_list[1] = -1;
> + }
> + }
> + else
> + {
> + ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
> + aligned = true;
> + }
> + }
> +
> + if (loaded_regnum == 1)
> + ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno_list[0]));
> +}
> +
> /* Emit code and notes for the LEAVE instruction. If insn is non-null,
> omits the emit and only attaches the notes. */
>
> @@ -9731,7 +9950,10 @@ ix86_expand_epilogue (int style)
> m->fs.cfa_reg == stack_pointer_rtx);
> }
>
> - ix86_emit_restore_regs_using_pop ();
> + if (TARGET_APX_PUSH2POP2 && m->func_type == TYPE_NORMAL)
> + ix86_emit_restore_regs_using_pop2 ();
> + else
> + ix86_emit_restore_regs_using_pop ();
> }
>
> /* If we used a stack pointer and haven't already got rid of it,
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index f390fb5692b..22bd5dde2fa 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -208,6 +208,10 @@ (define_c_enum "unspec" [
> ;; For insn_callee_abi:
> UNSPEC_CALLEE_ABI
>
> + ;; For PUSH2/POP2 support
> + UNSPEC_APXPUSH2
> + UNSPEC_APXPOP2_LOW
> + UNSPEC_APXPOP2_HIGH
> ])
>
> (define_c_enum "unspecv" [
> @@ -3833,6 +3837,28 @@ (define_insn "*push<mode>"
> (set_attr "type" "push,multi")
> (set_attr "mode" "SI,TI")])
>
> +(define_insn "push2_di"
> + [(set (match_operand:TI 0 "push_operand" "=<")
> + (unspec:TI [(match_operand:DI 1 "register_operand" "r")
> + (match_operand:DI 2 "register_operand" "r")]
> + UNSPEC_APXPUSH2))]
> + "TARGET_APX_PUSH2POP2"
> + "push2\t%1, %2"
> + [(set_attr "mode" "TI")
> + (set_attr "type" "multi")
> + (set_attr "prefix" "evex")])
> +
> +(define_insn "pop2_di"
> + [(parallel [(set (match_operand:DI 0 "register_operand" "=r")
> + (unspec:DI [(match_operand:TI 1 "pop_operand" ">")]
> + UNSPEC_APXPOP2_LOW))
> + (set (match_operand:DI 2 "register_operand" "=r")
> + (unspec:DI [(const_int 0)] UNSPEC_APXPOP2_HIGH))])]
> + "TARGET_APX_PUSH2POP2"
> + "pop2\t%0, %2"
> + [(set_attr "mode" "TI")
> + (set_attr "prefix" "evex")])
> +
> (define_insn "*pushsf_rex64"
> [(set (match_operand:SF 0 "push_operand" "=X,X,X")
> (match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,v"))]
> diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
> new file mode 100644
> index 00000000000..c7968d674e5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mapxf" } */
> +
> +extern int bar (int);
> +
> +void foo ()
> +{
> + int a,b,c,d,e,f,i;
> + a = bar (5);
> + b = bar (a);
> + c = bar (b);
> + d = bar (c);
> + e = bar (d);
> + f = bar (e);
> + for (i = 1; i < 10; i++)
> + {
> + a += bar (a + i) + bar (b + i) +
> + bar (c + i) + bar (d + i) +
> + bar (e + i) + bar (f + i);
> + }
> +}
> +
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 16" 2 } } */
> +/* { dg-final { scan-assembler-times "pushq\[^\n\r]*%r15(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 15, -16(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "push2\[\\t \]*\[^\n\r]*%r14\[^\n\r]*%r13\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 32" 2 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 14, -24(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 13, -32(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "push2\[\\t \]*\[^\n\r]*%r12\[^\n\r]*%rbp\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 48" 2 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 12, -40(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 6, -48(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "pushq\[^\n\r]*%rbx(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 56" 2 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 3, -56(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "popq\[^\n\r]*rbx(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "pop2\[\\t \]*\[^\n\r]*%rbp\[^\n\r]*%r12\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_restore 12(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_restore 6(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "pop2\[\\t \]*\[^\n\r]*%r13\[^\n\r]*%r14\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_restore 14(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_restore 13(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "popq\[^\n\r]*%r15(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 8(?:\n|\[ \\t\]+#)" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c b/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c
> new file mode 100644
> index 00000000000..38787990288
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mapxf -mforce-drap" } */
> +
> +#include "apx-push2pop2-1.c"
> +
> +
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 16" 2 } } */
> +/* { dg-final { scan-assembler-times "pushq\[^\n\r]*%r15(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 15, -16(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "push2\[\\t \]*\[^\n\r]*%r14\[^\n\r]*%r13\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 32" 2 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 14, -24(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 13, -32(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "push2\[\\t \]*\[^\n\r]*%r12\[^\n\r]*%rbp\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 48" 2 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 12, -40(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 6, -48(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "pushq\[^\n\r]*%rbx(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 56" 2 } } */
> +/* { dg-final { scan-assembler-times ".cfi_offset 3, -56(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "popq\[^\n\r]*rbx(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "pop2\[\\t \]*\[^\n\r]*%rbp\[^\n\r]*%r12\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_restore 12(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_restore 6(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "pop2\[\\t \]*\[^\n\r]*%r13\[^\n\r]*%r14\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_restore 14(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_restore 13(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "popq\[^\n\r]*%r15(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 8(?:\n|\[ \\t\]+#)" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c b/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c
> new file mode 100644
> index 00000000000..747f7aaf191
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mapxf -mgeneral-regs-only -mno-cld -mno-push-args -maccumulate-outgoing-args" } */
> +
> +extern void foo (void *) __attribute__ ((interrupt));
> +
> +extern int bar (int);
> +
> +void foo (void *frame)
> +{
> + int a,b,c,d,e,f,i;
> + a = bar (5);
> + b = bar (a);
> + c = bar (b);
> + d = bar (c);
> + e = bar (d);
> + f = bar (e);
> + for (i = 1; i < 10; i++)
> + {
> + a += bar (a + i) + bar (b + i) +
> + bar (c + i) + bar (d + i) +
> + bar (e + i) + bar (f + i);
> + }
> +}
> +
> +/* { dg-final { scan-assembler-times "pushq" 31 } } */
> +/* { dg-final { scan-assembler-times "popq" 31 } } */
> +/* { dg-final { scan-assembler-not "push2\[\\t \]+" } } */
> +/* { dg-final { scan-assembler-not "pop2\[\\t \]+" } } */
> --
> 2.31.1
>
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2023-10-12 1:49 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-10 6:48 [PATCH] [APX] Support Intel APX PUSH2POP2 Hongyu Wang
2023-10-12 1:49 ` Hongtao Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).