public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-1625] Add support for stc and cmc instructions in i386.md
@ 2023-06-07 22:37 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2023-06-07 22:37 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:eba3565ce6d766c006cbf1f7f293bbd1226a682d

commit r14-1625-geba3565ce6d766c006cbf1f7f293bbd1226a682d
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Wed Jun 7 23:35:15 2023 +0100

    Add support for stc and cmc instructions in i386.md
    
    This patch is the latest revision of my patch to add support for the
    STC (set carry flag) and CMC (complement carry flag) instructions to
    the i386 backend, incorporating Uros' previous feedback.  The significant
    changes are (i) the inclusion of CMC, (ii) the use of an UNSPEC for the
    STC pattern, and (iii) the use of a new X86_TUNE_SLOW_STC tuning flag to
    select alternate implementations on pentium4 (which has a notoriously
    slow STC) when not optimizing for size.
    
    An example of the use of the stc instruction is:
    unsigned int foo (unsigned int a, unsigned int b, unsigned int *c) {
      return __builtin_ia32_addcarryx_u32 (1, a, b, c);
    }
    
    which previously generated:
            movl    $1, %eax
            addb    $-1, %al
            adcl    %esi, %edi
            setc    %al
            movl    %edi, (%rdx)
            movzbl  %al, %eax
            ret
    
    with this patch now generates:
            stc
            adcl    %esi, %edi
            setc    %al
            movl    %edi, (%rdx)
            movzbl  %al, %eax
            ret
    
    An example of the use of the cmc instruction (where the carry from
    a first adc is inverted/complemented as input to a second adc) is:
    unsigned int bar (unsigned int a, unsigned int b,
                      unsigned int c, unsigned int d)
    {
      unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
      return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2);
    }
    
    which previously generated:
            movl    $1, %eax
            addb    $-1, %al
            adcl    %esi, %edi
            setnc   %al
            movl    %edi, o1(%rip)
            addb    $-1, %al
            adcl    %ecx, %edx
            setc    %al
            movl    %edx, o2(%rip)
            movzbl  %al, %eax
            ret
    
    and now generates:
            stc
            adcl    %esi, %edi
            cmc
            movl    %edi, o1(%rip)
            adcl    %ecx, %edx
            setc    %al
            movl    %edx, o2(%rip)
            movzbl  %al, %eax
            ret
    
    This version implements Uros' suggestions/refinements. (i) Avoid the
    UNSPEC_CMC by using the canonical RTL idiom for *x86_cmc, (ii) Use
    peephole2s to convert x86_stc and *x86_cmc into alternate forms on
    TARGET_SLOW_STC CPUs (pentium4), when a suitable QImode register is
    available, (iii) Prefer the addqi_cconly_overflow idiom (addb $-1,%al)
    over negqi_ccc_1 (neg %al) for setting the carry from a QImode value.
    These changes required two minor edits to i386.cc:  ix86_cc_mode had
    to be tweaked to suggest CCCmode for the new *x86_cmc pattern, and
    *x86_cmc needed to be handled/parameterized in ix86_rtx_costs so that
    combine would appreciate that this complex RTL expression was actually
    a fast, single byte instruction [i.e. preferable].
    
    2023-06-07  Roger Sayle  <roger@nextmovesoftware.com>
                Uros Bizjak  <ubizjak@gmail.com>
    
    gcc/ChangeLog
            * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
            Use new x86_stc instruction when the carry flag must be set.
            * config/i386/i386.cc (ix86_cc_mode): Use CCCmode for *x86_cmc.
            (ix86_rtx_costs): Provide accurate rtx_costs for *x86_cmc.
            * config/i386/i386.h (TARGET_SLOW_STC): New define.
            * config/i386/i386.md (UNSPEC_STC): New UNSPEC for stc.
            (x86_stc): New define_insn.
            (define_peephole2): Convert x86_stc into alternate implementation
            on pentium4 without -Os when a QImode register is available.
            (*x86_cmc): New define_insn.
            (define_peephole2): Convert *x86_cmc into alternate implementation
            on pentium4 without -Os when a QImode register is available.
            (*setccc): New define_insn_and_split for a no-op CCCmode move.
            (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
            recognize (and eliminate) the carry flag being copied to itself.
            (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
            * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/cmc-1.c: New test case.
            * gcc.target/i386/stc-1.c: Likewise.

Diff:
---
 gcc/config/i386/i386-expand.cc        | 13 ++++--
 gcc/config/i386/i386.cc               | 36 +++++++++++++++++
 gcc/config/i386/i386.h                |  1 +
 gcc/config/i386/i386.md               | 76 +++++++++++++++++++++++++++++++++++
 gcc/config/i386/x86-tune.def          |  4 ++
 gcc/testsuite/gcc.target/i386/cmc-1.c | 28 +++++++++++++
 gcc/testsuite/gcc.target/i386/stc-1.c | 21 ++++++++++
 7 files changed, 175 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index ac674418b96..697eb475f48 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -13948,8 +13948,6 @@ rdseed_step:
       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
 
       op1 = expand_normal (arg0);
-      if (!integer_zerop (arg0))
-	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
 
       op2 = expand_normal (arg1);
       if (!register_operand (op2, mode0))
@@ -13967,7 +13965,7 @@ rdseed_step:
 	}
 
       op0 = gen_reg_rtx (mode0);
-      if (integer_zerop (arg0))
+      if (op1 == const0_rtx)
 	{
 	  /* If arg0 is 0, optimize right away into add or sub
 	     instruction that sets CCCmode flags.  */
@@ -13977,7 +13975,14 @@ rdseed_step:
       else
 	{
 	  /* Generate CF from input operand.  */
-	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+	  if (!CONST_INT_P (op1))
+	    {
+	      op1 = convert_to_mode (QImode, op1, 1);
+	      op1 = copy_to_mode_reg (QImode, op1);
+	      emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+	    }
+	  else
+	    emit_insn (gen_x86_stc ());
 
 	  /* Generate instruction that consumes CF.  */
 	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56ee8dd..c4591d63063 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15954,6 +15954,17 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
 	       && REGNO (XEXP (op1, 0)) == FLAGS_REG
 	       && XEXP (op1, 1) == const0_rtx)
 	return CCCmode;
+      /* Similarly for *x86_cmc pattern.
+	 Match LTU of op0 (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+	 and op1 (geu:QI (reg:CCC FLAGS_REG) (const_int 0)).
+	 It is sufficient to test that the operand modes are CCCmode.  */
+      else if (code == LTU
+	       && GET_CODE (op0) == NEG
+	       && GET_CODE (XEXP (op0, 0)) == LTU
+	       && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
+	       && GET_CODE (op1) == GEU
+	       && GET_MODE (XEXP (op1, 0)) == CCCmode)
+	return CCCmode;
       else
 	return CCmode;
     case GTU:			/* CF=0 & ZF=0 */
@@ -21305,6 +21316,31 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
 	  *total = 0;
 	  return true;
 	}
+      /* Match x
+	 (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+		      (geu:QI (reg:CCC FLAGS_REG) (const_int 0)))  */
+      if (mode == CCCmode
+	  && GET_CODE (op0) == NEG
+	  && GET_CODE (XEXP (op0, 0)) == LTU
+	  && REG_P (XEXP (XEXP (op0, 0), 0))
+	  && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
+	  && REGNO (XEXP (XEXP (op0, 0), 0)) == FLAGS_REG
+	  && XEXP (XEXP (op0, 0), 1) == const0_rtx
+	  && GET_CODE (op1) == GEU
+	  && REG_P (XEXP (op1, 0))
+	  && GET_MODE (XEXP (op1, 0)) == CCCmode
+	  && REGNO (XEXP (op1, 0)) == FLAGS_REG
+	  && XEXP (op1, 1) == const0_rtx)
+	{
+	  /* This is *x86_cmc.  */
+	  if (!speed)
+	    *total = COSTS_N_BYTES (1);
+	  else if (TARGET_SLOW_STC)
+	    *total = COSTS_N_INSNS (2);
+	  else 
+	    *total = COSTS_N_INSNS (1);
+	  return true;
+	}
 
       if (SCALAR_INT_MODE_P (GET_MODE (op0))
 	  && GET_MODE_SIZE (GET_MODE (op0)) > UNITS_PER_WORD)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c7439f89bdf..5ac9c78d3ba 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -448,6 +448,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
 #define TARGET_DEST_FALSE_DEP_FOR_GLC \
 	ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
+#define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e6ebc461e52..0929115ed4d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -114,6 +114,7 @@
   UNSPEC_INSN_FALSE_DEP
   UNSPEC_SBB
   UNSPEC_CC_NE
+  UNSPEC_STC
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -1999,6 +2000,53 @@
   [(set_attr "type" "ssecomi")
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
+
+;; Set carry flag.
+(define_insn "x86_stc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  ""
+  "stc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; On Pentium 4, set the carry flag using mov $1,%al;addb $-1,%al.
+(define_peephole2
+  [(match_scratch:QI 0 "r")
+   (set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+  [(set (match_dup 0) (const_int 1))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+	   (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+			(match_dup 0)))
+      (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
+
+;; Complement carry flag.
+(define_insn "*x86_cmc"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+		     (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+  ""
+  "cmc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "use_carry" "1")
+   (set_attr "modrm" "0")])
+
+;; On Pentium 4, cmc is replaced with setnc %al;addb $-1,%al.
+(define_peephole2
+  [(match_scratch:QI 0 "r")
+   (set (reg:CCC FLAGS_REG)
+	(compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+		     (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+  "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+  [(set (match_dup 0) (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+	   (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+			(match_dup 0)))
+      (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
 \f
 ;; Push/pop instructions.
 
@@ -8107,6 +8155,34 @@
   "#"
   "&& 1"
   [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setccc"
+  [(set (reg:CCC FLAGS_REG)
+	(reg:CCC FLAGS_REG))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_1_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+	(ltu:CCC (reg:CC_CCC FLAGS_REG) (const_int 0)))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_2_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))
+		     (const_int 0)] UNSPEC_CC_NE))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
 \f
 ;; Overflow setting add instructions
 
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index e1c72cddf1f..c3229d269b2 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -698,3 +698,7 @@ DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
 /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
    before a transfer of control flow out of the function.  */
 DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
+
+/* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
+  modifications on architectures where these operations are slow.  */
+DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)
diff --git a/gcc/testsuite/gcc.target/i386/cmc-1.c b/gcc/testsuite/gcc.target/i386/cmc-1.c
new file mode 100644
index 00000000000..58e922ad12c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cmc-1.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned int o1;
+unsigned int o2;
+
+unsigned int foo_xor (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2);
+}
+
+unsigned int foo_sub (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (1 - c1, c, d, &o2);
+}
+
+unsigned int foo_eqz (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (c1 == 0, c, d, &o2);
+}
+
+/* { dg-final { scan-assembler "cmc" } } */
diff --git a/gcc/testsuite/gcc.target/i386/stc-1.c b/gcc/testsuite/gcc.target/i386/stc-1.c
new file mode 100644
index 00000000000..857c939dbea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/stc-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef unsigned int u32;
+
+unsigned int foo (unsigned int a, unsigned int b, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, a, b, c);
+}
+
+unsigned int bar (unsigned int b, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, 2, b, c);
+}
+
+unsigned int baz (unsigned int a, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, a, 3, c);
+}
+
+/* { dg-final { scan-assembler "stc" } } */

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-06-07 22:37 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-07 22:37 [gcc r14-1625] Add support for stc and cmc instructions in i386.md Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).