public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, i386]: Optimization of the i386 and x86_64 compilers
@ 2007-03-02 11:01 Uros Bizjak
  2007-03-02 12:03 ` Jan Hubicka
  2007-03-02 18:38 ` Richard Henderson
  0 siblings, 2 replies; 7+ messages in thread
From: Uros Bizjak @ 2007-03-02 11:01 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1777 bytes --]

Hello!

The attached patch implements the idea proposed by Michael Meissner in PR
31019. The core of his idea is to replace (1 << ix86_[tune|arch])
in (x86_some_var & (1 << ix86_[tune|arch])) with a precalculated global
variable.

The results of this patch are quite surprising, as the text size of cc1
on an i686 host was lowered considerably:

size cc1
   text    data     bss     dec     hex filename
7632793   18876  574516 8226185  7d8589 cc1

size cc1-patched
   text    data     bss     dec     hex filename
6731749   18876  574516 7325141  6fc5d5 cc1-patched

Yes, by 901k.

This effect can be partially attributed to the way i386 handles
variable shifts. Unpatched gcc uses code that also clobbers %ecx:

 8554612:	8b 0d 9c a1 81 08    	mov    0x881a19c,%ecx
 8554618:	a1 a4 4b 72 08       	mov    0x8724ba4,%eax
 855461d:	d3 f8                	sar    %cl,%eax
 855461f:	a8 01                	test   $0x1,%al

Patched gcc uses the following code:

 8498762:	a1 a0 e1 73 08       	mov    0x873e1a0,%eax
 8498767:	85 05 04 8c 64 08    	test   %eax,0x8648c04


2007-03-02  Uros Bizjak  <ubizjak@gmail.com>
	    Michael Meissner  <michael.meissner@amd.com>

        * config/i386/i386.h (TUNEMASK): Redefine to use ix86_tune_mask.
	(ARCHMASK): Define.
	(TARGET_CMOVE): Use ARCHMASK.
	(TARGET_CMPXCHG): Ditto.
	(TARGET_CMPXCHG8B): Ditto.
	(TARGET_XADD): Ditto.
	(TARGET_BSWAP): Ditto.
	* config/i386/i386.c (ix86_tune_mask): New global variable.
	(ix86_arch_mask): Ditto.
	(override_options): Initialize ix86_tune_mask and
	ix86_arch_mask. Use ARCHMASK to clear MASK_NO_FANCY_MATH_387 in
	target_flags.

The patch was bootstrapped on i686-pc-linux-gnu, regression tested for
c, c++ and gfortran.

BTW: The patch also rearranges the processor #defines into a more logical sequence.

Uros.

[-- Attachment #2: i386-mask.diff --]
[-- Type: application/octet-stream, Size: 4876 bytes --]

Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 122469)
+++ config/i386/i386.h	(working copy)
@@ -179,7 +179,6 @@
 #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
 #define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
 
-#define TUNEMASK (1 << ix86_tune)
 extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
 extern const int x86_use_bit_test, x86_cmove, x86_deep_branch;
 extern const int x86_branch_hints, x86_unroll_strlen;
@@ -208,6 +207,9 @@
 extern const int x86_partial_flag_reg_stall;
 extern int x86_prefetch_sse, x86_cmpxchg16b;
 
+#define TUNEMASK ix86_tune_mask
+#define ARCHMASK ix86_arch_mask
+
 #define TARGET_USE_LEAVE (x86_use_leave & TUNEMASK)
 #define TARGET_PUSH_MEMORY (x86_push_memory & TUNEMASK)
 #define TARGET_ZERO_EXTEND_WITH_AND (x86_zero_extend_with_and & TUNEMASK)
@@ -215,7 +217,7 @@
 #define TARGET_UNROLL_STRLEN (x86_unroll_strlen & TUNEMASK)
 /* For sane SSE instruction set generation we need fcomi instruction.  It is
    safe to enable all CMOVE instructions.  */
-#define TARGET_CMOVE ((x86_cmove & (1 << ix86_arch)) || TARGET_SSE)
+#define TARGET_CMOVE ((x86_cmove & ARCHMASK) || TARGET_SSE)
 #define TARGET_FISTTP (TARGET_SSE3 && TARGET_80387)
 #define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & TUNEMASK)
 #define TARGET_BRANCH_PREDICTION_HINTS (x86_branch_hints & TUNEMASK)
@@ -276,11 +278,11 @@
 #define TARGET_ANY_GNU_TLS (TARGET_GNU_TLS || TARGET_GNU2_TLS)
 #define TARGET_SUN_TLS (ix86_tls_dialect == TLS_DIALECT_SUN)
 
-#define TARGET_CMPXCHG (x86_cmpxchg & (1 << ix86_arch))
-#define TARGET_CMPXCHG8B (x86_cmpxchg8b & (1 << ix86_arch))
+#define TARGET_CMPXCHG (x86_cmpxchg & ARCHMASK)
+#define TARGET_CMPXCHG8B (x86_cmpxchg8b & ARCHMASK)
 #define TARGET_CMPXCHG16B (x86_cmpxchg16b)
-#define TARGET_XADD (x86_xadd & (1 << ix86_arch))
-#define TARGET_BSWAP (x86_bswap & (1 << ix86_arch))
+#define TARGET_XADD (x86_xadd & ARCHMASK)
+#define TARGET_BSWAP (x86_bswap & ARCHMASK)
 
 #ifndef TARGET_64BIT_DEFAULT
 #define TARGET_64BIT_DEFAULT 0
@@ -2130,7 +2132,10 @@
 };
 
 extern enum processor_type ix86_tune;
+extern int ix86_tune_mask;
+
 extern enum processor_type ix86_arch;
+extern int ix86_arch_mask;
 
 enum fpmath_unit
 {
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 122469)
+++ config/i386/i386.c	(working copy)
@@ -984,23 +984,25 @@
 #define m_486 (1<<PROCESSOR_I486)
 #define m_PENT (1<<PROCESSOR_PENTIUM)
 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
+#define m_PENT4  (1<<PROCESSOR_PENTIUM4)
+#define m_NOCONA  (1<<PROCESSOR_NOCONA)
+#define m_CORE2  (1<<PROCESSOR_CORE2)
+
 #define m_GEODE  (1<<PROCESSOR_GEODE)
+#define m_K6  (1<<PROCESSOR_K6)
 #define m_K6_GEODE  (m_K6 | m_GEODE)
-#define m_K6  (1<<PROCESSOR_K6)
+#define m_K8  (1<<PROCESSOR_K8)
 #define m_ATHLON  (1<<PROCESSOR_ATHLON)
-#define m_PENT4  (1<<PROCESSOR_PENTIUM4)
-#define m_K8  (1<<PROCESSOR_K8)
 #define m_ATHLON_K8  (m_K8 | m_ATHLON)
 #define m_AMDFAM10  (1<<PROCESSOR_AMDFAM10)
-#define m_NOCONA  (1<<PROCESSOR_NOCONA)
-#define m_CORE2  (1<<PROCESSOR_CORE2)
+#define m_ATHLON_K8_AMDFAM10  (m_K8 | m_ATHLON | m_AMDFAM10)
+
 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
-#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
-#define m_ATHLON_K8_AMDFAM10  (m_K8 | m_ATHLON | m_AMDFAM10)
 
 /* Generic instruction choice should be common subset of supported CPUs
    (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
+#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
 
 /* Leave is not affecting Nocona SPEC2000 results negatively, so enabling for
    Generic64 seems like good code size tradeoff.  We can't enable it for 32bit
@@ -1395,8 +1397,11 @@
 
 /* Which cpu are we scheduling for.  */
 enum processor_type ix86_tune;
+int ix86_tune_mask;
+
 /* Which instruction set architecture to use.  */
 enum processor_type ix86_arch;
+int ix86_arch_mask;
 
 /* true if sse prefetch instruction is not NOOP.  */
 int x86_prefetch_sse;
@@ -2074,8 +2079,10 @@
     if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
       {
 	ix86_arch = processor_alias_table[i].processor;
+	ix86_arch_mask = 1 << ix86_arch;
 	/* Default cpu tuning to the architecture.  */
 	ix86_tune = ix86_arch;
+	ix86_tune_mask = 1 << ix86_tune;
 	if (processor_alias_table[i].flags & PTA_MMX
 	    && !(target_flags_explicit & MASK_MMX))
 	  target_flags |= MASK_MMX;
@@ -2276,7 +2283,7 @@
 
   /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
      since the insns won't need emulation.  */
-  if (x86_arch_always_fancy_math_387 & (1 << ix86_arch))
+  if (x86_arch_always_fancy_math_387 & ARCHMASK)
     target_flags &= ~MASK_NO_FANCY_MATH_387;
 
   /* Likewise, if the target doesn't have a 387, or we've specified

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2007-03-03  0:46 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-02 11:01 [PATCH, i386]: Optimization of the i386 and x86_64 compilers Uros Bizjak
2007-03-02 12:03 ` Jan Hubicka
2007-03-02 14:42   ` Uros Bizjak
2007-03-02 17:55   ` Meissner, Michael
2007-03-03  0:46     ` Kaveh R. GHAZI
2007-03-02 18:38 ` Richard Henderson
2007-03-02 22:32   ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).