From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 13985 invoked by alias); 2 Jun 2007 08:08:05 -0000 Received: (qmail 13975 invoked by uid 22791); 2 Jun 2007 08:08:03 -0000 X-Spam-Check-By: sourceware.org Received: from nikam-dmz.ms.mff.cuni.cz (HELO nikam.ms.mff.cuni.cz) (195.113.20.16) by sourceware.org (qpsmtpd/0.31) with ESMTP; Sat, 02 Jun 2007 08:08:00 +0000 Received: by nikam.ms.mff.cuni.cz (Postfix, from userid 29025) id 9C8BD5BA9C; Sat, 2 Jun 2007 10:07:57 +0200 (CEST) Date: Sat, 02 Jun 2007 08:08:00 -0000 From: Zdenek Dvorak To: Diego Novillo Cc: gcc-patches@gcc.gnu.org Subject: [patch] Changes in params for cache sizes Message-ID: <20070602080757.GA1887@kam.mff.cuni.cz> References: <20070523091352.GA31024@kam.mff.cuni.cz> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: User-Agent: Mutt/1.5.9i Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org X-SW-Source: 2007-06/txt/msg00084.txt.bz2 Hello, > >http://gcc.gnu.org/ml/gcc-patches/2007-04/msg00737.html > >-- analysis and generation of nontemporal prefetches > > OK, but I hope you have the --param patch for the cache size ready soon. here is the patch. In addition to the param for L2 cache size, it makes the following changes: -- the cache sizes are now specified in kB (instead of number of cache lines, which was somewhat difficult to use) -- default values for L1 and L2 cache sizes are specified for various subarchitectures of i386 (these are not too reliable, as each subarchitecture usually corresponds to several processor models, which may differ in cache sizes; but not much can be done about that). One piece that is missing is detection of L2 cache size; I will add that in the next patch. Bootstrapped & regtested on i686. Zdenek * doc/invoke.texi (l1-cache-size): Update documentation. (l2-cache-size): Document. 
* params.h (L2_CACHE_SIZE): New macro. * tree-ssa-loop-prefetch.c (L1_CACHE_SIZE_BYTES): Reflect that L1_CACHE_SIZE is in kB now. (L2_CACHE_SIZE_BYTES): New macro. (tree_ssa_prefetch_arrays): Show size in kb. * config/i386/i386.h (struct processor_costs): Add l1_cache_size and l2_cache_size fields. * config/i386/driver-i386.c (describe_cache): Detect cache size in kB. * config/i386/i386.c (size_cost, i386_cost, i486_cost,pentium_cost, pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, pentium4_cost, nocona_cost, core2_cost, generic64_cost, generic32_cost): Add l1_cache_size and l2_cache_size. (override_options): Set l1-cache-size and l2-cache-size to default values if not specified otherwise. * params.def (PARAM_L1_CACHE_SIZE): Change to set in kB. (PARAM_L2_CACHE_SIZE): New. Index: doc/invoke.texi =================================================================== *** doc/invoke.texi (revision 125268) --- doc/invoke.texi (working copy) *************** Maximum number of prefetches that can ru *** 6896,6902 **** The size of cache line in L1 cache, in bytes. @item l1-cache-size ! The number of cache lines in L1 cache. @item verify-canonical-types Whether the compiler should verify the ``canonical'' types used for --- 6896,6905 ---- The size of cache line in L1 cache, in bytes. @item l1-cache-size ! The size of L1 cache, in kilobytes. ! ! @item l2-cache-size ! The size of L2 cache, in kilobytes. 
@item verify-canonical-types Whether the compiler should verify the ``canonical'' types used for Index: params.h =================================================================== *** params.h (revision 125268) --- params.h (working copy) *************** typedef enum compiler_param *** 168,173 **** --- 168,175 ---- PARAM_VALUE (PARAM_L1_CACHE_SIZE) #define L1_CACHE_LINE_SIZE \ PARAM_VALUE (PARAM_L1_CACHE_LINE_SIZE) + #define L2_CACHE_SIZE \ + PARAM_VALUE (PARAM_L2_CACHE_SIZE) #define VERIFY_CANONICAL_TYPES \ PARAM_VALUE (PARAM_VERIFY_CANONICAL_TYPES) #endif /* ! GCC_PARAMS_H */ Index: tree-ssa-loop-prefetch.c =================================================================== *** tree-ssa-loop-prefetch.c (revision 125268) --- tree-ssa-loop-prefetch.c (working copy) *************** Software Foundation, 59 Temple Place - S *** 166,174 **** #define HAVE_prefetch 0 #endif ! #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * L1_CACHE_LINE_SIZE)) ! /* TODO: Add parameter to specify L2 cache size. */ ! #define L2_CACHE_SIZE_BYTES (8 * L1_CACHE_SIZE_BYTES) /* We consider a memory access nontemporal if it is not reused sooner than after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore --- 166,173 ---- #define HAVE_prefetch 0 #endif ! #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024)) ! #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024)) /* We consider a memory access nontemporal if it is not reused sooner than after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore *************** tree_ssa_prefetch_arrays (void) *** 1365,1374 **** SIMULTANEOUS_PREFETCHES); fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY); fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK); ! fprintf (dump_file, " L1 cache size: %d lines, %d bytes\n", ! L1_CACHE_SIZE, L1_CACHE_SIZE_BYTES); fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE); ! 
fprintf (dump_file, " L2 cache size: %d bytes\n", L2_CACHE_SIZE_BYTES); fprintf (dump_file, "\n"); } --- 1364,1373 ---- SIMULTANEOUS_PREFETCHES); fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY); fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK); ! fprintf (dump_file, " L1 cache size: %d lines, %d kB\n", ! L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE); fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE); ! fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE); fprintf (dump_file, "\n"); } Index: config/i386/i386.h =================================================================== *** config/i386/i386.h (revision 125268) --- config/i386/i386.h (working copy) *************** struct processor_costs { *** 124,129 **** --- 124,131 ---- in SImode, DImode and TImode*/ const int mmxsse_to_integer; /* cost of moving mmxsse register to integer and vice versa. */ + const int l1_cache_size; /* size of l1 cache, in kilobytes. */ + const int l2_cache_size; /* size of l2 cache, in kilobytes. */ const int prefetch_block; /* bytes moved to cache for prefetch. */ const int simultaneous_prefetches; /* number of parallel prefetch operations. */ Index: config/i386/driver-i386.c =================================================================== *** config/i386/driver-i386.c (revision 125268) --- config/i386/driver-i386.c (working copy) *************** describe_cache (unsigned l1_sizekb, unsi *** 56,69 **** unsigned l1_assoc ATTRIBUTE_UNUSED) { char size[1000], line[1000]; - unsigned size_in_lines; /* At the moment, gcc middle-end does not use the information about the associativity of the cache. */ ! size_in_lines = (l1_sizekb * 1024) / l1_line; ! ! 
sprintf (size, "--param l1-cache-size=%u", size_in_lines); sprintf (line, "--param l1-cache-line-size=%u", l1_line); return concat (size, " ", line, " ", NULL); --- 56,66 ---- unsigned l1_assoc ATTRIBUTE_UNUSED) { char size[1000], line[1000]; /* At the moment, gcc middle-end does not use the information about the associativity of the cache. */ ! sprintf (size, "--param l1-cache-size=%u", l1_sizekb); sprintf (line, "--param l1-cache-line-size=%u", l1_line); return concat (size, " ", line, " ", NULL); Index: config/i386/i386.c =================================================================== *** config/i386/i386.c (revision 125268) --- config/i386/i386.c (working copy) *************** struct processor_costs size_cost = { /* *** 112,117 **** --- 112,119 ---- {3, 3, 3}, /* cost of storing SSE registers in SImode, DImode and TImode */ 3, /* MMX or SSE register to integer */ + 0, /* size of l1 cache */ + 0, /* size of l2 cache */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ 2, /* Branch cost */ *************** struct processor_costs i386_cost = { /* *** 170,175 **** --- 172,179 ---- {4, 8, 16}, /* cost of storing SSE registers in SImode, DImode and TImode */ 3, /* MMX or SSE register to integer */ + 0, /* size of l1 cache */ + 0, /* size of l2 cache */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ 1, /* Branch cost */ *************** struct processor_costs i486_cost = { /* *** 227,232 **** --- 231,240 ---- {4, 8, 16}, /* cost of storing SSE registers in SImode, DImode and TImode */ 3, /* MMX or SSE register to integer */ + 4, /* size of l1 cache. 486 has 8kB cache + shared for code and data, so 4kB is + not really precise. 
*/ + 4, /* size of l2 cache */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ 1, /* Branch cost */ *************** struct processor_costs pentium_cost = { *** 284,289 **** --- 292,299 ---- {4, 8, 16}, /* cost of storing SSE registers in SImode, DImode and TImode */ 3, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 8, /* size of l2 cache */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ 2, /* Branch cost */ *************** struct processor_costs pentiumpro_cost = *** 341,346 **** --- 351,358 ---- {2, 2, 8}, /* cost of storing SSE registers in SImode, DImode and TImode */ 3, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 256, /* size of l2 cache */ 32, /* size of prefetch block */ 6, /* number of parallel prefetches */ 2, /* Branch cost */ *************** struct processor_costs geode_cost = { *** 406,411 **** --- 418,425 ---- {1, 1, 1}, /* cost of storing SSE registers in SImode, DImode and TImode */ 1, /* MMX or SSE register to integer */ + 64, /* size of l1 cache. */ + 128, /* size of l2 cache. */ 32, /* size of prefetch block */ 1, /* number of parallel prefetches */ 1, /* Branch cost */ *************** struct processor_costs k6_cost = { *** 463,468 **** --- 477,487 ---- {2, 2, 8}, /* cost of storing SSE registers in SImode, DImode and TImode */ 6, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 32, /* size of l2 cache. Some models + have integrated l2 cache, but + optimizing for k6 is not important + enough to worry about that. */ 32, /* size of prefetch block */ 1, /* number of parallel prefetches */ 1, /* Branch cost */ *************** struct processor_costs athlon_cost = { *** 520,525 **** --- 539,546 ---- {4, 4, 5}, /* cost of storing SSE registers in SImode, DImode and TImode */ 5, /* MMX or SSE register to integer */ + 64, /* size of l1 cache. */ + 256, /* size of l2 cache. 
*/ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ 5, /* Branch cost */ *************** struct processor_costs k8_cost = { *** 580,585 **** --- 601,608 ---- {4, 4, 5}, /* cost of storing SSE registers in SImode, DImode and TImode */ 5, /* MMX or SSE register to integer */ + 64, /* size of l1 cache. */ + 512, /* size of l2 cache. */ 64, /* size of prefetch block */ /* New AMD processors never drop prefetches; if they cannot be performed immediately, they are queued. We set number of simultaneous prefetches *************** struct processor_costs amdfam10_cost = { *** 653,658 **** --- 676,683 ---- 1/1 1/1 MOVD reg32, xmmreg Double FADD 3 1/1 1/1 */ + 64, /* size of l1 cache. */ + 512, /* size of l2 cache. */ 64, /* size of prefetch block */ /* New AMD processors never drop prefetches; if they cannot be performed immediately, they are queued. We set number of simultaneous prefetches *************** struct processor_costs pentium4_cost = { *** 720,725 **** --- 745,752 ---- {2, 2, 8}, /* cost of storing SSE registers in SImode, DImode and TImode */ 10, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 256, /* size of l2 cache. */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ 2, /* Branch cost */ *************** struct processor_costs nocona_cost = { *** 778,783 **** --- 805,812 ---- {12, 12, 12}, /* cost of storing SSE registers in SImode, DImode and TImode */ 8, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 1024, /* size of l2 cache. */ 128, /* size of prefetch block */ 8, /* number of parallel prefetches */ 1, /* Branch cost */ *************** struct processor_costs core2_cost = { *** 837,842 **** --- 866,873 ---- {4, 4, 4}, /* cost of storing SSE registers in SImode, DImode and TImode */ 2, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 2048, /* size of l2 cache. 
*/ 128, /* size of prefetch block */ 8, /* number of parallel prefetches */ 3, /* Branch cost */ *************** struct processor_costs generic64_cost = *** 902,907 **** --- 933,940 ---- {8, 8, 8}, /* cost of storing SSE registers in SImode, DImode and TImode */ 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 512, /* size of l2 cache. */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ /* Benchmarks shows large regressions on K8 sixtrack benchmark when this value *************** struct processor_costs generic32_cost = *** 962,967 **** --- 995,1002 ---- {8, 8, 8}, /* cost of storing SSE registers in SImode, DImode and TImode */ 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ 3, /* Branch cost */ *************** override_options (void) *** 2406,2411 **** --- 2441,2450 ---- ix86_cost->simultaneous_prefetches); if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE)) set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block); + if (!PARAM_SET_P (PARAM_L1_CACHE_SIZE)) + set_param_value ("l1-cache-size", ix86_cost->l1_cache_size); + if (!PARAM_SET_P (PARAM_L2_CACHE_SIZE)) + set_param_value ("l2-cache-size", ix86_cost->l2_cache_size); } /* Return true if this goes in large data/bss. */ Index: params.def =================================================================== *** params.def (revision 125268) --- params.def (working copy) *************** DEFPARAM (PARAM_SIMULTANEOUS_PREFETCHES, *** 659,670 **** "The number of prefetches that can run at the same time", 3, 0, 0) ! /* The size of L1 cache in number of cache lines. */ DEFPARAM (PARAM_L1_CACHE_SIZE, "l1-cache-size", "The size of L1 cache", ! 1024, 0, 0) /* The size of L1 cache line in bytes. */ --- 659,670 ---- "The number of prefetches that can run at the same time", 3, 0, 0) ! /* The size of L1 cache in kB. 
*/ DEFPARAM (PARAM_L1_CACHE_SIZE, "l1-cache-size", "The size of L1 cache", ! 64, 0, 0) /* The size of L1 cache line in bytes. */ *************** DEFPARAM (PARAM_L1_CACHE_LINE_SIZE, *** 673,678 **** --- 673,685 ---- "The size of L1 cache line", 32, 0, 0) + /* The size of L2 cache in kB. */ + + DEFPARAM (PARAM_L2_CACHE_SIZE, + "l2-cache-size", + "The size of L2 cache", + 512, 0, 0) + #ifdef ENABLE_CHECKING # define GCC_CANONICAL_TYPES_DEFAULT 1 #else