From: Jan Hubicka
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r11-11196] Update znver4 costs
X-Git-Refname: refs/heads/releases/gcc-11
X-Git-Oldrev: c9e9681a344b81308f746490cc6b9be242cfaa43
X-Git-Newrev: e9238dac5842246ec00a65fc8d744f2fd6714daf
Message-Id: <20240118150803.2BA623858C41@sourceware.org>
Date: Thu, 18 Jan 2024 15:08:02 +0000 (GMT)

https://gcc.gnu.org/g:e9238dac5842246ec00a65fc8d744f2fd6714daf

commit r11-11196-ge9238dac5842246ec00a65fc8d744f2fd6714daf
Author: Jan Hubicka
Date:   Thu Dec 22 02:16:24 2022 +0100

    Update znver4 costs

    Update costs for znver4, mostly based on data measured by Agner Fog.
    Compared to previous generations, x87 became a bit slower, which is
    probably not a big deal (and we have minimal benchmarking coverage
    for it).  One interesting improvement is the reduction of the FMA
    cost.  I also updated costs of AVX256 loads/stores based on
    latencies (not throughput, which is twice that of avx256).

    Overall, AVX512 vectorization seems to improve some TSVC benchmarks
    noticeably, but since 512-bit vectors are internally split into
    256-bit vectors it is somewhat risky and does not win in SPEC scores
    (mostly by regressing benchmarks with loops that have small trip
    counts, like x264 and exchange), so for now I am going to set the
    AVX256_OPTIMAL tune, but I am still playing with it.  We have
    improved since ZNVER1 in choosing the vectorization size and also
    have vectorized prologues/epilogues, so it may be possible to make
    AVX512 a small win overall.

    2022-12-22  Jan Hubicka

    	* config/i386/x86-tune-costs.h (znver4_cost): Update costs of
    	FP and SSE moves, division, multiplication, gathers, L2 cache
    	size, and more complex FP instructions.

    (cherry picked from commit bbe04bade0cc3b17e62c2af3d89b899367e7d2d1)
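For scale, the entries below use GCC's COSTS_N_INSNS macro, which in
gcc/rtl.h expands to N * 4 cost units.  With the numbers from this diff,
a fused multiply-add at COSTS_N_INSNS (4) comes out cheaper than a
separate multiply plus add at COSTS_N_INSNS (3) each, which is what the
reduced FMA cost buys.  A minimal standalone sketch of that arithmetic
(the macro is copied here purely for illustration, not pulled from GCC):

  #include <stdio.h>

  /* COSTS_N_INSNS as defined in gcc/rtl.h: cost of N fast insns.  */
  #define COSTS_N_INSNS(N) ((N) * 4)

  int
  main (void)
  {
    int mulss  = COSTS_N_INSNS (3);  /* cost of MULSS instruction.  */
    int addss  = COSTS_N_INSNS (3);  /* cost of ADDSS/SD insns.  */
    int fma_ss = COSTS_N_INSNS (4);  /* cost of FMA SS instruction.  */

    /* a * b + c: two separate ops vs. one fused multiply-add.  */
    printf ("mul+add = %d, fma = %d\n", mulss + addss, fma_ss);
    return 0;                        /* Prints: mul+add = 24, fma = 16 */
  }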
Diff:
---
 gcc/config/i386/x86-tune-costs.h | 134 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f2bcb..48100d10415 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1820,6 +1820,140 @@ struct processor_costs znver3_cost = {
   "16",                               /* Func alignment.  */
 };
 
+/* This table currently replicates znver3_cost table.  */
+struct processor_costs znver4_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2.  */
+
+  /* reg-reg moves are done by renaming and thus they are even cheaper than
+     1 cycle.  Because reg-reg move cost is 2 and following tables correspond
+     to doubles of latencies, we do not model this correctly.  It does not
+     seem to make practical difference to bump prices up even more.  */
+  6,                                  /* cost for loading QImode using
+                                         movzbl.  */
+  {6, 6, 6},                          /* cost of loading integer registers
+                                         in QImode, HImode and SImode.
+                                         Relative to reg-reg move (2).  */
+  {8, 8, 8},                          /* cost of storing integer
+                                         registers.  */
+  2,                                  /* cost of reg,reg fld/fst.  */
+  {14, 14, 17},                       /* cost of loading fp registers
+                                         in SFmode, DFmode and XFmode.  */
+  {12, 12, 16},                       /* cost of storing fp registers
+                                         in SFmode, DFmode and XFmode.  */
+  2,                                  /* cost of moving MMX register.  */
+  {6, 6},                             /* cost of loading MMX registers
+                                         in SImode and DImode.  */
+  {8, 8},                             /* cost of storing MMX registers
+                                         in SImode and DImode.  */
+  2, 2, 3,                            /* cost of moving XMM,YMM,ZMM
+                                         register.  */
+  {6, 6, 10, 10, 12},                 /* cost of loading SSE registers
+                                         in 32,64,128,256 and 512-bit.  */
+  {8, 8, 8, 12, 12},                  /* cost of storing SSE registers
+                                         in 32,64,128,256 and 512-bit.  */
+  6, 8,                               /* SSE->integer and integer->SSE
+                                         moves.  */
+  8, 8,                               /* mask->integer and integer->mask
+                                         moves.  */
+  {6, 6, 6},                          /* cost of loading mask register
+                                         in QImode, HImode, SImode.  */
+  {8, 8, 8},                          /* cost of storing mask register
+                                         in QImode, HImode, SImode.  */
+  2,                                  /* cost of moving mask register.  */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                  /* cost of an add instruction.  */
+  /* TODO: Lea with 3 components has cost 2.  */
+  COSTS_N_INSNS (1),                  /* cost of a lea instruction.  */
+  COSTS_N_INSNS (1),                  /* variable shift costs.  */
+  COSTS_N_INSNS (1),                  /* constant shift costs.  */
+  {COSTS_N_INSNS (3),                 /* cost of starting multiply for QI.  */
+   COSTS_N_INSNS (3),                 /*                               HI.  */
+   COSTS_N_INSNS (3),                 /*                               SI.  */
+   COSTS_N_INSNS (3),                 /*                               DI.  */
+   COSTS_N_INSNS (3)},                /*                            other.  */
+  0,                                  /* cost of multiply per each bit
+                                         set.  */
+  {COSTS_N_INSNS (12),                /* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (13),                /*                          HI.  */
+   COSTS_N_INSNS (13),                /*                          SI.  */
+   COSTS_N_INSNS (18),                /*                          DI.  */
+   COSTS_N_INSNS (18)},               /*                       other.  */
+  COSTS_N_INSNS (1),                  /* cost of movsx.  */
+  COSTS_N_INSNS (1),                  /* cost of movzx.  */
+  8,                                  /* "large" insn.  */
+  9,                                  /* MOVE_RATIO.  */
+  6,                                  /* CLEAR_RATIO.  */
+  {6, 6, 6},                          /* cost of loading integer registers
+                                         in QImode, HImode and SImode.
+                                         Relative to reg-reg move (2).  */
+  {8, 8, 8},                          /* cost of storing integer
+                                         registers.  */
+  {6, 6, 10, 10, 12},                 /* cost of loading SSE registers
+                                         in 32bit, 64bit, 128bit, 256bit
+                                         and 512bit.  */
+  {8, 8, 8, 12, 12},                  /* cost of storing SSE register
+                                         in 32bit, 64bit, 128bit, 256bit
+                                         and 512bit.  */
+  {6, 6, 6, 6, 6},                    /* cost of unaligned loads.  */
+  {8, 8, 8, 8, 8},                    /* cost of unaligned stores.  */
+  2, 2, 2,                            /* cost of moving XMM,YMM,ZMM
+                                         register.  */
+  6,                                  /* cost of moving SSE register to
+                                         integer.  */
+  /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
+     throughput 5.  Approx 7 uops do not depend on vector size and every load
+     is 5 uops.  */
+  14, 10,                             /* Gather load static, per_elt.  */
+  14, 20,                             /* Gather store static, per_elt.  */
+  32,                                 /* size of l1 cache.  */
+  1024,                               /* size of l2 cache.  */
+  64,                                 /* size of prefetch block.  */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set number of simultaneous prefetches
+     to a large constant to reflect this (it probably is not a good idea not
+     to limit number of prefetches at all, as their execution also takes some
+     time).  */
+  100,                                /* number of parallel prefetches.  */
+  3,                                  /* Branch cost.  */
+  COSTS_N_INSNS (7),                  /* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (7),                  /* cost of FMUL instruction.  */
+  /* Latency of fdiv is 8-15.  */
+  COSTS_N_INSNS (15),                 /* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),                  /* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),                  /* cost of FCHS instruction.  */
+  /* Latency of fsqrt is 4-10.  */
+  COSTS_N_INSNS (25),                 /* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (1),                  /* cost of cheap SSE instruction.  */
+  COSTS_N_INSNS (3),                  /* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (3),                  /* cost of MULSS instruction.  */
+  COSTS_N_INSNS (3),                  /* cost of MULSD instruction.  */
+  COSTS_N_INSNS (4),                  /* cost of FMA SS instruction.  */
+  COSTS_N_INSNS (4),                  /* cost of FMA SD instruction.  */
+  COSTS_N_INSNS (13),                 /* cost of DIVSS instruction.  */
+  /* 9-13.  */
+  COSTS_N_INSNS (13),                 /* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (15),                 /* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (21),                 /* cost of SQRTSD instruction.  */
+  /* Zen can execute 4 integer operations per cycle.  FP operations
+     take 3 cycles and it can execute 2 integer additions and 2
+     multiplications, thus reassociation may make sense up to a width
+     of 6.  SPEC2k6 benchmarks suggest that 4 works better than 6,
+     probably due to register pressure.
+
+     Integer vector operations are taken by FP unit and execute 3 vector
+     plus/minus operations per cycle but only one multiply.  This is
+     adjusted in ix86_reassociation_width.  */
+  4, 4, 3, 6,                         /* reassoc int, fp, vec_int, vec_fp.  */
+  znver2_memcpy,
+  znver2_memset,
+  COSTS_N_INSNS (4),                  /* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),                  /* cond_not_taken_branch_cost.  */
+  "16",                               /* Loop alignment.  */
+  "16",                               /* Jump alignment.  */
+  "0:0:8",                            /* Label alignment.  */
+  "16",                               /* Func alignment.  */
+};
+
 /* skylake_cost should produce code tuned for Skylake family of CPUs.  */
 static stringop_algs skylake_memcpy[2] = {
   {libcall,
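The gather entries above appear to encode the uop estimates from the
table's own comment under its doubled-latency convention: roughly 7
fixed uops plus 5 uops per loaded element become the static cost 14 and
per-element cost 10.  A small sketch of that additive accounting (the
helper below is illustrative only, not a GCC function):

  #include <stdio.h>

  /* Hypothetical helper mirroring the znver4 gather-load entries:
     total = static overhead + per-element cost * number of lanes,
     in the table's doubled units (14 ~ 2*7 uops, 10 ~ 2*5 uops).  */
  static int
  gather_load_cost (int nelts)
  {
    const int gather_static  = 14;  /* Gather load static.  */
    const int gather_per_elt = 10;  /* Gather load per_elt.  */
    return gather_static + gather_per_elt * nelts;
  }

  int
  main (void)
  {
    /* Two-element gather: 14 + 2 * 10 = 34 units, i.e. twice the
       17 uops cited for VGATHERDPD in the table's comment.  */
    printf ("%d\n", gather_load_cost (2));
    return 0;
  }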