From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail.loongson.cn (mail.loongson.cn [114.242.206.163]) by sourceware.org (Postfix) with ESMTP id 687C33858C56 for ; Fri, 2 Feb 2024 01:23:50 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 687C33858C56 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=loongson.cn Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=loongson.cn ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 687C33858C56 Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=114.242.206.163 ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1706837040; cv=none; b=P+axWPUlwlSSg+nCiPTMqHJ0TDIKBSE2Xv1u4R9zdPtq3U9PssQgKc/kDnfYvltPE9/itiDmJ19UuErJUNUfY53jrvqAYjPSeHF3uIV2lHEpw35Ufn302/LMwT6dyfImTD3QpGh8XfVda8aU79DFQjdRhn8rfIH8TYpjFag/UaY= ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1706837040; c=relaxed/simple; bh=R8RiiwXeYW1z6O7CF7IikV+2hoYcKQZbsp70y1WjKlo=; h=Subject:To:From:Message-ID:Date:MIME-Version; b=Sns4EtsMmQ1L2rkAZK+wh0KYgIYhIQ9SsMa/xEka7WH8R06KoQUmpLi8JKVG1x5enLXneMpev5S4jpC9+mJUUGSUUbJ1PgnbAp1gYDRqKqxYpJe33OBFeffjrOCS/1ZJZPE1Xr6aCP5FqA1lMQxqHUY/+upa3lz9VlGiS3hp7k8= ARC-Authentication-Results: i=1; server2.sourceware.org Received: from loongson.cn (unknown [10.20.4.107]) by gateway (Coremail) with SMTP id _____8DxWPAjRLxlivUJAA--.28655S3; Fri, 02 Feb 2024 09:23:47 +0800 (CST) Received: from [10.20.4.107] (unknown [10.20.4.107]) by localhost.localdomain (Coremail) with SMTP id AQAAf8Cx_c4iRLxlpbssAA--.40286S3; Fri, 02 Feb 2024 09:23:47 +0800 (CST) Subject: Re:[pushed] [PATCH v2] LoongArch: Adjust cost of vector_stmt that match multiply-add pattern. 
To: Li Wei , gcc-patches@gcc.gnu.org Cc: xry111@xry111.site, i@xen0n.name, xuchenghua@loongson.cn References: <20240126084111.1811519-1-liwei@loongson.cn> From: chenglulu Message-ID: <6185ffcd-a553-28b4-2a08-d224729809b3@loongson.cn> Date: Fri, 2 Feb 2024 09:23:46 +0800 User-Agent: Mozilla/5.0 (X11; Linux loongarch64; rv:68.0) Gecko/20100101 Thunderbird/68.7.0 MIME-Version: 1.0 In-Reply-To: <20240126084111.1811519-1-liwei@loongson.cn> Content-Type: text/plain; charset=gbk; format=flowed Content-Transfer-Encoding: 8bit Content-Language: en-US X-CM-TRANSID:AQAAf8Cx_c4iRLxlpbssAA--.40286S3 X-CM-SenderInfo: xfkh0wpoxo3qxorr0wxvrqhubq/ X-Coremail-Antispam: 1Uk129KBj93XoW3Gr4kXF4kAFyxJr1DCryUurX_yoW7tF4xpa 9IkryfJFW8Aa47G3Z7JF4rXr13A34xK3W3WasIk348Cw4DCa4aqw4Ut34UZF47J34jgr1S q3WkAF4DCa1vyagCm3ZEXasCq-sJn29KB7ZKAUJUUUUU529EdanIXcx71UUUUU7KY7ZEXa sCq-sGcSsGvfJ3Ic02F40EFcxC0VAKzVAqx4xG6I80ebIjqfuFe4nvWSU5nxnvy29KBjDU 0xBIdaVrnRJUUUv0b4IE77IF4wAFF20E14v26r1j6r4UM7CY07I20VC2zVCF04k26cxKx2 IYs7xG6rWj6s0DM7CIcVAFz4kK6r1j6r18M28lY4IEw2IIxxk0rwA2F7IY1VAKz4vEj48v e4kI8wA2z4x0Y4vE2Ix0cI8IcVAFwI0_Gr0_Xr1l84ACjcxK6xIIjxv20xvEc7CjxVAFwI 0_Gr0_Cr1l84ACjcxK6I8E87Iv67AKxVW8JVWxJwA2z4x0Y4vEx4A2jsIEc7CjxVAFwI0_ Gr0_Gr1UM2AIxVAIcxkEcVAq07x20xvEncxIr21l57IF6xkI12xvs2x26I8E6xACxx1l5I 8CrVACY4xI64kE6c02F40Ex7xfMcIj6xIIjxv20xvE14v26r126r1DMcIj6I8E87Iv67AK xVWUJVW8JwAm72CE4IkC6x0Yz7v_Jr0_Gr1lF7xvr2IY64vIr41lc7I2V7IY0VAS07AlzV AYIcxG8wCF04k20xvY0x0EwIxGrwCFx2IqxVCFs4IE7xkEbVWUJVW8JwC20s026c02F40E 14v26r1j6r18MI8I3I0E7480Y4vE14v26r106r1rMI8E67AF67kF1VAFwI0_JF0_Jw1lIx kGc2Ij64vIr41lIxAIcVC0I7IYx2IY67AKxVWUCVW8JwCI42IY6xIIjxv20xvEc7CjxVAF wI0_Jr0_Gr1lIxAIcVCF04k26cxKx2IYs7xG6r1j6r1xMIIF0xvEx4A2jsIE14v26r1j6r 4UMIIF0xvEx4A2jsIEc7CjxVAFwI0_Jr0_GrUvcSsGvfC2KfnxnUUI43ZEXa7IU1LiSJUU UUU== X-Spam-Status: No, score=-11.1 required=5.0 tests=BAYES_00,GIT_PATCH_0,KAM_DMARC_STATUS,MIME_CHARSET_FARAWAY,NICE_REPLY_A,SPF_HELO_NONE,SPF_PASS,TXREP,T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: Pushed to r14-8722. 在 2024/1/26 下午4:41, Li Wei 写道: > We found that when only 128-bit vectorization was enabled, 549.fotonik3d_r > failed to vectorize effectively. For this reason, we adjust the cost of > 128-bit vector_stmt that match the multiply-add pattern to facilitate 128-bit > vectorization. > The experimental results show that after the modification, 549.fotonik3d_r > performance can be improved by 9.77% under the 128-bit vectorization option. > > gcc/ChangeLog: > > * config/loongarch/loongarch.cc (loongarch_multiply_add_p): New. > (loongarch_vector_costs::add_stmt_cost): Adjust. > > gcc/testsuite/ChangeLog: > > * gfortran.dg/vect/vect-10.f90: New test. > --- > gcc/config/loongarch/loongarch.cc | 48 +++++++++++++++ > gcc/testsuite/gfortran.dg/vect/vect-10.f90 | 71 ++++++++++++++++++++++ > 2 files changed, 119 insertions(+) > create mode 100644 gcc/testsuite/gfortran.dg/vect/vect-10.f90 > > diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc > index b494040d165..4d99e30828b 100644 > --- a/gcc/config/loongarch/loongarch.cc > +++ b/gcc/config/loongarch/loongarch.cc > @@ -4096,6 +4096,37 @@ loongarch_vector_costs::determine_suggested_unroll_factor (loop_vec_info loop_vi > return 1 << ceil_log2 (uf); > } > > +/* Check if assign stmt rhs op comes from a multiply-add operation. 
*/ > +static bool > +loongarch_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info) > +{ > + gassign *assign = dyn_cast<gassign *> (stmt_info->stmt); > + if (!assign) > + return false; > + tree_code code = gimple_assign_rhs_code (assign); > + if (code != PLUS_EXPR && code != MINUS_EXPR) > + return false; > + > + auto is_mul_result = [&](int i) > + { > + tree rhs = gimple_op (assign, i); > + if (TREE_CODE (rhs) != SSA_NAME) > + return false; > + > + stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); > + if (!def_stmt_info > + || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def) > + return false; > + gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt); > + if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR) > + return false; > + > + return true; > + }; > + > + return is_mul_result (1) || is_mul_result (2); > +} > + > unsigned > loongarch_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, > stmt_vec_info stmt_info, slp_tree, > @@ -4108,6 +4139,23 @@ loongarch_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, > { > int stmt_cost = loongarch_builtin_vectorization_cost (kind, vectype, > misalign); > + if (vectype && stmt_info) > + { > + gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)); > + machine_mode mode = TYPE_MODE (vectype); > + > + /* We found through testing that this strategy (the stmt that > + matches the multiply-add pattern) has positive returns only > + when applied to the 128-bit vector stmt, so this restriction > + is currently made. 
*/ > + if (kind == vector_stmt && GET_MODE_SIZE (mode) == 16 && assign) > + { > + if (!vect_is_reduction (stmt_info) > + && loongarch_multiply_add_p (m_vinfo, stmt_info)) > + stmt_cost = 0; > + } > + } > + > retval = adjust_cost_for_freq (stmt_info, where, count * stmt_cost); > m_costs[where] += retval; > > diff --git a/gcc/testsuite/gfortran.dg/vect/vect-10.f90 b/gcc/testsuite/gfortran.dg/vect/vect-10.f90 > new file mode 100644 > index 00000000000..b85bc2702a3 > --- /dev/null > +++ b/gcc/testsuite/gfortran.dg/vect/vect-10.f90 > @@ -0,0 +1,71 @@ > +! { dg-do compile } > +! { dg-additional-options "-Ofast -mlsx -fvect-cost-model=dynamic" { target loongarch64*-*-* } } > + > +MODULE material_mod > + > +IMPLICIT NONE > + > +integer, parameter :: dfp = selected_real_kind (13, 99) > +integer, parameter :: rfp = dfp > + > +PUBLIC Mat_updateE, iepx, iepy, iepz > + > +PRIVATE > + > +integer, dimension (:, :, :), allocatable :: iepx, iepy, iepz > +real (kind = rfp), dimension (:), allocatable :: Dbdx, Dbdy, Dbdz > +integer :: imin, jmin, kmin > +integer, dimension (6) :: Exsize > +integer, dimension (6) :: Eysize > +integer, dimension (6) :: Ezsize > +integer, dimension (6) :: Hxsize > +integer, dimension (6) :: Hysize > +integer, dimension (6) :: Hzsize > + > +CONTAINS > + > +SUBROUTINE mat_updateE (nx, ny, nz, Hx, Hy, Hz, Ex, Ey, Ez) > + > +integer, intent (in) :: nx, ny, nz > + > +real (kind = rfp), intent (inout), & > + dimension (Exsize (1) : Exsize (2), Exsize (3) : Exsize (4), Exsize (5) : Exsize (6)) :: Ex > +real (kind = rfp), intent (inout), & > + dimension (Eysize (1) : Eysize (2), Eysize (3) : Eysize (4), Eysize (5) : Eysize (6)) :: Ey > +real (kind = rfp), intent (inout), & > + dimension (Ezsize (1) : Ezsize (2), Ezsize (3) : Ezsize (4), Ezsize (5) : Ezsize (6)) :: Ez > +real (kind = rfp), intent (in), & > + dimension (Hxsize (1) : Hxsize (2), Hxsize (3) : Hxsize (4), Hxsize (5) : Hxsize (6)) :: Hx > +real (kind = rfp), intent (in), & > + dimension (Hysize (1) : 
Hysize (2), Hysize (3) : Hysize (4), Hysize (5) : Hysize (6)) :: Hy > +real (kind = rfp), intent (in), & > + dimension (Hzsize (1) : Hzsize (2), Hzsize (3) : Hzsize (4), Hzsize (5) : Hzsize (6)) :: Hz > + > +integer :: i, j, k, mp > + > +do k = kmin, nz > + do j = jmin, ny > + do i = imin, nx > + mp = iepx (i, j, k) > + Ex (i, j, k) = Ex (i, j, k) + & > + Dbdy (mp) * (Hz (i, j, k ) - Hz (i, j-1, k)) + & > + Dbdz (mp) * (Hy (i, j, k-1) - Hy (i, j , k)) > + > + mp = iepy (i, j, k) > + Ey (i, j, k) = Ey (i, j, k) + & > + Dbdz (mp) * (Hx (i , j, k) - Hx (i, j, k-1)) + & > + Dbdx (mp) * (Hz (i-1, j, k) - Hz (i, j, k )) > + > + mp = iepz (i, j, k) > + Ez (i, j, k) = Ez (i, j, k) + & > + Dbdx (mp) * (Hy (i, j , k) - Hy (i-1, j, k)) + & > + Dbdy (mp) * (Hx (i, j-1, k) - Hx (i , j, k)) > + end do > + end do > +end do > + > +END SUBROUTINE mat_updateE > + > +END MODULE material_mod > + > +! { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target loongarch64*-*-* } } }