From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-yb1-xb31.google.com (mail-yb1-xb31.google.com [IPv6:2607:f8b0:4864:20::b31]) by sourceware.org (Postfix) with ESMTPS id 75E113858C83; Fri, 22 Apr 2022 21:42:20 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 75E113858C83 Received: by mail-yb1-xb31.google.com with SMTP id j2so16849691ybu.0; Fri, 22 Apr 2022 14:42:20 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=q/tpUTE3+hZpQblDLgkLhtz1yY3LBZYEZnyGNHTUz9w=; b=mpJWcbDdTP3TZJbSKJ35PsQZkVQ4JdA7+ub5DPl4PXIytPg1y0q5EW3PwT0oWEUUR0 qnyNpxiuT7kiJVqg0Q/V38IR0pBM7WSDbAqtbQT0e6jzmTSdtCG3zi2HbHLP+qhGrm95 HR8tpc7MtfEiaB5aJkkqSuOR3j8LB0pK5AbU0E+ds/D+Srh3aUovnOlADoXjf0mQLhYV aZv8fRK11OI3W0YUgzCpL084ZJJwLpIX3Hbfjb+vJwaApQnq1FWvFIMiqolUlF37b7bc F3ytj8BMaVvzNa0wL7w6t5tEk3N9yQmRj9AWyAunF43OgF+FbYs1WwHsFNRyy+TxOxRo NEKg== X-Gm-Message-State: AOAM532cj9z/lNJ1kxethv7WichD03aGm958GU/1/lrLmj2Saz7ZYBRL 8pgkxdEXfmWGBB5JYvg/1ucGZitV1p67laf0jKUmtw/k X-Google-Smtp-Source: ABdhPJymc1iUGKVo6yUjXBkCrmA+mIqMx8d+ONM5310Zd2CT461HFR0ETcl8+A3TqKboWycKrbQH/ndOyPxreGPKJLw= X-Received: by 2002:a25:7544:0:b0:629:33a2:b142 with SMTP id q65-20020a257544000000b0062933a2b142mr6802630ybc.136.1650663739717; Fri, 22 Apr 2022 14:42:19 -0700 (PDT) MIME-Version: 1.0 References: <20210821163631.138482-1-hjl.tools@gmail.com> In-Reply-To: From: Noah Goldstein Date: Fri, 22 Apr 2022 16:42:08 -0500 Message-ID: Subject: Re: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ #28252] To: Sunil Pandey Cc: "H.J. 
Lu" , Libc-stable Mailing List , GNU C Library Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=1.8 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, UNWANTED_LANGUAGE_BODY autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Level: * X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 22 Apr 2022 21:42:22 -0000 On Fri, Apr 22, 2022 at 4:40 PM Sunil Pandey via Libc-alpha wrote: > > On Sat, Aug 21, 2021 at 9:37 AM H.J. Lu via Libc-alpha > wrote: > > > > Optimize loads of all bits set into ZMM registers in AVX512 SVML code > > by replacing > > > > vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX > > > > and > > > > vmovups .L_2il0floatpacket.13(%rip), %zmmX > > > > with > > vpternlogd $0xff, %zmmX, %zmmX, %zmmX > > > > This fixes BZ #28252. 
> > --- > > .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ > > .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- > > .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ > > 10 files changed, 11 insertions(+), 64 deletions(-) > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > > index c2cf007904..0fcb912557 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > > @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx) > > vmovaps %zmm0, %zmm8 > > > > /* Check for large arguments path */ > > - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 > > + vpternlogd $0xff, %zmm2, %zmm2, %zmm2 > > > > /* > > ARGUMENT RANGE REDUCTION: > > @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx) > > vmovsd %xmm0, 1216(%rsp,%r15) > > jmp .LBL_2_7 > > END (_ZGVeN8v_cos_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.16: > > - .long 0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.16,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > > index e9a5d00992..5596c950ce 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > > @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx) > > > > /* preserve mantissa, set input exponent to 2^(-10) */ > > vpternlogq 
$248, _ExpMask(%rax), %zmm3, %zmm2 > > - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 > > + vpternlogd $0xff, %zmm1, %zmm1, %zmm1 > > vpsrlq $32, %zmm4, %zmm6 > > > > /* reciprocal approximation good to at least 11 bits */ > > @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx) > > vmovsd %xmm0, 1216(%rsp,%r15) > > jmp .LBL_2_7 > > END (_ZGVeN8v_log_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.12: > > - .long 0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.12,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > > index 508da563fe..2981f1582e 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > > @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx) > > andq $-64, %rsp > > subq $1280, %rsp > > movq __svml_d_trig_data@GOTPCREL(%rip), %rax > > - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 > > + vpternlogd $0xff, %zmm1, %zmm1, %zmm14 > > vmovups __dAbsMask(%rax), %zmm7 > > vmovups __dInvPI(%rax), %zmm2 > > vmovups __dRShifter(%rax), %zmm1 > > @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx) > > vmovsd %xmm0, 1216(%rsp,%r15) > > jmp .LBL_2_7 > > END (_ZGVeN8v_sin_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.14: > > - .long 0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.14,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > > index 965415f2bd..4ad366373b 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > > @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx) > > > > /* SinPoly = SinR*SinPoly */ > > vfmadd213pd %zmm5, %zmm5, %zmm4 > > - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 > > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > > > /* Update Cos result's sign */ > > vxorpd %zmm2, 
%zmm1, %zmm1 > > @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl) > > ENTRY (_ZGVeN8vvv_sincos_skx) > > WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx > > END (_ZGVeN8vvv_sincos_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.15: > > - .long 0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.15,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > > index cdcb16087d..b7d79efb54 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > > @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx) > > X = X - Y*PI1 - Y*PI2 - Y*PI3 > > */ > > vmovaps %zmm0, %zmm6 > > - vmovups .L_2il0floatpacket.13(%rip), %zmm12 > > + vpternlogd $0xff, %zmm12, %zmm12, %zmm12 > > vmovups __sRShifter(%rax), %zmm3 > > vmovups __sPI1_FMA(%rax), %zmm5 > > vmovups __sA9_FMA(%rax), %zmm9 > > @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx) > > vmovss %xmm0, 1216(%rsp,%r15,8) > > jmp .LBL_2_7 > > END (_ZGVeN16v_cosf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.13: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.13,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > > index 1b09909344..9f03b9b780 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > > @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx) > > vmovaps %zmm0, %zmm7 > > > > /* compare against threshold */ > > - vmovups .L_2il0floatpacket.13(%rip), %zmm3 > > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > vmovups __sInvLn2(%rax), %zmm4 > > vmovups __sShifter(%rax), %zmm1 > > vmovups __sLn2hi(%rax), %zmm6 > 
> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx) > > jmp .LBL_2_7 > > > > END (_ZGVeN16v_expf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.13: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.13,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > > index 4a7b2adbbf..2ba38b0f33 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > > @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx) > > andq $-64, %rsp > > subq $1280, %rsp > > movq __svml_slog_data@GOTPCREL(%rip), %rax > > - vmovups .L_2il0floatpacket.7(%rip), %zmm6 > > + vpternlogd $0xff, %zmm6, %zmm6, %zmm6 > > vmovups _iBrkValue(%rax), %zmm4 > > vmovups _sPoly_7(%rax), %zmm8 > > > > @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx) > > jmp .LBL_2_7 > > > > END (_ZGVeN16v_logf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.7: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.7,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > > index 7f906622a5..7f0272c809 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > > @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > > vpsrlq $32, %zmm3, %zmm2 > > vpmovqd %zmm2, %ymm11 > > vcvtps2pd %ymm14, %zmm13 > > - vmovups .L_2il0floatpacket.23(%rip), %zmm14 > > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > vmovaps %zmm14, %zmm26 > > vpandd _ABSMASK(%rax), %zmm1, 
%zmm8 > > vpcmpd $1, _INF(%rax), %zmm8, %k2 > > @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > > vpmovqd %zmm11, %ymm5 > > vpxord %zmm10, %zmm10, %zmm10 > > vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} > > - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 > > + vpternlogd $0xff, %zmm4, %zmm4, %zmm4 > > vpxord %zmm11, %zmm11, %zmm11 > > vcvtdq2pd %ymm7, %zmm7 > > vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} > > @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx) > > vmovss %xmm0, 1216(%rsp,%r15,8) > > jmp .LBL_2_7 > > END (_ZGVeN16vv_powf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.23: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.23,@object > > -.L_2il0floatpacket.24: > > - .long 0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.24,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > > index 54cee3a537..e1d0154441 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > > @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx) > > > > /* Result sign calculations */ > > vpternlogd $150, %zmm0, %zmm14, %zmm1 > > - vmovups .L_2il0floatpacket.13(%rip), %zmm14 > > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > > > /* Add correction term 0.5 for cos() part */ > > vaddps %zmm8, %zmm5, %zmm15 > > @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl) > > ENTRY (_ZGVeN16vvv_sincosf_skx) > > WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx > > END (_ZGVeN16vvv_sincosf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.13: > > - .long 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.13,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > > index ec65ffdce5..bcb76ff756 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > > @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx) > > movq __svml_s_trig_data@GOTPCREL(%rip), %rax > > > > /* Check for large and special values */ > > - vmovups .L_2il0floatpacket.11(%rip), %zmm14 > > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > vmovups __sAbsMask(%rax), %zmm5 > > vmovups __sInvPI(%rax), %zmm1 > > vmovups __sRShifter(%rax), %zmm2 > > @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx) > > vmovss %xmm0, 1216(%rsp,%r15,8) > > jmp .LBL_2_7 > > END (_ZGVeN16v_sinf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.11: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.11,@object > > -- > > 2.31.1 > > > > I would like to backport this patch to release branches. > Any comments or objections? None by me > > --Sunil