From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-qk1-x732.google.com (mail-qk1-x732.google.com [IPv6:2607:f8b0:4864:20::732]) by sourceware.org (Postfix) with ESMTPS id 076E43858C83; Fri, 22 Apr 2022 21:40:35 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 076E43858C83 Received: by mail-qk1-x732.google.com with SMTP id j6so6750918qkp.9; Fri, 22 Apr 2022 14:40:35 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=jXJnUcHZOZpy2EIHVETmi7q4LxjvgKZs5xPpF5+xh9g=; b=T7wOVU47egRLXEw6u5RRNVYhHDbzdCZ5sOUTQLlgOSfAAbirYndlB4nsePjV/zHMWj /IndKwV40nQm49eTL0gmAoBGVqV3WtJoEZ1dO7LK7qaEk3/3tzuBKkkWt8IObiKKbp53 ItX+0doU+3yqRhCwlqGp3j6bYUHF9nnOF51QKCtrWNK6nS7sad6BdJNRTIOsV16Gs2ez ZQKYrvWPNYNOJnqDrxLYIxj0JSSdVChgBuF+/xOalc1IDD+ZaaAA+M7vkGTtIfB4ucFf sQVPs/dxzDArQICCIBipOmNS2CaAni5tFw+KG3sglrMVBeXJFuccekxQFbDNwovEg+Y6 /uTg== X-Gm-Message-State: AOAM531VwwCmNQrKTTINMWZehQ5iB1ODOkPTu1liMFhKrDIxtKKNHDFe pmU6uh67BxmSDpcGTc6TpSNClzQZgz1TbrENilP8iu1M X-Google-Smtp-Source: ABdhPJz3to70C0dPthbZOIDf2ktySSTvGh2yBXcF9SLninfO/uB29mbwV6/I7qgnLkjXk3rdKLmZslbR7sLqudcY7jA= X-Received: by 2002:a05:620a:1341:b0:69e:cd37:763c with SMTP id c1-20020a05620a134100b0069ecd37763cmr3948836qkl.284.1650663634107; Fri, 22 Apr 2022 14:40:34 -0700 (PDT) MIME-Version: 1.0 References: <20210821163631.138482-1-hjl.tools@gmail.com> In-Reply-To: <20210821163631.138482-1-hjl.tools@gmail.com> From: Sunil Pandey Date: Fri, 22 Apr 2022 14:39:58 -0700 Message-ID: Subject: Re: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ #28252] To: "H.J. Lu" , libc-stable@sourceware.org Cc: GNU C Library Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=3.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_ENVFROM_END_DIGIT, FREEMAIL_FROM, GIT_PATCH_0, HK_RANDOM_ENVFROM, HK_RANDOM_FROM, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, UNWANTED_LANGUAGE_BODY autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Level: *** X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-stable@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-stable mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 22 Apr 2022 21:40:37 -0000 On Sat, Aug 21, 2021 at 9:37 AM H.J. Lu via Libc-alpha wrote: > > Optimize loads of all bits set into ZMM register in AVX512 SVML codes > by replacing > > vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX > > and > > vmovups .L_2il0floatpacket.13(%rip), %zmmX > > with > vpternlogd $0xff, %zmmX, %zmmX, %zmmX > > This fixes BZ #28252. > --- > .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ > .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- > .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ > 10 files changed, 11 insertions(+), 64 deletions(-) > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > index c2cf007904..0fcb912557 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx) > vmovaps %zmm0, %zmm8 > > /* Check for large arguments path */ > - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 > + vpternlogd $0xff, %zmm2, %zmm2, %zmm2 > > /* > ARGUMENT RANGE REDUCTION: > @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx) > vmovsd %xmm0, 1216(%rsp,%r15) > jmp .LBL_2_7 > END (_ZGVeN8v_cos_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.16: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.16,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > index e9a5d00992..5596c950ce 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx) > > /* preserve mantissa, set input exponent to 2^(-10) */ > vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 > - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 > + vpternlogd $0xff, %zmm1, %zmm1, %zmm1 > vpsrlq $32, %zmm4, %zmm6 > > /* reciprocal approximation good to at least 11 bits */ > @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx) > vmovsd %xmm0, 1216(%rsp,%r15) > jmp .LBL_2_7 > END (_ZGVeN8v_log_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.12: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.12,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > index 508da563fe..2981f1582e 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx) > andq $-64, %rsp > subq $1280, %rsp > movq __svml_d_trig_data@GOTPCREL(%rip), %rax > - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 > + vpternlogd $0xff, %zmm1, %zmm1, %zmm14 > vmovups __dAbsMask(%rax), %zmm7 > vmovups __dInvPI(%rax), %zmm2 > vmovups __dRShifter(%rax), %zmm1 > @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx) > vmovsd %xmm0, 1216(%rsp,%r15) > jmp .LBL_2_7 > END (_ZGVeN8v_sin_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.14: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.14,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > index 965415f2bd..4ad366373b 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx) > > /* SinPoly = SinR*SinPoly */ > vfmadd213pd %zmm5, %zmm5, %zmm4 > - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > /* Update Cos result's sign */ > vxorpd %zmm2, %zmm1, %zmm1 > @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl) > ENTRY (_ZGVeN8vvv_sincos_skx) > WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx > END (_ZGVeN8vvv_sincos_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.15: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.15,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > index cdcb16087d..b7d79efb54 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx) > X = X - Y*PI1 - Y*PI2 - Y*PI3 > */ > vmovaps %zmm0, %zmm6 > - vmovups .L_2il0floatpacket.13(%rip), %zmm12 > + vpternlogd $0xff, %zmm12, %zmm12, %zmm12 > vmovups __sRShifter(%rax), %zmm3 > vmovups __sPI1_FMA(%rax), %zmm5 > vmovups __sA9_FMA(%rax), %zmm9 > @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx) > vmovss %xmm0, 1216(%rsp,%r15,8) > jmp .LBL_2_7 > END (_ZGVeN16v_cosf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.13: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.13,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > index 1b09909344..9f03b9b780 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx) > vmovaps %zmm0, %zmm7 > > /* compare against threshold */ > - vmovups .L_2il0floatpacket.13(%rip), %zmm3 > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > vmovups __sInvLn2(%rax), %zmm4 > vmovups __sShifter(%rax), %zmm1 > vmovups __sLn2hi(%rax), %zmm6 > @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx) > jmp .LBL_2_7 > > END (_ZGVeN16v_expf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.13: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.13,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > index 4a7b2adbbf..2ba38b0f33 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx) > andq $-64, %rsp > subq $1280, %rsp > movq __svml_slog_data@GOTPCREL(%rip), %rax > - vmovups .L_2il0floatpacket.7(%rip), %zmm6 > + vpternlogd $0xff, %zmm6, %zmm6, %zmm6 > vmovups _iBrkValue(%rax), %zmm4 > vmovups _sPoly_7(%rax), %zmm8 > > @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx) > jmp .LBL_2_7 > > END (_ZGVeN16v_logf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.7: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.7,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > index 7f906622a5..7f0272c809 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > vpsrlq $32, %zmm3, %zmm2 > vpmovqd %zmm2, %ymm11 > vcvtps2pd %ymm14, %zmm13 > - vmovups .L_2il0floatpacket.23(%rip), %zmm14 > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > vmovaps %zmm14, %zmm26 > vpandd _ABSMASK(%rax), %zmm1, %zmm8 > vpcmpd $1, _INF(%rax), %zmm8, %k2 > @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > vpmovqd %zmm11, %ymm5 > vpxord %zmm10, %zmm10, %zmm10 > vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} > - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 > + vpternlogd $0xff, %zmm4, %zmm4, %zmm4 > vpxord %zmm11, %zmm11, %zmm11 > vcvtdq2pd %ymm7, %zmm7 > vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} > @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx) > vmovss %xmm0, 1216(%rsp,%r15,8) > jmp .LBL_2_7 > END (_ZGVeN16vv_powf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.23: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.23,@object > -.L_2il0floatpacket.24: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.24,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > index 54cee3a537..e1d0154441 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx) > > /* Result sign calculations */ > vpternlogd $150, %zmm0, %zmm14, %zmm1 > - vmovups .L_2il0floatpacket.13(%rip), %zmm14 > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > /* Add correction term 0.5 for cos() part */ > vaddps %zmm8, %zmm5, %zmm15 > @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl) > ENTRY (_ZGVeN16vvv_sincosf_skx) > WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx > END (_ZGVeN16vvv_sincosf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.13: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.13,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > index ec65ffdce5..bcb76ff756 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx) > movq __svml_s_trig_data@GOTPCREL(%rip), %rax > > /* Check for large and special values */ > - vmovups .L_2il0floatpacket.11(%rip), %zmm14 > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > vmovups __sAbsMask(%rax), %zmm5 > vmovups __sInvPI(%rax), %zmm1 > vmovups __sRShifter(%rax), %zmm2 > @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx) > vmovss %xmm0, 1216(%rsp,%r15,8) > jmp .LBL_2_7 > END (_ZGVeN16v_sinf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.11: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.11,@object > -- > 2.31.1 > I would like to backport this patch to release branches. Any comments or objections? --Sunil