From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pg1-x529.google.com (mail-pg1-x529.google.com [IPv6:2607:f8b0:4864:20::529]) by sourceware.org (Postfix) with ESMTPS id 2C92E3858417 for ; Sat, 21 Aug 2021 23:30:27 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 2C92E3858417 Received: by mail-pg1-x529.google.com with SMTP id c17so12983588pgc.0 for ; Sat, 21 Aug 2021 16:30:27 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=oLLxoIs+1BTXfYOp2zN8KveJBJVhWsXXecm3gCIa6hY=; b=HFaI8DKz9ESYxm0KVGeC7aLIFUNM9UcYO47eaJPl2B8Q9IpvdX0Cv25Y0ylMDJH5ww Is5S9Y/R+pk1vdJLbClBBoylLTOClNBL5rp87+65Ws0+HYRMCC8n3sYkf8tQK/wwg+pA 3O5R55q8WWW2E1f8ifU7rVsR6g3z82+ldPZRo3qxn3WMB9SVMAJW5EVLtLz28LZ1MQTa jvBEQTcuGl3bh+fy//rH8ijqpgi7AjfmNE+TNYa+LOva+DcpeRNQ/X9uloet1U9qGrmX v3vucn17oP3oiWirbW/osQM5FMH3Y/36Agq96qTq+pB/jCSrKcua03FJJlu7+09em0PI C5tw== X-Gm-Message-State: AOAM532oqdUcd2I0Qe9p6zZIa9IDwP4RJCT0i9hTlPFLGN5DrIvEWFm6 tA1gNf4AXHis65LbpEkVx4bSZWUGuyRZdgqdhog= X-Google-Smtp-Source: ABdhPJwdAJgX3rUl94WpO0DrpHxK7x5dyioDrE/VTA4VjMSwVd9q7gqE1rZXQk6na26F/014Uzuv6eMqZloq85A/dMY= X-Received: by 2002:a05:6a00:d41:b0:3e1:3316:2ef with SMTP id n1-20020a056a000d4100b003e1331602efmr27003742pfv.40.1629588626187; Sat, 21 Aug 2021 16:30:26 -0700 (PDT) MIME-Version: 1.0 References: <20210821163631.138482-1-hjl.tools@gmail.com> In-Reply-To: From: Noah Goldstein Date: Sat, 21 Aug 2021 19:30:15 -0400 Message-ID: Subject: Re: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ #28252] To: "H.J. 
Lu" Cc: GNU C Library X-Spam-Status: No, score=-0.0 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, HTML_MESSAGE, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, UNWANTED_LANGUAGE_BODY autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org Content-Type: text/plain; charset="UTF-8" X-Content-Filtered-By: Mailman/MimeDel 2.1.29 X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 21 Aug 2021 23:30:30 -0000 On Sat, Aug 21, 2021 at 2:09 PM H.J. Lu wrote: > On Sat, Aug 21, 2021 at 10:49 AM Noah Goldstein > wrote: > > > > > > > > On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha < > libc-alpha@sourceware.org> wrote: > >> > >> Optimize loads of all bits set into ZMM register in AVX512 SVML codes > >> by replacing > >> > >> vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX > >> > >> and > >> > >> vmovups .L_2il0floatpacket.13(%rip), %zmmX > >> > >> with > >> vpternlogd $0xff, %zmmX, %zmmX, %zmmX > >> > >> This fixes BZ #28252. 
> >> --- > >> .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ > >> .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- > >> .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ > >> 10 files changed, 11 insertions(+), 64 deletions(-) > >> > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > >> index c2cf007904..0fcb912557 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > >> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx) > >> vmovaps %zmm0, %zmm8 > >> > >> /* Check for large arguments path */ > >> - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 > >> + vpternlogd $0xff, %zmm2, %zmm2, %zmm2 > > > > Looking at the code it seems like this is used later by > > > > vpandnq %zmm1, %zmm1, %zmm2{%k1} > > > > AFAICT you can make the vpternlogd down there and just use > > > > vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z} > >> > >> > >> /* > >> ARGUMENT RANGE REDUCTION: > >> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx) > >> vmovsd %xmm0, 1216(%rsp,%r15) > >> jmp .LBL_2_7 > >> END (_ZGVeN8v_cos_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.16: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.16,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > >> index 
e9a5d00992..5596c950ce 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > >> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx) > >> > >> /* preserve mantissa, set input exponent to 2^(-10) */ > >> vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 > >> - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 > >> + vpternlogd $0xff, %zmm1, %zmm1, %zmm1 > > > > > > Earlier in the function there is a dependency-breaking > > > > kxnorw %k3, %k3, %k3 > > > > so I think you can accomplish the same thing while also breaking > > any unlucky dep chain with: > > > > vpmovm2d %k3, %zmm2 > > > >> vpsrlq $32, %zmm4, %zmm6 > >> > >> /* reciprocal approximation good to at least 11 bits */ > >> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx) > >> vmovsd %xmm0, 1216(%rsp,%r15) > >> jmp .LBL_2_7 > >> END (_ZGVeN8v_log_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.12: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.12,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > >> index 508da563fe..2981f1582e 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > >> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx) > >> andq $-64, %rsp > >> subq $1280, %rsp > >> movq __svml_d_trig_data@GOTPCREL(%rip), %rax > >> - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 > >> + vpternlogd $0xff, %zmm1, %zmm1, %zmm14 > > > > This one also seems to just be used by a vpandn later on: > > > > vpandnq %zmm13, %zmm13, %zmm14{%k1} > > > > so maybe: > > vpternlogq $0xff, %zmm13, %zmm13, %zmm14{%k1}{z} > > instead of the vpandn. 
> > > >> vmovups __dAbsMask(%rax), %zmm7 > >> vmovups __dInvPI(%rax), %zmm2 > >> vmovups __dRShifter(%rax), %zmm1 > >> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx) > >> vmovsd %xmm0, 1216(%rsp,%r15) > >> jmp .LBL_2_7 > >> END (_ZGVeN8v_sin_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.14: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.14,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > >> index 965415f2bd..4ad366373b 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > >> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx) > >> > >> /* SinPoly = SinR*SinPoly */ > >> vfmadd213pd %zmm5, %zmm5, %zmm4 > >> - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 > >> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > > > > > Also vpandn below: > > vpandnq %zmm7, %zmm7, %zmm3{%k1} > > > >> > >> /* Update Cos result's sign */ > >> vxorpd %zmm2, %zmm1, %zmm1 > >> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl) > >> ENTRY (_ZGVeN8vvv_sincos_skx) > >> WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx > >> END (_ZGVeN8vvv_sincos_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.15: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.15,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > >> index cdcb16087d..b7d79efb54 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > >> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx) > >> X = X - Y*PI1 - Y*PI2 - Y*PI3 > >> */ > >> vmovaps %zmm0, %zmm6 > >> - vmovups .L_2il0floatpacket.13(%rip), %zmm12 > >> + vpternlogd $0xff, %zmm12, %zmm12, %zmm12 > > > > > > Also vpandn below: > > vpandnd %zmm1, %zmm1, 
%zmm12{%k1} > > > >> > >> vmovups __sRShifter(%rax), %zmm3 > >> vmovups __sPI1_FMA(%rax), %zmm5 > >> vmovups __sA9_FMA(%rax), %zmm9 > >> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx) > >> vmovss %xmm0, 1216(%rsp,%r15,8) > >> jmp .LBL_2_7 > >> END (_ZGVeN16v_cosf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.13: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.13,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > >> index 1b09909344..9f03b9b780 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > >> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx) > >> vmovaps %zmm0, %zmm7 > >> > >> /* compare against threshold */ > >> - vmovups .L_2il0floatpacket.13(%rip), %zmm3 > >> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > > > > > Also below: > > vpandnd %zmm2, %zmm2, %zmm3{%k1} > >> > >> vmovups __sInvLn2(%rax), %zmm4 > >> vmovups __sShifter(%rax), %zmm1 > >> vmovups __sLn2hi(%rax), %zmm6 > >> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx) > >> jmp .LBL_2_7 > >> > >> END (_ZGVeN16v_expf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.13: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.13,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > >> index 4a7b2adbbf..2ba38b0f33 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > >> @@ 
-228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx) > >> andq $-64, %rsp > >> subq $1280, %rsp > >> movq __svml_slog_data@GOTPCREL(%rip), %rax > >> - vmovups .L_2il0floatpacket.7(%rip), %zmm6 > >> + vpternlogd $0xff, %zmm6, %zmm6, %zmm6 > > > > Also below: > > vpandnd %zmm1, %zmm1, %zmm6{%k1} > > > >> > >> vmovups _iBrkValue(%rax), %zmm4 > >> vmovups _sPoly_7(%rax), %zmm8 > >> > >> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx) > >> jmp .LBL_2_7 > >> > >> END (_ZGVeN16v_logf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.7: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.7,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > >> index 7f906622a5..7f0272c809 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > >> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > >> vpsrlq $32, %zmm3, %zmm2 > >> vpmovqd %zmm2, %ymm11 > >> vcvtps2pd %ymm14, %zmm13 > >> - vmovups .L_2il0floatpacket.23(%rip), %zmm14 > >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > > > earlier > > kxnorw %k3, %k3, %k3 > > can be used to get a dependency break with this: > > vpmovm2d %k3, %zmm14 > > The SVML code could use some improvements. Can you > open a separate glibc bug? I'd like to address only the all-1s > load here to avoid more complexity. > > Thanks. > Alright. I'm okay with this patch. 
> > >> vmovaps %zmm14, %zmm26 > >> vpandd _ABSMASK(%rax), %zmm1, %zmm8 > >> vpcmpd $1, _INF(%rax), %zmm8, %k2 > >> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > >> vpmovqd %zmm11, %ymm5 > >> vpxord %zmm10, %zmm10, %zmm10 > >> vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} > >> - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 > >> + vpternlogd $0xff, %zmm4, %zmm4, %zmm4 > >> vpxord %zmm11, %zmm11, %zmm11 > >> vcvtdq2pd %ymm7, %zmm7 > >> vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} > >> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx) > >> vmovss %xmm0, 1216(%rsp,%r15,8) > >> jmp .LBL_2_7 > >> END (_ZGVeN16vv_powf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.23: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.23,@object > >> -.L_2il0floatpacket.24: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.24,@object > >> diff --git > a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > >> index 54cee3a537..e1d0154441 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > >> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx) > >> > >> /* Result sign calculations */ > >> vpternlogd $150, %zmm0, %zmm14, %zmm1 > >> - vmovups .L_2il0floatpacket.13(%rip), %zmm14 > >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > >> > >> /* Add correction term 0.5 for cos() part */ > >> vaddps %zmm8, %zmm5, %zmm15 > >> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl) > >> ENTRY (_ZGVeN16vvv_sincosf_skx) > >> WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx > >> END (_ZGVeN16vvv_sincosf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.13: > >> - .long > 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.13,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > >> index ec65ffdce5..bcb76ff756 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > >> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx) > >> movq __svml_s_trig_data@GOTPCREL(%rip), %rax > >> > >> /* Check for large and special values */ > >> - vmovups .L_2il0floatpacket.11(%rip), %zmm14 > >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > > > Also below: > > vpandnd %zmm2, %zmm2, %zmm14{%k1} > >> > >> vmovups __sAbsMask(%rax), %zmm5 > >> vmovups __sInvPI(%rax), %zmm1 > >> vmovups __sRShifter(%rax), %zmm2 > >> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx) > >> vmovss %xmm0, 1216(%rsp,%r15,8) > >> jmp .LBL_2_7 > >> END (_ZGVeN16v_sinf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.11: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.11,@object > >> -- > >> 2.31.1 > >> > > > -- > H.J. >