From: "H.J. Lu"
Date: Sat, 17 Apr 2021 11:56:53 -0700
Subject: Re: [PATCH v1 1/2] x86: Optimize strlen-evex.S
To: Noah Goldstein
Cc: libc-alpha@sourceware.org, carlos@systemhalted.org
In-Reply-To: <20210417025215.874105-1-goldstein.w.n@gmail.com>

On Fri, Apr 16, 2021 at 7:53 PM Noah Goldstein wrote:
>
> No bug. This commit optimizes strlen-evex.S. The
> optimizations are mostly small things but they add up to roughly
> 10-30% performance improvement for strlen. The results for strnlen are
> a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
> test-wcsnlen are all passing.
>
> Signed-off-by: Noah Goldstein
> ---
> Tests were run on the following CPUs:
>
> Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
>
> Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
>
> Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
>
> All times are the geometric mean of N=20. The unit of time is
> seconds.
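
As a side note for readers of the archive, "geometric mean of N=20"
above means the mean is taken in log space over the 20 timing samples.
A minimal C sketch of that computation, for illustration only; this is
not the glibc benchtests harness:

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of n timing samples: exponential of the mean of
       the logs.  Assumes n > 0 and all samples are positive.  */
    static double
    geometric_mean (const double *samples, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (samples[i]);
      return exp (log_sum / (double) n);
    }
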
> > "Cur" refers to the current implementation > "New" refers to this patches implementation > > > The strlen numbers are universal improvements: > > Results For Skylake strlen-avx2 > size, algn, Cur T , New T , Win , Dif > 1 , 0 , 4.76 , 4.27 , New , 0.49 > 2 , 0 , 4.77 , 4.165 , New , 0.6 > 3 , 0 , 4.617 , 4.095 , New , 0.52 > 4 , 0 , 4.579 , 4.006 , New , 0.57 > 5 , 0 , 4.608 , 4.008 , New , 0.6 > 6 , 0 , 4.655 , 4.086 , New , 0.57 > 7 , 0 , 4.661 , 4.071 , New , 0.59 > 8 , 0 , 4.625 , 4.092 , New , 0.53 > 16 , 0 , 4.608 , 4.021 , New , 0.59 > 10 , 0 , 4.645 , 4.111 , New , 0.53 > 32 , 0 , 5.532 , 4.817 , New , 0.71 > 21 , 0 , 4.636 , 3.775 , New , 0.86 > 64 , 0 , 5.991 , 5.352 , New , 0.64 > 42 , 0 , 5.529 , 4.789 , New , 0.74 > 128 , 0 , 7.042 , 5.473 , New , 1.57 > 85 , 0 , 6.118 , 5.466 , New , 0.65 > 256 , 0 , 10.64 , 7.954 , New , 2.69 > 170 , 0 , 9.918 , 9.585 , New , 0.33 > 512 , 0 , 12.916, 10.242, New , 2.67 > 341 , 0 , 10.764, 10.216, New , 0.55 > 1024, 0 , 18.163, 14.844, New , 3.32 > 682 , 0 , 15.292, 13.382, New , 1.91 > 2048, 0 , 38.732, 24.396, New , 14.34 > 1365, 0 , 22.299, 20.08 , New , 2.22 > 4096, 0 , 79.054, 68.682, New , 10.37 > 2730, 0 , 61.47 , 40.705, New , 20.77 > > Results For Icelake strlen-avx2 > size, algn, Cur T , New T , Win , Dif > 1 , 0 , 2.681 , 1.99 , New , 0.69 > 2 , 0 , 2.823 , 2.232 , New , 0.59 > 3 , 0 , 2.57 , 2.077 , New , 0.49 > 4 , 0 , 2.659 , 2.128 , New , 0.53 > 5 , 0 , 2.666 , 2.109 , New , 0.56 > 6 , 0 , 2.596 , 2.053 , New , 0.54 > 7 , 0 , 2.623 , 2.152 , New , 0.47 > 8 , 0 , 2.675 , 2.178 , New , 0.5 > 16 , 0 , 2.675 , 2.202 , New , 0.47 > 10 , 0 , 2.672 , 2.2 , New , 0.47 > 32 , 0 , 3.383 , 2.868 , New , 0.52 > 21 , 0 , 2.693 , 2.032 , New , 0.66 > 64 , 0 , 3.404 , 3.056 , New , 0.35 > 42 , 0 , 3.511 , 2.967 , New , 0.54 > 128 , 0 , 4.191 , 3.627 , New , 0.56 > 85 , 0 , 3.559 , 2.922 , New , 0.64 > 256 , 0 , 6.782 , 5.493 , New , 1.29 > 170 , 0 , 6.24 , 4.988 , New , 1.25 > 512 , 0 , 9.305 , 7.308 , New , 2.0 > 341 , 0 , 7.626 , 6.272 , New , 1.35 > 1024, 0 , 14.455, 11.544, New , 2.91 > 682 , 0 , 10.728, 8.738 , New , 1.99 > 2048, 0 , 24.171, 24.101, New , 0.07 > 1365, 0 , 17.474, 14.387, New , 3.09 > 4096, 0 , 57.659, 51.675, New , 5.98 > 2730, 0 , 44.702, 28.04 , New , 16.66 > > Results For Tigerlake strlen-avx2 > size, algn, Cur T , New T , Win , Dif > 1 , 0 , 4.369 , 3.008 , New , 1.36 > 2 , 0 , 4.054 , 3.231 , New , 0.82 > 3 , 0 , 4.081 , 3.243 , New , 0.84 > 4 , 0 , 3.904 , 3.17 , New , 0.73 > 5 , 0 , 3.915 , 3.178 , New , 0.74 > 6 , 0 , 3.924 , 3.184 , New , 0.74 > 7 , 0 , 3.917 , 3.177 , New , 0.74 > 8 , 0 , 3.889 , 3.209 , New , 0.68 > 16 , 0 , 3.878 , 3.03 , New , 0.85 > 10 , 0 , 3.892 , 3.004 , New , 0.89 > 32 , 0 , 4.957 , 4.162 , New , 0.79 > 21 , 0 , 3.866 , 3.18 , New , 0.69 > 64 , 0 , 5.035 , 4.521 , New , 0.51 > 42 , 0 , 5.039 , 4.276 , New , 0.76 > 128 , 0 , 6.117 , 5.253 , New , 0.86 > 85 , 0 , 4.932 , 4.421 , New , 0.51 > 256 , 0 , 10.019, 8.221 , New , 1.8 > 170 , 0 , 8.954 , 7.404 , New , 1.55 > 512 , 0 , 14.071, 10.948, New , 3.12 > 341 , 0 , 11.177, 9.246 , New , 1.93 > 1024, 0 , 21.808, 17.034, New , 4.77 > 682 , 0 , 16.07 , 12.941, New , 3.13 > 2048, 0 , 37.332, 29.853, New , 7.48 > 1365, 0 , 26.394, 21.516, New , 4.88 > 4096, 0 , 87.951, 80.35 , New , 7.6 > 2730, 0 , 62.768, 44.247, New , 18.52 > > Results For Icelake strlen-evex > size, algn, Cur T , New T , Win , Dif > 1 , 0 , 2.681 , 1.99 , New , 0.69 > 2 , 0 , 2.823 , 2.232 , New , 0.59 > 3 , 0 , 2.57 , 2.077 , New , 0.49 > 4 , 0 , 2.659 , 2.128 , New , 
0.53 > 5 , 0 , 2.666 , 2.109 , New , 0.56 > 6 , 0 , 2.596 , 2.053 , New , 0.54 > 7 , 0 , 2.623 , 2.152 , New , 0.47 > 8 , 0 , 2.675 , 2.178 , New , 0.5 > 16 , 0 , 2.675 , 2.202 , New , 0.47 > 10 , 0 , 2.672 , 2.2 , New , 0.47 > 32 , 0 , 3.383 , 2.868 , New , 0.52 > 21 , 0 , 2.693 , 2.032 , New , 0.66 > 64 , 0 , 3.404 , 3.056 , New , 0.35 > 42 , 0 , 3.511 , 2.967 , New , 0.54 > 128 , 0 , 4.191 , 3.627 , New , 0.56 > 85 , 0 , 3.559 , 2.922 , New , 0.64 > 256 , 0 , 6.782 , 5.493 , New , 1.29 > 170 , 0 , 6.24 , 4.988 , New , 1.25 > 512 , 0 , 9.305 , 7.308 , New , 2.0 > 341 , 0 , 7.626 , 6.272 , New , 1.35 > 1024, 0 , 14.455, 11.544, New , 2.91 > 682 , 0 , 10.728, 8.738 , New , 1.99 > 2048, 0 , 24.171, 24.101, New , 0.07 > 1365, 0 , 17.474, 14.387, New , 3.09 > 4096, 0 , 57.659, 51.675, New , 5.98 > 2730, 0 , 44.702, 28.04 , New , 16.66 > > Results For Tigerlake strlen-evex > size, algn, Cur T , New T , Win , Dif > 1 , 0 , 4.369 , 3.008 , New , 1.36 > 2 , 0 , 4.054 , 3.231 , New , 0.82 > 3 , 0 , 4.081 , 3.243 , New , 0.84 > 4 , 0 , 3.904 , 3.17 , New , 0.73 > 5 , 0 , 3.915 , 3.178 , New , 0.74 > 6 , 0 , 3.924 , 3.184 , New , 0.74 > 7 , 0 , 3.917 , 3.177 , New , 0.74 > 8 , 0 , 3.889 , 3.209 , New , 0.68 > 16 , 0 , 3.878 , 3.03 , New , 0.85 > 10 , 0 , 3.892 , 3.004 , New , 0.89 > 32 , 0 , 4.957 , 4.162 , New , 0.79 > 21 , 0 , 3.866 , 3.18 , New , 0.69 > 64 , 0 , 5.035 , 4.521 , New , 0.51 > 42 , 0 , 5.039 , 4.276 , New , 0.76 > 128 , 0 , 6.117 , 5.253 , New , 0.86 > 85 , 0 , 4.932 , 4.421 , New , 0.51 > 256 , 0 , 10.019, 8.221 , New , 1.8 > 170 , 0 , 8.954 , 7.404 , New , 1.55 > 512 , 0 , 14.071, 10.948, New , 3.12 > 341 , 0 , 11.177, 9.246 , New , 1.93 > 1024, 0 , 21.808, 17.034, New , 4.77 > 682 , 0 , 16.07 , 12.941, New , 3.13 > 2048, 0 , 37.332, 29.853, New , 7.48 > 1365, 0 , 26.394, 21.516, New , 4.88 > 4096, 0 , 87.951, 80.35 , New , 7.6 > 2730, 0 , 62.768, 44.247, New , 18.52 > > The strnlen numbers are a bit more of a mixed bag but I think > generally positive. Its possible that the current version should be > kept. Let me know. 
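
One way to reduce the "mixed bag" judgement to a single figure per CPU
is the geometric mean of the Cur/Sub time ratios over all (size, algn)
rows in the tables below; values above 1.0 favor the submitted version.
A small sketch of that aggregation, illustrative only and not part of
the patch or of glibc's benchtests:

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of cur[i] / sub[i] over n benchmark rows.
       A result above 1.0 means the submitted version is faster on
       average; below 1.0 means the current version is.  */
    static double
    mean_speedup (const double *cur, const double *sub, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (cur[i] / sub[i]);
      return exp (log_sum / (double) n);
    }
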
>
> Results For Skylake strnlen-avx2
> size, algn, Cur T , Sub T , Win , Dif
> 1 , 0 , 4.06 , 4.1 , Cur , 0.04
> 2 , 0 , 4.15 , 4.08 , New , 0.07
> 3 , 0 , 4.1 , 4.03 , New , 0.07
> 4 , 0 , 3.95 , 3.91 , New , 0.04
> 5 , 0 , 4.07 , 3.9 , New , 0.17
> 6 , 0 , 4.04 , 3.92 , New , 0.12
> 7 , 0 , 4.03 , 3.89 , New , 0.14
> 1 , 1 , 3.75 , 3.79 , Cur , 0.04
> 2 , 2 , 4.0 , 3.91 , New , 0.09
> 3 , 3 , 4.04 , 3.92 , New , 0.12
> 4 , 4 , 3.95 , 3.86 , New , 0.09
> 5 , 5 , 3.97 , 3.91 , New , 0.06
> 6 , 6 , 3.96 , 3.92 , New , 0.04
> 7 , 7 , 3.97 , 3.91 , New , 0.06
> 4 , 1 , 3.76 , 3.83 , Cur , 0.07
> 8 , 0 , 3.73 , 4.01 , Cur , 0.28
> 8 , 1 , 3.73 , 3.88 , Cur , 0.15
> 16 , 0 , 3.68 , 3.84 , Cur , 0.16
> 16 , 1 , 3.75 , 3.92 , Cur , 0.17
> 32 , 0 , 5.93 , 5.95 , Cur , 0.02
> 32 , 1 , 5.95 , 5.98 , Cur , 0.03
> 64 , 0 , 6.46 , 5.31 , New , 1.15
> 64 , 1 , 6.66 , 5.43 , New , 1.23
> 128 , 0 , 7.42 , 6.02 , New , 1.4
> 128 , 1 , 7.57 , 5.81 , New , 1.76
> 256 , 0 , 12.02 , 9.89 , New , 2.13
> 256 , 1 , 11.91 , 9.84 , New , 2.07
> 512 , 0 , 15.06 , 11.77 , New , 3.29
> 512 , 1 , 14.79 , 11.75 , New , 3.04
> 1024, 0 , 23.61 , 16.98 , New , 6.63
> 1024, 1 , 23.63 , 16.91 , New , 6.72
>
> Results For Icelake strnlen-avx2
> size, algn, Cur T , Sub T , Win , Dif
> 1 , 0 , 2.81 , 2.51 , New , 0.3
> 2 , 0 , 2.8 , 2.53 , New , 0.27
> 3 , 0 , 2.7 , 2.57 , New , 0.13
> 4 , 0 , 2.68 , 2.55 , New , 0.13
> 5 , 0 , 2.7 , 2.57 , New , 0.13
> 6 , 0 , 2.73 , 2.6 , New , 0.13
> 7 , 0 , 2.69 , 2.61 , New , 0.08
> 1 , 1 , 2.53 , 2.5 , New , 0.03
> 2 , 2 , 2.67 , 2.6 , New , 0.07
> 3 , 3 , 2.67 , 2.59 , New , 0.08
> 4 , 4 , 2.66 , 2.57 , New , 0.09
> 5 , 5 , 2.65 , 2.56 , New , 0.09
> 6 , 6 , 2.67 , 2.59 , New , 0.08
> 7 , 7 , 2.65 , 2.62 , New , 0.03
> 4 , 1 , 2.65 , 2.41 , New , 0.24
> 8 , 0 , 2.68 , 2.56 , New , 0.12
> 8 , 1 , 2.62 , 2.55 , New , 0.07
> 16 , 0 , 2.66 , 2.56 , New , 0.1
> 16 , 1 , 2.63 , 2.55 , New , 0.08
> 32 , 0 , 3.62 , 3.19 , New , 0.43
> 32 , 1 , 3.74 , 3.45 , New , 0.29
> 64 , 0 , 3.9 , 3.7 , New , 0.2
> 64 , 1 , 4.13 , 3.68 , New , 0.45
> 128 , 0 , 4.34 , 4.17 , New , 0.17
> 128 , 1 , 4.59 , 4.07 , New , 0.52
> 256 , 0 , 6.74 , 6.56 , New , 0.18
> 256 , 1 , 7.34 , 7.13 , New , 0.21
> 512 , 0 , 9.64 , 8.67 , New , 0.97
> 512 , 1 , 9.49 , 8.56 , New , 0.93
> 1024, 0 , 13.57 , 12.35 , New , 1.22
> 1024, 1 , 13.57 , 12.59 , New , 0.98
>
> Results For Tigerlake strnlen-avx2
> size, algn, Cur T , Sub T , Win , Dif
> 1 , 0 , 4.21 , 3.91 , New , 0.3
> 2 , 0 , 4.1 , 3.79 , New , 0.31
> 3 , 0 , 4.02 , 3.81 , New , 0.21
> 4 , 0 , 4.06 , 3.82 , New , 0.24
> 5 , 0 , 4.1 , 3.81 , New , 0.29
> 6 , 0 , 4.08 , 3.82 , New , 0.26
> 7 , 0 , 4.07 , 3.87 , New , 0.2
> 1 , 1 , 3.95 , 3.8 , New , 0.15
> 2 , 2 , 4.11 , 3.88 , New , 0.23
> 3 , 3 , 4.08 , 3.88 , New , 0.2
> 4 , 4 , 4.05 , 3.94 , New , 0.11
> 5 , 5 , 4.02 , 3.89 , New , 0.13
> 6 , 6 , 4.02 , 3.89 , New , 0.13
> 7 , 7 , 4.08 , 3.84 , New , 0.24
> 4 , 1 , 4.07 , 3.7 , New , 0.37
> 8 , 0 , 4.08 , 3.95 , New , 0.13
> 8 , 1 , 4.01 , 4.02 , Cur , 0.01
> 16 , 0 , 4.03 , 4.03 , Eq , 0.0
> 16 , 1 , 4.05 , 4.0 , New , 0.05
> 32 , 0 , 5.86 , 5.23 , New , 0.63
> 32 , 1 , 5.88 , 5.36 , New , 0.52
> 64 , 0 , 6.38 , 5.73 , New , 0.65
> 64 , 1 , 6.49 , 5.56 , New , 0.93
> 128 , 0 , 7.17 , 6.39 , New , 0.78
> 128 , 1 , 7.1 , 6.41 , New , 0.69
> 256 , 0 , 11.65 , 11.0 , New , 0.65
> 256 , 1 , 11.37 , 10.97 , New , 0.4
> 512 , 0 , 14.86 , 13.43 , New , 1.43
> 512 , 1 , 14.63 , 13.35 , New , 1.28
> 1024, 0 , 20.92 , 19.33 , New , 1.59
> 1024, 1 , 20.85 , 19.38 , New , 1.47
>
>
Results For Icelake strnlen-evex > size, algn, Cur T , Sub T , Win , Dif > 1 , 0 , 2.9 , 2.66 , New , 0.24 > 2 , 0 , 2.99 , 2.72 , New , 0.27 > 3 , 0 , 2.93 , 2.64 , New , 0.29 > 4 , 0 , 2.83 , 2.55 , New , 0.28 > 5 , 0 , 2.92 , 2.64 , New , 0.28 > 6 , 0 , 2.95 , 2.64 , New , 0.31 > 7 , 0 , 2.91 , 2.65 , New , 0.26 > 1 , 1 , 2.63 , 2.49 , New , 0.14 > 2 , 2 , 2.89 , 2.6 , New , 0.29 > 3 , 3 , 2.89 , 2.59 , New , 0.3 > 4 , 4 , 2.9 , 2.58 , New , 0.32 > 5 , 5 , 2.87 , 2.57 , New , 0.3 > 6 , 6 , 2.9 , 2.57 , New , 0.33 > 7 , 7 , 2.88 , 2.64 , New , 0.24 > 4 , 1 , 2.65 , 2.39 , New , 0.26 > 8 , 0 , 2.85 , 2.57 , New , 0.28 > 8 , 1 , 2.62 , 2.4 , New , 0.22 > 16 , 0 , 2.83 , 2.56 , New , 0.27 > 16 , 1 , 2.63 , 2.39 , New , 0.24 > 32 , 0 , 3.95 , 3.06 , New , 0.89 > 32 , 1 , 3.95 , 3.15 , New , 0.8 > 64 , 0 , 3.98 , 3.6 , New , 0.38 > 64 , 1 , 3.88 , 3.48 , New , 0.4 > 128 , 0 , 4.45 , 4.19 , New , 0.26 > 128 , 1 , 4.57 , 4.21 , New , 0.36 > 256 , 0 , 6.75 , 6.97 , Cur , 0.22 > 256 , 1 , 7.55 , 7.76 , Cur , 0.21 > 512 , 0 , 9.75 , 10.09 , Cur , 0.34 > 512 , 1 , 9.84 , 10.13 , Cur , 0.29 > 1024, 0 , 14.45 , 14.4 , New , 0.05 > 1024, 1 , 14.39 , 14.26 , New , 0.13 > > Results For Tigerlake strnlen-evex > size, algn, Cur T , Sub T , Win , Dif > 1 , 0 , 3.86 , 3.59 , New , 0.27 > 2 , 0 , 3.78 , 3.41 , New , 0.37 > 3 , 0 , 3.69 , 3.4 , New , 0.29 > 4 , 0 , 3.62 , 3.33 , New , 0.29 > 5 , 0 , 3.76 , 3.37 , New , 0.39 > 6 , 0 , 3.73 , 3.39 , New , 0.34 > 7 , 0 , 3.7 , 3.4 , New , 0.3 > 1 , 1 , 3.58 , 3.35 , New , 0.23 > 2 , 2 , 3.75 , 3.34 , New , 0.41 > 3 , 3 , 3.72 , 3.39 , New , 0.33 > 4 , 4 , 3.69 , 3.38 , New , 0.31 > 5 , 5 , 3.69 , 3.37 , New , 0.32 > 6 , 6 , 3.68 , 3.37 , New , 0.31 > 7 , 7 , 3.74 , 3.35 , New , 0.39 > 4 , 1 , 3.39 , 3.27 , New , 0.12 > 8 , 0 , 3.4 , 3.29 , New , 0.11 > 8 , 1 , 3.34 , 3.32 , New , 0.02 > 16 , 0 , 3.36 , 3.34 , New , 0.02 > 16 , 1 , 3.39 , 3.3 , New , 0.09 > 32 , 0 , 5.13 , 5.13 , Eq , 0.0 > 32 , 1 , 5.18 , 5.16 , New , 0.02 > 64 , 0 , 5.87 , 5.44 , New , 0.43 > 64 , 1 , 5.97 , 5.44 , New , 0.53 > 128 , 0 , 7.14 , 6.48 , New , 0.66 > 128 , 1 , 7.08 , 6.63 , New , 0.45 > 256 , 0 , 11.68 , 12.57 , Cur , 0.89 > 256 , 1 , 11.67 , 12.23 , Cur , 0.56 > 512 , 0 , 15.64 , 15.74 , Cur , 0.1 > 512 , 1 , 15.52 , 15.69 , Cur , 0.17 > 1024, 0 , 23.02 , 22.57 , New , 0.45 > 1024, 1 , 23.0 , 22.8 , New , 0.2 > > sysdeps/x86_64/multiarch/strlen-evex.S | 569 +++++++++++++------------ > 1 file changed, 307 insertions(+), 262 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S > index 0583819078..d1aafac76f 100644 > --- a/sysdeps/x86_64/multiarch/strlen-evex.S > +++ b/sysdeps/x86_64/multiarch/strlen-evex.S > @@ -29,11 +29,13 @@ > # ifdef USE_AS_WCSLEN > # define VPCMP vpcmpd > # define VPMINU vpminud > -# define SHIFT_REG r9d > +# define SHIFT_REG ecx > +# define CHAR_SIZE 4 > # else > # define VPCMP vpcmpb > # define VPMINU vpminub > -# define SHIFT_REG ecx > +# define SHIFT_REG edx > +# define CHAR_SIZE 1 > # endif > > # define XMMZERO xmm16 > @@ -46,132 +48,169 @@ > # define YMM6 ymm22 > > # define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > .section .text.evex,"ax",@progbits > ENTRY (STRLEN) > # ifdef USE_AS_STRNLEN > - /* Check for zero length. */ > + /* Check zero length. */ > test %RSI_LP, %RSI_LP > jz L(zero) > -# ifdef USE_AS_WCSLEN > - shl $2, %RSI_LP > -# elif defined __ILP32__ > +# if !defined USE_AS_WCSLEN && defined __ILP32__ > /* Clear the upper 32 bits. 
*/ > movl %esi, %esi > # endif > mov %RSI_LP, %R8_LP > # endif > - movl %edi, %ecx > - movq %rdi, %rdx > + movl %edi, %eax > + sall $20, %eax > vpxorq %XMMZERO, %XMMZERO, %XMMZERO > - > /* Check if we may cross page boundary with one vector load. */ > - andl $(2 * VEC_SIZE - 1), %ecx > - cmpl $VEC_SIZE, %ecx > - ja L(cros_page_boundary) > + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax > + ja L(cross_page_boundary) > > - /* Check the first VEC_SIZE bytes. Each bit in K0 represents a > - null byte. */ > + /* Check the first VEC_SIZE bytes. */ Here are some quick comments: 1. The change is incorrect for x32: FAIL: wcsmbs/tst-size_t-wcsnlen $ cat wcsmbs/tst-size_t-wcsnlen.out Didn't expect signal from child: got `Segmentation fault' Your patch removed/changed special handling for x32. 2. Some of comment changes removed some detailed information, like - /* Check the first VEC_SIZE bytes. Each bit in K0 represents a - null byte. */ + /* Check the first VEC_SIZE bytes. */ You should put them back. 3. You need to add more comments for changes which aren't 100% obvious. > VPCMP $0, (%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > - testl %eax, %eax > - > # ifdef USE_AS_STRNLEN > - jnz L(first_vec_x0_check) > - /* Adjust length and check the end of data. */ > - subq $VEC_SIZE, %rsi > - jbe L(max) > -# else > - jnz L(first_vec_x0) > + /* If length < VEC_SIZE handle special. */ > + cmpq $CHAR_PER_VEC, %rsi > + jbe L(first_vec_x0) > # endif > - > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > - > + testl %eax, %eax > + jz L(aligned_more) > + tzcntl %eax, %eax > + ret > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > - addq %rcx, %rsi > +L(zero): > + xorl %eax, %eax > + ret > > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + .p2align 4 > +L(first_vec_x0): > + /* Select min of length and position of first null. */ > + btsq %rsi, %rax > + tzcntl %eax, %eax > + ret > # endif > - jmp L(more_4x_vec) > > .p2align 4 > -L(cros_page_boundary): > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > - > -# ifdef USE_AS_WCSLEN > - /* NB: Divide shift count by 4 since each bit in K0 represent 4 > - bytes. */ > - movl %ecx, %SHIFT_REG > - sarl $2, %SHIFT_REG > +L(first_vec_x1): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > +# ifdef USE_AS_STRNLEN > + /* Use ecx which was computed earlier to compute correct value. > + */ > +# ifdef USE_AS_WCSLEN > + sarl $2, %ecx > +# endif > + leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax > +# else > + subl %edx, %edi > +# ifdef USE_AS_WCSLEN > + sarl $2, %edi > +# endif > + leal CHAR_PER_VEC(%rdi, %rax), %eax > # endif > - VPCMP $0, (%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > + ret > > - /* Remove the leading bytes. */ > - sarxl %SHIFT_REG, %eax, %eax > - testl %eax, %eax > - jz L(aligned_more) > + .p2align 4 > +L(first_vec_x2): > tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > -# endif > - addq %rdi, %rax > - addq %rcx, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > + /* Use ecx which was computed earlier to compute correct value. 
> + */ > +# ifdef USE_AS_WCSLEN > + sarl $2, %ecx > +# endif > + leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax > +# else > + subl %edx, %edi > +# ifdef USE_AS_WCSLEN > + sarl $2, %edi > +# endif > + leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax > # endif > ret > > .p2align 4 > -L(aligned_more): > +L(first_vec_x3): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > - to void possible addition overflow. */ > - negq %rcx > - addq $VEC_SIZE, %rcx > - > - /* Check the end of data. */ > - subq %rcx, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. > + */ > +# ifdef USE_AS_WCSLEN > + sarl $2, %ecx > +# endif > + leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax > +# else > + subl %edx, %edi > +# ifdef USE_AS_WCSLEN > + sarl $2, %edi > +# endif > + leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax > # endif > + ret > > - addq $VEC_SIZE, %rdi > - > + .p2align 4 > +L(first_vec_x4): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + /* Use ecx which was computed earlier to compute correct value. > + */ > +# ifdef USE_AS_WCSLEN > + sarl $2, %ecx > +# endif > + leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax > +# else > + subl %edx, %edi > +# ifdef USE_AS_WCSLEN > + sarl $2, %edi > +# endif > + leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax > # endif > + ret > > -L(more_4x_vec): > + /* strnlen jumps here. strlen falls through. */ > + .p2align 5 > +L(aligned_more): > + movq %rdi, %rdx > + /* Align data to VEC_SIZE. */ > + andq $-(VEC_SIZE), %rdi > +L(cross_page_continue): > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > since data is only aligned to VEC_SIZE. */ > - VPCMP $0, (%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - > +# ifdef USE_AS_STRNLEN > +# ifdef USE_AS_WCSLEN > + salq $2, %rsi > +# endif > + /* + CHAR_SIZE because it simplies the logic in > + last_4x_vec_or_less. */ > + leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx > + subq %rdx, %rcx > +# endif > + /* Load first VEC regardless. */ > VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 > +# ifdef USE_AS_STRNLEN > + /* Adjust length. If near end handle specially. */ > + subq %rcx, %rsi > + jb L(last_4x_vec_or_less) > +# endif > kmovd %k0, %eax > testl %eax, %eax > jnz L(first_vec_x1) > > VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > - testl %eax, %eax > + test %eax, %eax > jnz L(first_vec_x2) > > VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 > @@ -179,258 +218,264 @@ L(more_4x_vec): > testl %eax, %eax > jnz L(first_vec_x3) > > - addq $(VEC_SIZE * 4), %rdi > - > -# ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > -# endif > - > - /* Align data to 4 * VEC_SIZE. */ > - movq %rdi, %rcx > - andl $(4 * VEC_SIZE - 1), %ecx > - andq $-(4 * VEC_SIZE), %rdi > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 > + kmovd %k0, %eax > + testl %eax, %eax > + jnz L(first_vec_x4) > > + addq $VEC_SIZE, %rdi > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > + /* Check if at last VEC_SIZE * 4 length. */ > + cmpq $(VEC_SIZE * 4 - 1), %rsi > + jbe L(last_4x_vec_or_less_load) > + movl %edi, %ecx > + andl $(VEC_SIZE * 4 - 1), %ecx > + /* Readjust length. 
*/ > addq %rcx, %rsi > # endif > + /* Align data to VEC_SIZE * 4. */ > + andq $-(VEC_SIZE * 4), %rdi > > + /* Compare 4 * VEC at a time forward. */ > .p2align 4 > L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - VMOVA (%rdi), %YMM1 > - VMOVA VEC_SIZE(%rdi), %YMM2 > - VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 > - VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 > - > - VPMINU %YMM1, %YMM2, %YMM5 > - VPMINU %YMM3, %YMM4, %YMM6 > + /* Load first VEC regardless. */ > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > +# ifdef USE_AS_STRNLEN > + /* Break if at end of length. */ > + subq $(VEC_SIZE * 4), %rsi > + jb L(last_4x_vec_or_less_cmpeq) > +# endif > + VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 > + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 > + VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 > + VPCMP $0, %YMM2, %YMMZERO, %k0 > + VPCMP $0, %YMM4, %YMMZERO, %k1 > + subq $-(VEC_SIZE * 4), %rdi > + kortestd %k0, %k1 > + jz L(loop_4x_vec) > + > + /* Check if end was in first half. */ > + kmovd %k0, %eax > + subq %rdx, %rdi > +# ifdef USE_AS_WCSLEN > + shrq $2, %rdi > +# endif > + testl %eax, %eax > + jz L(second_vec_return) > > - VPMINU %YMM5, %YMM6, %YMM5 > - VPCMP $0, %YMM5, %YMMZERO, %k0 > - ktestd %k0, %k0 > - jnz L(4x_vec_end) > + VPCMP $0, %YMM1, %YMMZERO, %k2 > + kmovd %k2, %edx > + /* Combine YMM1 matches (k2) with YMM2 matches (k0). */ > +# ifdef USE_AS_WCSLEN > + sall $CHAR_PER_VEC, %eax > + orl %edx, %eax > + tzcntl %eax, %eax > +# else > + salq $CHAR_PER_VEC, %rax > + orq %rdx, %rax > + tzcntq %rax, %rax > +# endif > + addq %rdi, %rax > + ret > > - addq $(VEC_SIZE * 4), %rdi > > -# ifndef USE_AS_STRNLEN > - jmp L(loop_4x_vec) > -# else > - subq $(VEC_SIZE * 4), %rsi > - ja L(loop_4x_vec) > +# ifdef USE_AS_STRNLEN > > +L(last_4x_vec_or_less_load): > + /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > +L(last_4x_vec_or_less_cmpeq): > + VPCMP $0, %YMM1, %YMMZERO, %k0 > + addq $(VEC_SIZE * 3), %rdi > L(last_4x_vec_or_less): > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > - addl $(VEC_SIZE * 2), %esi > - jle L(last_2x_vec) > - > - VPCMP $0, (%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > + /* If remaining length > VEC_SIZE * 2. */ > + testl $(VEC_SIZE * 2), %esi > + jnz L(last_4x_vec) > > - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > + /* length may have been negative or positive depending on where > + this was called from. This fixes that. */ > + andl $(VEC_SIZE * 4 - 1), %esi > testl %eax, %eax > - jnz L(first_vec_x1) > + jnz L(last_vec_x1_check) > > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x2_check) > subl $VEC_SIZE, %esi > - jle L(max) > + jb L(max) > > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x3_check) > - movq %r8, %rax > + tzcntl %eax, %eax > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + sarl $2, %esi > # endif > - ret > - > - .p2align 4 > -L(last_2x_vec): > - addl $(VEC_SIZE * 2), %esi > - > - VPCMP $0, (%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x0_check) > - subl $VEC_SIZE, %esi > - jle L(max) > + /* Check the end of data. 
*/ > + cmpl %eax, %esi > + jb L(max) > > - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x1_check) > - movq %r8, %rax > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax > + ret > +L(max): > + movq %r8, %rax > ret > +# endif > > .p2align 4 > -L(first_vec_x0_check): > +L(second_vec_return): > + VPCMP $0, %YMM3, %YMMZERO, %k0 > + /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ > +# ifdef USE_AS_WCSLEN > + kunpckbw %k0, %k1, %k0 > + kmovd %k0, %eax > + tzcntl %eax, %eax > +# else > + kunpckdq %k0, %k1, %k0 > + kmovq %k0, %rax > + tzcntq %rax, %rax > +# endif > + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax > + ret > + > + > +# ifdef USE_AS_STRNLEN > +L(last_vec_x1_check): > tzcntl %eax, %eax > # ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > + sarl $2, %esi > # endif > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq %rdi, %rax > - subq %rdx, %rax > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax > ret > > .p2align 4 > -L(first_vec_x1_check): > +L(last_4x_vec): > + /* Test first 2x VEC normally. */ > + testl %eax, %eax > + jnz L(last_vec_x1) > + > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > + kmovd %k0, %eax > + testl %eax, %eax > + jnz L(last_vec_x2) > + > + /* Normalize length. */ > + andl $(VEC_SIZE * 4 - 1), %esi > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 > + kmovd %k0, %eax > + testl %eax, %eax > + jnz L(last_vec_x3) > + > + subl $(VEC_SIZE * 3), %esi > + jb L(max) > + > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 > + kmovd %k0, %eax > tzcntl %eax, %eax > # ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > + sarl $2, %esi > # endif > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $VEC_SIZE, %rax > - addq %rdi, %rax > - subq %rdx, %rax > + cmpl %eax, %esi > + jb L(max_end) > + > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax > ret > > .p2align 4 > -L(first_vec_x2_check): > +L(last_vec_x1): > tzcntl %eax, %eax > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > + sarq $2, %rdi > # endif > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 2), %rax > - addq %rdi, %rax > - subq %rdx, %rax > + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(last_vec_x2): > + tzcntl %eax, %eax > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax > ret > > .p2align 4 > -L(first_vec_x3_check): > +L(last_vec_x3): > tzcntl %eax, %eax > # ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > + sarl $2, %esi > # endif > + subl $(CHAR_PER_VEC * 2), %esi > /* Check the end of data. 
*/ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 3), %rax > - addq %rdi, %rax > - subq %rdx, %rax > + cmpl %eax, %esi > + jb L(max_end) > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax > ret > - > - .p2align 4 > -L(max): > +L(max_end): > movq %r8, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > - ret > - > - .p2align 4 > -L(zero): > - xorl %eax, %eax > ret > # endif > > + /* Cold case for crossing page with first load. */ > .p2align 4 > -L(first_vec_x0): > - tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > - addq %rdi, %rax > - subq %rdx, %rax > +L(cross_page_boundary): > + movq %rdi, %rdx > + /* Align data to VEC_SIZE. */ > + andq $-VEC_SIZE, %rdi > + VPCMP $0, (%rdi), %YMMZERO, %k0 > + kmovd %k0, %eax > + /* Remove the leading bytes. */ > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + movl %edx, %ecx > + shrl $2, %ecx > + andl $(CHAR_PER_VEC - 1), %ecx > # endif > - ret > - > - .p2align 4 > -L(first_vec_x1): > + /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ > + sarxl %SHIFT_REG, %eax, %eax > + testl %eax, %eax > +# ifndef USE_AS_STRNLEN > + jz L(cross_page_continue) > tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > - addq $VEC_SIZE, %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > ret > - > - .p2align 4 > -L(first_vec_x2): > - tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > - addq $(VEC_SIZE * 2), %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > +# else > + jnz L(cross_page_less_vec) > +# ifndef USE_AS_WCSLEN > + movl %edx, %ecx > + andl $(CHAR_PER_VEC - 1), %ecx > +# endif > + movl $CHAR_PER_VEC, %eax > + subl %ecx, %eax > + cmpq %rax, %rsi > + ja L(cross_page_continue) > + movl %esi, %eax > ret > - > - .p2align 4 > -L(4x_vec_end): > - VPCMP $0, %YMM1, %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - VPCMP $0, %YMM2, %YMMZERO, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > - VPCMP $0, %YMM3, %YMMZERO, %k2 > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(first_vec_x2) > - VPCMP $0, %YMM4, %YMMZERO, %k3 > - kmovd %k3, %eax > -L(first_vec_x3): > +L(cross_page_less_vec): > tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > - addq $(VEC_SIZE * 3), %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > + /* Select min of length and position of first null. */ > + cmpq %rax, %rsi > + cmovb %esi, %eax > ret > +# endif > > END (STRLEN) > #endif > -- > 2.29.2 > -- H.J.
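
For archive readers following up on point 3 above (comments on changes
that are not 100% obvious): one of the less obvious new idioms is in
L(first_vec_x0), where "btsq %rsi, %rax" followed by tzcnt selects the
minimum of the length and the position of the first null.  A rough C
equivalent of that idiom, offered as a sketch under the assumption that
the length is at most 63; it is not code from the patch:

    #include <stdint.h>

    /* Set bit 'len' in the null-match mask, then count trailing
       zeros: the result is min (index of first null, len).  This
       mirrors the "btsq %rsi, %rax; tzcnt" sequence.  Requires
       len <= 63 and uses the GCC/Clang trailing-zero builtin.  */
    static unsigned int
    min_null_or_len (uint64_t null_mask, unsigned int len)
    {
      uint64_t mask = null_mask | ((uint64_t) 1 << len);
      return (unsigned int) __builtin_ctzll (mask);
    }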