From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pf1-x42c.google.com (mail-pf1-x42c.google.com [IPv6:2607:f8b0:4864:20::42c]) by sourceware.org (Postfix) with ESMTPS id 2525A382D455 for ; Tue, 7 Jun 2022 04:05:46 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 2525A382D455 Received: by mail-pf1-x42c.google.com with SMTP id p8so14402014pfh.8 for ; Mon, 06 Jun 2022 21:05:46 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=g51rtSFrtc5/esRF2nnwl2fDrP9rbKMdQCpYoYv3ndg=; b=q7u5NS2CQs2RjTyX8beNB9P34cCA0qEJ7He4amrWv+r+1f6jCvYG/NR+IVT8wapwc8 kNHmgOfttZ/eYs9khuX83m6hb4pVVLGkExhR6QkQar7AVLOEcPME1pn1486RAzTMga+7 zgR66Std7/QDhDoV9Dkn98YBFEPsPXYYujQyZvV0R9xxQjII8y5b6nNCmgyWXEBgZdcs md6/B2xwMj5K3IThHTKYxihYS/6QE0RKuFIm0MjmfBnxfcICLrn9fILvkJ+O0vzk0ndT vZCwBmOoOS3UoeiVyrq7FKMIZkTErAh2h5ak2JQ6z4oEcYIvJRQUV+k5I5rqpWMX4Oah WzZQ== X-Gm-Message-State: AOAM530gNaJWZun5w1li81jyqGwso0bHVqkQiXr+t3KHah2EXl6Xk2aV GIuRcjOZoKFJn/hI3fXmxs0ZdZb0IZA= X-Google-Smtp-Source: ABdhPJxSWY4N5rVuP6Lw3KJPXoMCwym6FFB2QHdoBQNAhD6aXmPitLRtybopk5A4Uc7otxKb71RJnA== X-Received: by 2002:a63:65c7:0:b0:3fc:85b5:30c0 with SMTP id z190-20020a6365c7000000b003fc85b530c0mr24306239pgb.165.1654574744978; Mon, 06 Jun 2022 21:05:44 -0700 (PDT) Received: from noah-tgl.. 
([2600:1010:b04a:6ef:921e:3108:9361:2ef8]) by smtp.gmail.com with ESMTPSA id r2-20020a056a00216200b0051bd9981cacsm8150736pff.123.2022.06.06.21.05.43 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 06 Jun 2022 21:05:44 -0700 (PDT) From: Noah Goldstein To: libc-alpha@sourceware.org Subject: [PATCH v5 8/8] x86: Shrink code size of memchr-evex.S Date: Mon, 6 Jun 2022 21:05:29 -0700 Message-Id: <20220607040529.2344473-8-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220607040529.2344473-1-goldstein.w.n@gmail.com> References: <20220603044229.2180216-2-goldstein.w.n@gmail.com> <20220607040529.2344473-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Spam-Status: No, score=-12.0 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 07 Jun 2022 04:05:47 -0000 This is not meant as a performance optimization. The previous code was far too liberal in aligning targets and wasted code size unnecessarily. The total code size saving is: 64 bytes There are no non-negligible changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 1.000 Full xcheck passes on x86_64. 
--- sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index cfaf02907d..0fd11b7632 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -88,7 +88,7 @@ # define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 6) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP @@ -131,22 +131,24 @@ L(zero): xorl %eax, %eax ret - .p2align 5 + .p2align 4 L(first_vec_x0): - /* Check if first match was before length. */ - tzcntl %eax, %eax - xorl %ecx, %ecx - cmpl %eax, %edx - leaq (%rdi, %rax, CHAR_SIZE), %rax - cmovle %rcx, %rax + /* Check if first match was before length. NB: tzcnt has false data- + dependency on destination. eax already had a data-dependency on esi + so this should have no affect here. */ + tzcntl %eax, %esi +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xorl %eax, %eax + cmpl %esi, %edx + cmovg %rdi, %rax ret -# else - /* NB: first_vec_x0 is 17 bytes which will leave - cross_page_boundary (which is relatively cold) close enough - to ideal alignment. So only realign L(cross_page_boundary) if - rawmemchr. */ - .p2align 4 # endif + + .p2align 4 L(cross_page_boundary): /* Save pointer before aligning as its original value is necessary for computer return address if byte is found or @@ -400,10 +402,14 @@ L(last_2x_vec): L(zero_end): ret +L(set_zero_end): + xorl %eax, %eax + ret .p2align 4 L(first_vec_x1_check): - tzcntl %eax, %eax + /* eax must be non-zero. Use bsfl to save code size. */ + bsfl %eax, %eax /* Adjust length. */ subl $-(CHAR_PER_VEC * 4), %edx /* Check if match within remaining length. */ @@ -412,9 +418,6 @@ L(first_vec_x1_check): /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ret -L(set_zero_end): - xorl %eax, %eax - ret .p2align 4 L(loop_4x_vec_end): @@ -464,7 +467,7 @@ L(loop_4x_vec_end): # endif ret - .p2align 4 + .p2align 4,, 10 L(last_vec_x1_return): tzcntl %eax, %eax # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 @@ -496,6 +499,7 @@ L(last_vec_x3_return): # endif # ifndef USE_AS_RAWMEMCHR + .p2align 4,, 5 L(last_4x_vec_or_less_cmpeq): VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 kmovd %k0, %eax @@ -546,7 +550,7 @@ L(last_4x_vec): # endif andl %ecx, %eax jz L(zero_end2) - tzcntl %eax, %eax + bsfl %eax, %eax leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax L(zero_end2): ret @@ -562,6 +566,6 @@ L(last_vec_x3): leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ret # endif - + /* 7 bytes from next cache line. */ END (MEMCHR) #endif -- 2.34.1