From mboxrd@z Thu Jan 1 00:00:00 1970
From: Sunil K Pandey <skpgkp2@gmail.com>
To: libc-alpha@sourceware.org
Subject: [PATCH] x86-64: Improve evex512 version of strlen functions
Date: Fri, 28 Oct 2022 08:48:10 -0700
Message-Id: <20221028154810.1801123-1-skpgkp2@gmail.com>
X-Mailer: git-send-email 2.36.1
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

This patch improves the following functionality:
- Replace VPCMP with VPCMPEQ.
- Replace page cross check logic with sall.
- Remove extra lea from align_more.
- Remove unconditional loop jump.
- Use bsf to check max length in the first vector.
---
Illustrative C sketches of the page cross check, the strnlen
first-vector path, and the align_more counter setup follow the
patch; they are not part of the change.

 sysdeps/x86_64/multiarch/strlen-evex-base.S | 91 +++++++++++++--------
 1 file changed, 57 insertions(+), 34 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index c832b15a48..fd6c770e6e 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -25,12 +25,12 @@
 # include <sysdep.h>
 
 # ifdef USE_AS_WCSLEN
-#  define VPCMP		vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPTESTN	vptestnmd
 #  define VPMINU	vpminud
 #  define CHAR_SIZE	4
 # else
-#  define VPCMP		vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPTESTN	vptestnmb
 #  define VPMINU	vpminub
 #  define CHAR_SIZE	1
@@ -55,20 +55,29 @@ ENTRY_P2ALIGN (STRLEN, 6)
 
 	movl	%edi, %eax
 	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM(0), %k0
+	VPCMPEQ	(%rdi), %VMM(0), %k0
+# ifdef USE_AS_STRNLEN
+	KMOV	%k0, %VRCX
+	/* Store max length in rax.  */
+	mov	%rsi, %rax
+	/* If rcx is 0, rax will have max length.  We can not use VRCX
+	   and VRAX here for evex256 because, upper 32 bits may be
+	   undefined for ecx and eax.  */
+	bsfq	%rcx, %rax
+	cmp	$CHAR_PER_VEC, %rax
+	ja	L(align_more)
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+# else
 	KMOV	%k0, %VRAX
 	test	%VRAX, %VRAX
 	jz	L(align_more)
-
 	bsf	%VRAX, %VRAX
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
 # endif
 	ret
 
@@ -81,25 +90,24 @@ L(ret_max):
 # endif
 
 L(align_more):
-	leaq	VEC_SIZE(%rdi), %rax
+	mov	%rdi, %rax
 	/* Align rax to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rax
 # ifdef USE_AS_STRNLEN
-	movq	%rax, %rdx
-	subq	%rdi, %rdx
+	movq	%rdi, %rdx
+	subq	%rax, %rdx
 #  ifdef USE_AS_WCSLEN
 	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
-	subq	%rsi, %rdx
-	jae	L(ret_max)
-	negq	%rdx
+	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
 	/* At this point rdx contains number of w[char] needs to go.
 	   Now onwards rdx will keep decrementing with each compare.  */
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM(0), %k0
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
+	subq	$-VEC_SIZE, %rax
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
@@ -109,7 +117,7 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM(0), %k0
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
@@ -119,7 +127,7 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
@@ -129,7 +137,7 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
@@ -155,16 +163,10 @@ L(align_more):
 	addq	%rcx, %rdx
 	/* Need jump as we don't want to add/subtract rdx for first
 	   iteration of 4 x VEC_SIZE aligned loop.  */
-	jmp	L(loop_entry)
 # endif
 
 	.p2align 4,,11
 L(loop):
-# ifdef USE_AS_STRNLEN
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	jbe	L(ret_max)
-L(loop_entry):
-# endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
 	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
@@ -177,7 +179,18 @@ L(loop_entry):
 	subq	$-(VEC_SIZE * 4), %rax
 
 	KORTEST	%k0, %k1
-	jz	L(loop)
+
+# ifndef USE_AS_STRNLEN
+	jz	L(loop)
+# else
+	jnz	L(loopend)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop)
+	mov	%rsi, %rax
+	ret
+# endif
+
+L(loopend):
 
 	VPTESTN	%VMM(1), %VMM(1), %k2
 	KMOV	%k2, %VRCX
@@ -249,24 +262,34 @@ L(ret_vec_x1):
 	ret
 
 L(page_cross):
-	movl	%eax, %ecx
-# ifdef USE_AS_WCSLEN
+	mov	%rdi, %rax
+	movl	%edi, %ecx
 	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
 	sarl	$2, %ecx
 # endif
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
-	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRAX
+	andq	$-VEC_SIZE, %rax
+	VPCMPEQ	(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRDX
 	/* Ignore number of character for alignment adjustment.  */
-	shr	%cl, %VRAX
+	shr	%cl, %VRDX
+# ifdef USE_AS_STRNLEN
+	jnz	L(page_cross_end)
+	movl	$CHAR_PER_VEC, %eax
+	sub	%ecx, %eax
+	cmp	%rax, %rsi
+	ja	L(align_more)
+# else
 	jz	L(align_more)
+# endif
 
-	bsf	%VRAX, %VRAX
+L(page_cross_end):
+	bsf	%VRDX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
+	cmovnb	%esi, %eax
 # endif
 	ret
 
-- 
2.36.1
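
For reference only, not part of the patch: a minimal C sketch of why
the sall based page cross check matches the old and/cmp pair, assuming
4 KiB pages and the 64-byte evex512 vector size.  The function names
are invented for illustration.  Both forms only look at the 12
page-offset bits of the address, so walking two pages worth of offsets
covers every case.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096	/* assumed 4 KiB pages */
#define VEC_SIZE  64	/* evex512 vector width in bytes */

/* Old check: andl $(PAGE_SIZE - 1), %eax;
   cmpl $(PAGE_SIZE - VEC_SIZE), %eax; ja L(page_cross).  */
static int
page_cross_old (uint32_t addr)
{
  return (addr & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* New check: sall $20, %eax;
   cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax; ja L(page_cross).
   Shifting left by 20 moves the 12 page-offset bits to the top of the
   32-bit register and discards everything above them, so the unsigned
   compare sees the same ordering as the masked compare above.  */
static int
page_cross_new (uint32_t addr)
{
  return (uint32_t) (addr << 20) > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
}

int
main (void)
{
  for (uint32_t a = 0; a < 2 * PAGE_SIZE; a++)
    assert (page_cross_old (a) == page_cross_new (a));
  puts ("page cross checks agree");
  return 0;
}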
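
Likewise, a rough C model of the new strnlen first-vector path for the
byte variant (CHAR_PER_VEC == 64 assumed, helper name invented).  The
asm relies on bsf leaving its destination unchanged when the source is
zero, which the model writes out as an explicit branch;
__builtin_ctzll stands in for bsf/tzcnt.

#include <assert.h>
#include <stdint.h>

#define CHAR_PER_VEC 64	/* byte variant of the evex512 build */

/* Returns the strnlen result when it can be decided from the first
   vector, or -1 when the code would fall through to L(align_more).
   'mask' has bit i set when byte i of the first vector is NUL.  */
static int64_t
first_vec_strnlen (uint64_t mask, uint64_t maxlen)
{
  uint64_t len = maxlen;			/* mov  %rsi, %rax	*/
  if (mask != 0)				/* bsfq %rcx, %rax	*/
    len = (uint64_t) __builtin_ctzll (mask);	/*  (dest kept if 0)	*/
  if (len > CHAR_PER_VEC)			/* cmp  $CHAR_PER_VEC	*/
    return -1;					/* ja   L(align_more)	*/
  if (maxlen < len)				/* cmpq %rax, %rsi	*/
    len = maxlen;				/* cmovb %esi, %eax	*/
  return (int64_t) len;
}

int
main (void)
{
  assert (first_vec_strnlen (1ull << 5, 100) == 5);	/* NUL at 5	*/
  assert (first_vec_strnlen (1ull << 5, 3) == 3);	/* clamped	*/
  assert (first_vec_strnlen (0, 10) == 10);		/* short maxlen	*/
  assert (first_vec_strnlen (0, 100) == -1);		/* keep scanning */
  return 0;
}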
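
And a small sketch of why the align_more prologue without the extra
lea still leaves rdx with the same "characters left to scan" value for
the byte variant (CHAR_PER_VEC == VEC_SIZE).  The old early
jae L(ret_max) exit corresponds to a non-positive count, which the new
code instead catches in the per-vector length checks.

#include <assert.h>
#include <stdint.h>

#define VEC_SIZE     64
#define CHAR_PER_VEC 64	/* byte variant: one char per byte */

/* Old prologue: leaq VEC_SIZE(%rdi), %rax; andq $-VEC_SIZE, %rax;
   rdx = rax - rdi (chars already compared); rdx = rsi - rdx.  */
static int64_t
remaining_old (uint64_t rdi, uint64_t rsi)
{
  uint64_t rax = (rdi + VEC_SIZE) & -(uint64_t) VEC_SIZE;
  uint64_t done = rax - rdi;
  return (int64_t) (rsi - done);
}

/* New prologue: mov %rdi, %rax; andq $-VEC_SIZE, %rax;
   rdx = rdi - rax; rdx = rsi + rdx - CHAR_PER_VEC.  The first compare
   reads from VEC_SIZE(%rax), the same address the old rounded-up rax
   pointed to, so the scan itself is unchanged.  */
static int64_t
remaining_new (uint64_t rdi, uint64_t rsi)
{
  uint64_t rax = rdi & -(uint64_t) VEC_SIZE;
  uint64_t misalign = rdi - rax;
  return (int64_t) (rsi + misalign - CHAR_PER_VEC);
}

int
main (void)
{
  for (uint64_t rdi = 0x1000; rdi < 0x1000 + 2 * VEC_SIZE; rdi++)
    for (uint64_t rsi = 1; rsi < 4 * VEC_SIZE; rsi++)
      assert (remaining_old (rdi, rsi) == remaining_new (rdi, rsi));
  return 0;
}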