From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7852) id E4E0B3857C72; Tue, 8 Mar 2022 05:47:38 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org E4E0B3857C72 Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Sunil Pandey To: glibc-cvs@sourceware.org Subject: [glibc] x86_64: Fix svml_d_cosh2_core_sse4.S code formatting X-Act-Checkin: glibc X-Git-Author: Sunil K Pandey X-Git-Refname: refs/heads/master X-Git-Oldrev: c0de82af9294aa4a65a505716230a1b59b1f28e9 X-Git-Newrev: 23ff8b384fa1fba36c80ca3bea54146dca139643 Message-Id: <20220308054738.E4E0B3857C72@sourceware.org> Date: Tue, 8 Mar 2022 05:47:38 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 08 Mar 2022 05:47:39 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=23ff8b384fa1fba36c80ca3bea54146dca139643 commit 23ff8b384fa1fba36c80ca3bea54146dca139643 Author: Sunil K Pandey Date: Mon Mar 7 10:47:11 2022 -0800 x86_64: Fix svml_d_cosh2_core_sse4.S code formatting This commit contains following formatting changes 1. Instructions preceded by a tab. 2. Instruction less than 8 characters in length have a tab between it and the first operand. 3. Instruction greater than 7 characters in length have a space between it and the first operand. 4. Tabs after `#define`d names and their value. 5. 8 space at the beginning of line replaced by tab. 6. Indent comments with code. 7. Remove redundant .text section. 8. 1 space between line content and line comment. 9. Space after all commas. 
Reviewed-by: Noah Goldstein Diff: --- .../x86_64/fpu/multiarch/svml_d_cosh2_core_sse4.S | 664 ++++++++++----------- 1 file changed, 331 insertions(+), 333 deletions(-) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh2_core_sse4.S index 8cfc426a59..130c1a5cca 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh2_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh2_core_sse4.S @@ -34,363 +34,361 @@ /* Offsets for data table __svml_dcosh_data_internal */ -#define _dbT 0 -#define _dbInvLn2 2064 -#define _dbLn2hi 2080 -#define _dbLn2lo 2096 -#define _dbShifter 2112 -#define _iIndexMask 2128 -#define _dPC2 2144 -#define _dPC3 2160 -#define _dPC4 2176 -#define _iMaxIndex 2192 -#define _lExpMask 2208 -#define _dSign 2224 -#define _iDomainRange 2240 +#define _dbT 0 +#define _dbInvLn2 2064 +#define _dbLn2hi 2080 +#define _dbLn2lo 2096 +#define _dbShifter 2112 +#define _iIndexMask 2128 +#define _dPC2 2144 +#define _dPC3 2160 +#define _dPC4 2176 +#define _iMaxIndex 2192 +#define _lExpMask 2208 +#define _dSign 2224 +#define _iDomainRange 2240 #include - .text - .section .text.sse4,"ax",@progbits + .section .text.sse4, "ax", @progbits ENTRY(_ZGVbN2v_cosh_sse4) - subq $72, %rsp - cfi_def_cfa_offset(80) - movaps %xmm0, %xmm4 - movups _dSign+__svml_dcosh_data_internal(%rip), %xmm2 - lea _dbT+__svml_dcosh_data_internal(%rip), %r8 - -/* Abs argument */ - movaps %xmm2, %xmm5 - -/* dXSign=0x001000000000 */ - psrlq $11, %xmm2 - -/* - * Load argument - * dM = x*2^K/log(2) + RShifter - */ - movups _dbInvLn2+__svml_dcosh_data_internal(%rip), %xmm3 - andnps %xmm4, %xmm5 - mulpd %xmm5, %xmm3 - movups _dbShifter+__svml_dcosh_data_internal(%rip), %xmm1 - addpd %xmm1, %xmm3 - -/* - * R - * dN = dM - RShifter - */ - movaps %xmm3, %xmm15 - subpd %xmm1, %xmm15 - -/* dR = dX - dN*Log2_hi/2^K */ - movups _dbLn2hi+__svml_dcosh_data_internal(%rip), %xmm14 - mulpd %xmm15, %xmm14 - -/* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */ - 
movups _dbLn2lo+__svml_dcosh_data_internal(%rip), %xmm1 - mulpd %xmm15, %xmm1 - -/* - * Check for overflow\underflow - * - */ - pshufd $221, %xmm5, %xmm7 - subpd %xmm14, %xmm5 - movq _iIndexMask+__svml_dcosh_data_internal(%rip), %xmm8 - -/* Index and lookup */ - pshufd $136, %xmm3, %xmm9 - -/* - * G1,G2,G3: dTdif,dTn * 2^N,2^(-N) - * NB: copied from sinh_la - to be optimized!!!!! - */ - psllq $44, %xmm3 - -/* - * trick - * 256=-iIndex - */ - movq _iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm12 - pand %xmm8, %xmm9 - subpd %xmm1, %xmm5 - psubd %xmm9, %xmm12 - -/* iIndex*=3 */ - movdqa %xmm9, %xmm10 - -/* iDomainRange*=3 */ - pslld $3, %xmm12 - pslld $3, %xmm10 - movd %xmm12, %esi - pshufd $1, %xmm12, %xmm13 - movq _iDomainRange+__svml_dcosh_data_internal(%rip), %xmm6 - movd %xmm13, %edi - pcmpgtd %xmm6, %xmm7 - movmskps %xmm7, %eax - -/* dR2 = dR^2 */ - movaps %xmm5, %xmm7 - -/* lM now is an EXP(2^N) */ - pand _lExpMask+__svml_dcosh_data_internal(%rip), %xmm3 - pshufd $1, %xmm10, %xmm11 - movslq %esi, %rsi - mulpd %xmm5, %xmm7 - movd %xmm10, %edx - movsd (%r8,%rsi), %xmm6 - movd %xmm11, %ecx - movslq %edi, %rdi - movslq %edx, %rdx - movslq %ecx, %rcx - movhpd (%r8,%rdi), %xmm6 - -/* */ - psubq %xmm3, %xmm6 - -/* lX- = EXP(1/2) */ - psubq %xmm2, %xmm6 - -/* - * sinh(r) = r +r*r^2*a3 .... 
- * dSinh_r = r^2*a3 - */ - movups _dPC3+__svml_dcosh_data_internal(%rip), %xmm2 - mulpd %xmm7, %xmm2 - -/* dSinh_r = r + r*r^2*a3 */ - mulpd %xmm5, %xmm2 - movsd (%r8,%rdx), %xmm0 - movhpd (%r8,%rcx), %xmm0 - paddq %xmm3, %xmm0 - addpd %xmm2, %xmm5 - -/* dTn = dTn*2^N - dTn*2^-N */ - movaps %xmm0, %xmm3 - subpd %xmm6, %xmm3 - -/* dTp = dTn*2^N + dTn*2^-N */ - addpd %xmm6, %xmm0 - mulpd %xmm5, %xmm3 - -/* poly(r) = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ - movups _dPC4+__svml_dcosh_data_internal(%rip), %xmm5 - mulpd %xmm7, %xmm5 - addpd _dPC2+__svml_dcosh_data_internal(%rip), %xmm5 - mulpd %xmm5, %xmm7 - -/* dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ - mulpd %xmm0, %xmm7 - addpd %xmm7, %xmm3 - -/* _VRES1 = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ - addpd %xmm3, %xmm0 - andl $3, %eax - -/* Ret H */ - -/* Go to special inputs processing branch */ - jne L(SPECIAL_VALUES_BRANCH) - # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm4 - -/* Restore registers - * and exit the function - */ + subq $72, %rsp + cfi_def_cfa_offset(80) + movaps %xmm0, %xmm4 + movups _dSign+__svml_dcosh_data_internal(%rip), %xmm2 + lea _dbT+__svml_dcosh_data_internal(%rip), %r8 + + /* Abs argument */ + movaps %xmm2, %xmm5 + + /* dXSign=0x001000000000 */ + psrlq $11, %xmm2 + + /* + * Load argument + * dM = x*2^K/log(2) + RShifter + */ + movups _dbInvLn2+__svml_dcosh_data_internal(%rip), %xmm3 + andnps %xmm4, %xmm5 + mulpd %xmm5, %xmm3 + movups _dbShifter+__svml_dcosh_data_internal(%rip), %xmm1 + addpd %xmm1, %xmm3 + + /* + * R + * dN = dM - RShifter + */ + movaps %xmm3, %xmm15 + subpd %xmm1, %xmm15 + + /* dR = dX - dN*Log2_hi/2^K */ + movups _dbLn2hi+__svml_dcosh_data_internal(%rip), %xmm14 + mulpd %xmm15, %xmm14 + + /* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */ + movups _dbLn2lo+__svml_dcosh_data_internal(%rip), %xmm1 + mulpd %xmm15, %xmm1 + + /* + * Check for overflow\underflow + * + */ + pshufd $221, %xmm5, %xmm7 + subpd %xmm14, %xmm5 + movq _iIndexMask+__svml_dcosh_data_internal(%rip), %xmm8 + + 
/* Index and lookup */ + pshufd $136, %xmm3, %xmm9 + + /* + * G1, G2, G3: dTdif, dTn * 2^N, 2^(-N) + * NB: copied from sinh_la - to be optimized!!!!! + */ + psllq $44, %xmm3 + + /* + * trick + * 256=-iIndex + */ + movq _iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm12 + pand %xmm8, %xmm9 + subpd %xmm1, %xmm5 + psubd %xmm9, %xmm12 + + /* iIndex*=3 */ + movdqa %xmm9, %xmm10 + + /* iDomainRange*=3 */ + pslld $3, %xmm12 + pslld $3, %xmm10 + movd %xmm12, %esi + pshufd $1, %xmm12, %xmm13 + movq _iDomainRange+__svml_dcosh_data_internal(%rip), %xmm6 + movd %xmm13, %edi + pcmpgtd %xmm6, %xmm7 + movmskps %xmm7, %eax + + /* dR2 = dR^2 */ + movaps %xmm5, %xmm7 + + /* lM now is an EXP(2^N) */ + pand _lExpMask+__svml_dcosh_data_internal(%rip), %xmm3 + pshufd $1, %xmm10, %xmm11 + movslq %esi, %rsi + mulpd %xmm5, %xmm7 + movd %xmm10, %edx + movsd (%r8, %rsi), %xmm6 + movd %xmm11, %ecx + movslq %edi, %rdi + movslq %edx, %rdx + movslq %ecx, %rcx + movhpd (%r8, %rdi), %xmm6 + + /* */ + psubq %xmm3, %xmm6 + + /* lX- = EXP(1/2) */ + psubq %xmm2, %xmm6 + + /* + * sinh(r) = r +r*r^2*a3 .... 
+ * dSinh_r = r^2*a3 + */ + movups _dPC3+__svml_dcosh_data_internal(%rip), %xmm2 + mulpd %xmm7, %xmm2 + + /* dSinh_r = r + r*r^2*a3 */ + mulpd %xmm5, %xmm2 + movsd (%r8, %rdx), %xmm0 + movhpd (%r8, %rcx), %xmm0 + paddq %xmm3, %xmm0 + addpd %xmm2, %xmm5 + + /* dTn = dTn*2^N - dTn*2^-N */ + movaps %xmm0, %xmm3 + subpd %xmm6, %xmm3 + + /* dTp = dTn*2^N + dTn*2^-N */ + addpd %xmm6, %xmm0 + mulpd %xmm5, %xmm3 + + /* poly(r) = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ + movups _dPC4+__svml_dcosh_data_internal(%rip), %xmm5 + mulpd %xmm7, %xmm5 + addpd _dPC2+__svml_dcosh_data_internal(%rip), %xmm5 + mulpd %xmm5, %xmm7 + + /* dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ + mulpd %xmm0, %xmm7 + addpd %xmm7, %xmm3 + + /* _VRES1 = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */ + addpd %xmm3, %xmm0 + andl $3, %eax + + /* Ret H */ + + /* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm4 + + /* Restore registers + * and exit the function + */ L(EXIT): - addq $72, %rsp - cfi_def_cfa_offset(8) - ret - cfi_def_cfa_offset(80) + addq $72, %rsp + cfi_def_cfa_offset(8) + ret + cfi_def_cfa_offset(80) -/* Branch to process - * special inputs - */ + /* Branch to process + * special inputs + */ L(SPECIAL_VALUES_BRANCH): - movups %xmm4, 32(%rsp) - movups %xmm0, 48(%rsp) - # LOE rbx rbp r12 r13 r14 r15 eax xmm0 - - xorl %edx, %edx - movq %r12, 16(%rsp) - cfi_offset(12, -64) - movl %edx, %r12d - movq %r13, 8(%rsp) - cfi_offset(13, -72) - movl %eax, %r13d - movq %r14, (%rsp) - cfi_offset(14, -80) - # LOE rbx rbp r15 r12d r13d - -/* Range mask - * bits check - */ + movups %xmm4, 32(%rsp) + movups %xmm0, 48(%rsp) + # LOE rbx rbp r12 r13 r14 r15 eax xmm0 + + xorl %edx, %edx + movq %r12, 16(%rsp) + cfi_offset(12, -64) + movl %edx, %r12d + movq %r13, 8(%rsp) + cfi_offset(13, -72) + movl %eax, %r13d + movq %r14, (%rsp) + cfi_offset(14, -80) + # LOE rbx rbp r15 r12d r13d + + /* Range mask + * bits check + */ L(RANGEMASK_CHECK): - btl %r12d, 
%r13d + btl %r12d, %r13d -/* Call scalar math function */ - jc L(SCALAR_MATH_CALL) - # LOE rbx rbp r15 r12d r13d + /* Call scalar math function */ + jc L(SCALAR_MATH_CALL) + # LOE rbx rbp r15 r12d r13d -/* Special inputs - * processing loop - */ + /* Special inputs + * processing loop + */ L(SPECIAL_VALUES_LOOP): - incl %r12d - cmpl $2, %r12d - -/* Check bits in range mask */ - jl L(RANGEMASK_CHECK) - # LOE rbx rbp r15 r12d r13d - - movq 16(%rsp), %r12 - cfi_restore(12) - movq 8(%rsp), %r13 - cfi_restore(13) - movq (%rsp), %r14 - cfi_restore(14) - movups 48(%rsp), %xmm0 - -/* Go to exit */ - jmp L(EXIT) - cfi_offset(12, -64) - cfi_offset(13, -72) - cfi_offset(14, -80) - # LOE rbx rbp r12 r13 r14 r15 xmm0 - -/* Scalar math fucntion call - * to process special input - */ + incl %r12d + cmpl $2, %r12d + + /* Check bits in range mask */ + jl L(RANGEMASK_CHECK) + # LOE rbx rbp r15 r12d r13d + + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + movups 48(%rsp), %xmm0 + + /* Go to exit */ + jmp L(EXIT) + cfi_offset(12, -64) + cfi_offset(13, -72) + cfi_offset(14, -80) + # LOE rbx rbp r12 r13 r14 r15 xmm0 + + /* Scalar math fucntion call + * to process special input + */ L(SCALAR_MATH_CALL): - movl %r12d, %r14d - movsd 32(%rsp,%r14,8), %xmm0 - call cosh@PLT - # LOE rbx rbp r14 r15 r12d r13d xmm0 + movl %r12d, %r14d + movsd 32(%rsp, %r14, 8), %xmm0 + call cosh@PLT + # LOE rbx rbp r14 r15 r12d r13d xmm0 - movsd %xmm0, 48(%rsp,%r14,8) + movsd %xmm0, 48(%rsp, %r14, 8) -/* Process special inputs in loop */ - jmp L(SPECIAL_VALUES_LOOP) - # LOE rbx rbp r15 r12d r13d + /* Process special inputs in loop */ + jmp L(SPECIAL_VALUES_LOOP) + # LOE rbx rbp r15 r12d r13d END(_ZGVbN2v_cosh_sse4) - .section .rodata, "a" - .align 16 + .section .rodata, "a" + .align 16 #ifdef __svml_dcosh_data_internal_typedef typedef unsigned int VUINT32; -typedef struct -{ - __declspec(align(16)) VUINT32 _dbT[(1 + (1<<8))][2]; //dTpj ONLY! 
- __declspec(align(16)) VUINT32 _dbInvLn2[2][2]; - __declspec(align(16)) VUINT32 _dbLn2hi[2][2]; - __declspec(align(16)) VUINT32 _dbLn2lo[2][2]; - __declspec(align(16)) VUINT32 _dbShifter[2][2]; - __declspec(align(16)) VUINT32 _iIndexMask[4][1]; //(1<