From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-oo1-xc29.google.com (mail-oo1-xc29.google.com [IPv6:2607:f8b0:4864:20::c29]) by sourceware.org (Postfix) with ESMTPS id AAB023948A5F for ; Wed, 23 Feb 2022 14:09:35 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org AAB023948A5F Received: by mail-oo1-xc29.google.com with SMTP id i10-20020a4aab0a000000b002fccf890d5fso22406271oon.5 for ; Wed, 23 Feb 2022 06:09:35 -0800 (PST) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=rbaOaESU0xM3jstDgSDs/SYsFJU7f3pS4YnLJenRfeY=; b=bcK2EjexOO3BuODyQVRyxxncbWpUoMSXdGEmef3HpWo1ymn/3nixfqxXP0ryO3rK8n jcTef3hzhZ9RVd/RpVaSN2K5vQEB//MfuJbPpR3/RIfE7hAeFPEYFs8+pM+DVn4ja4Hj ZCrtbdppJkhkR3HFu433pgbKexiSw/k6qzsMEtTJFxNdGQiFGa1W99O4ZyTdPipWPzkR s8ryv9xLLAEE0AwsnGcD5K86OWxk7Ickov3Fx8ISoCab0qDWtV6RjSjvsyTbpc8EMQL5 oZdFepejpsh7Yz4fo3ffQHX/AqLOq1m62Rx9MM+UNk+R+Enaag2JH3CyL6ijp0fvtyzG muxA== X-Gm-Message-State: AOAM533EkK0FY6k5FVQKnvLVxEnYFilIRAsKtChakYu2qno4dDbUykdi 0hc0UVO9NhOsRCoxnuxpAHT0GZPmtg9prA== X-Google-Smtp-Source: ABdhPJysCppLCgepv6FkvqSmFfamq+EMo8ajyUImGIsbHRQ0swdq/lr8PSx/UqeKnvitUf8AHyp7rg== X-Received: by 2002:a05:6820:827:b0:31c:923f:bced with SMTP id bg39-20020a056820082700b0031c923fbcedmr3701886oob.79.1645625374557; Wed, 23 Feb 2022 06:09:34 -0800 (PST) Received: from birita.. ([2804:431:c7ca:cb36:52bd:55cf:8e44:571]) by smtp.gmail.com with ESMTPSA id o22sm8801734otp.21.2022.02.23.06.09.33 for (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 23 Feb 2022 06:09:34 -0800 (PST) From: Adhemerval Zanella To: libc-alpha@sourceware.org Subject: [PATCH v2 06/11] ia64: Remove bzero optimization Date: Wed, 23 Feb 2022 11:09:16 -0300 Message-Id: <20220223140921.2768062-7-adhemerval.zanella@linaro.org> X-Mailer: git-send-email 2.32.0 In-Reply-To: <20220223140921.2768062-1-adhemerval.zanella@linaro.org> References: <20220223140921.2768062-1-adhemerval.zanella@linaro.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Spam-Status: No, score=-12.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 23 Feb 2022 14:09:46 -0000 The symbol is not present current POSIX specification and compiler already generates memset call. --- sysdeps/ia64/bzero.S | 312 ------------------------------------------- 1 file changed, 312 deletions(-) delete mode 100644 sysdeps/ia64/bzero.S diff --git a/sysdeps/ia64/bzero.S b/sysdeps/ia64/bzero.S deleted file mode 100644 index cd01abb436..0000000000 --- a/sysdeps/ia64/bzero.S +++ /dev/null @@ -1,312 +0,0 @@ -/* Optimized version of the standard bzero() function. - This file is part of the GNU C Library. - Copyright (C) 2000-2022 Free Software Foundation, Inc. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -/* Return: dest - - Inputs: - in0: dest - in1: count - - The algorithm is fairly straightforward: set byte by byte until we - we get to a 16B-aligned address, then loop on 128 B chunks using an - early store as prefetching, then loop on 32B chucks, then clear remaining - words, finally clear remaining bytes. - Since a stf.spill f0 can store 16B in one go, we use this instruction - to get peak speed. */ - -#include -#undef ret - -#define dest in0 -#define cnt in1 - -#define tmp r31 -#define save_lc r30 -#define ptr0 r29 -#define ptr1 r28 -#define ptr2 r27 -#define ptr3 r26 -#define ptr9 r24 -#define loopcnt r23 -#define linecnt r22 -#define bytecnt r21 - -// This routine uses only scratch predicate registers (p6 - p15) -#define p_scr p6 // default register for same-cycle branches -#define p_unalgn p9 -#define p_y p11 -#define p_n p12 -#define p_yy p13 -#define p_nn p14 - -#define movi0 mov - -#define MIN1 15 -#define MIN1P1HALF 8 -#define LINE_SIZE 128 -#define LSIZE_SH 7 // shift amount -#define PREF_AHEAD 8 - -#define USE_FLP -#if defined(USE_INT) -#define store st8 -#define myval r0 -#elif defined(USE_FLP) -#define store stf8 -#define myval f0 -#endif - -.align 64 -ENTRY(bzero) -{ .mmi - .prologue - alloc tmp = ar.pfs, 2, 0, 0, 0 - lfetch.nt1 [dest] - .save ar.lc, save_lc - movi0 save_lc = ar.lc -} { .mmi - .body - mov ret0 = dest // return value - nop.m 0 - cmp.eq p_scr, p0 = cnt, r0 -;; } -{ .mmi - and ptr2 = -(MIN1+1), dest // aligned address - and tmp = MIN1, dest // prepare to check for alignment - tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U) -} { .mib - mov ptr1 = dest - nop.i 0 -(p_scr) br.ret.dpnt.many rp // return immediately if count = 0 -;; } -{ .mib - cmp.ne p_unalgn, p0 = tmp, r0 -} { .mib // NB: # of bytes to move is 1 - sub bytecnt = (MIN1+1), tmp // higher than loopcnt - cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task? -(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U) -;; } -{ .mmi -(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment -(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment -(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ? -;; } -{ .mib -(p_y) add cnt = -8, cnt -(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ? -} { .mib -(p_y) st8 [ptr2] = r0,-4 -(p_n) add ptr2 = 4, ptr2 -;; } -{ .mib -(p_yy) add cnt = -4, cnt -(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ? -} { .mib -(p_yy) st4 [ptr2] = r0,-2 -(p_nn) add ptr2 = 2, ptr2 -;; } -{ .mmi - mov tmp = LINE_SIZE+1 // for compare -(p_y) add cnt = -2, cnt -(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ? -} { .mmi - nop.m 0 -(p_y) st2 [ptr2] = r0,-1 -(p_n) add ptr2 = 1, ptr2 -;; } - -{ .mmi -(p_yy) st1 [ptr2] = r0 - cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task? -} { .mbb -(p_yy) add cnt = -1, cnt -(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few -;; } -{ .mib - nop.m 0 - shr.u linecnt = cnt, LSIZE_SH - nop.b 0 -;; } - - .align 32 -.l1b: // ------------------// L1B: store ahead into cache lines; fill later -{ .mmi - and tmp = -(LINE_SIZE), cnt // compute end of range - mov ptr9 = ptr1 // used for prefetching - and cnt = (LINE_SIZE-1), cnt // remainder -} { .mmi - mov loopcnt = PREF_AHEAD-1 // default prefetch loop - cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value -;; } -{ .mmi -(p_scr) add loopcnt = -1, linecnt - add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores) - add ptr1 = tmp, ptr1 // first address beyond total range -;; } -{ .mmi - add tmp = -1, linecnt // next loop count - movi0 ar.lc = loopcnt -;; } -.pref_l1b: -{ .mib - stf.spill [ptr9] = f0, 128 // Do stores one cache line apart - nop.i 0 - br.cloop.dptk.few .pref_l1b -;; } -{ .mmi - add ptr0 = 16, ptr2 // Two stores in parallel - movi0 ar.lc = tmp -;; } -.l1bx: - { .mmi - stf.spill [ptr2] = f0, 32 - stf.spill [ptr0] = f0, 32 - ;; } - { .mmi - stf.spill [ptr2] = f0, 32 - stf.spill [ptr0] = f0, 32 - ;; } - { .mmi - stf.spill [ptr2] = f0, 32 - stf.spill [ptr0] = f0, 64 - cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? - ;; } -{ .mmb - stf.spill [ptr2] = f0, 32 -(p_scr) stf.spill [ptr9] = f0, 128 - br.cloop.dptk.few .l1bx -;; } -{ .mib - cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? -(p_scr) br.cond.dpnt.many .move_bytes_from_alignment -;; } - -.fraction_of_line: -{ .mib - add ptr2 = 16, ptr1 - shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32 -;; } -{ .mib - cmp.eq p_scr, p0 = loopcnt, r0 - add loopcnt = -1, loopcnt -(p_scr) br.cond.dpnt.many .store_words -;; } -{ .mib - and cnt = 0x1f, cnt // compute the remaining cnt - movi0 ar.lc = loopcnt -;; } - .align 32 -.l2: // -----------------------------// L2A: store 32B in 2 cycles -{ .mmb - store [ptr1] = myval, 8 - store [ptr2] = myval, 8 -;; } { .mmb - store [ptr1] = myval, 24 - store [ptr2] = myval, 24 - br.cloop.dptk.many .l2 -;; } -.store_words: -{ .mib - cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? -(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch -;; } - -{ .mmi - store [ptr1] = myval, 8 // store - cmp.le p_y, p_n = 16, cnt // - add cnt = -8, cnt // subtract -;; } -{ .mmi -(p_y) store [ptr1] = myval, 8 // store -(p_y) cmp.le.unc p_yy, p_nn = 16, cnt -(p_y) add cnt = -8, cnt // subtract -;; } -{ .mmi // store -(p_yy) store [ptr1] = myval, 8 -(p_yy) add cnt = -8, cnt // subtract -;; } - -.move_bytes_from_alignment: -{ .mib - cmp.eq p_scr, p0 = cnt, r0 - tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ? -(p_scr) br.cond.dpnt.few .restore_and_exit -;; } -{ .mib -(p_y) st4 [ptr1] = r0,4 - tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ? -;; } -{ .mib -(p_yy) st2 [ptr1] = r0,2 - tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ? -;; } - -{ .mib -(p_y) st1 [ptr1] = r0 -;; } -.restore_and_exit: -{ .mib - nop.m 0 - movi0 ar.lc = save_lc - br.ret.sptk.many rp -;; } - -.move_bytes_unaligned: -{ .mmi - .pred.rel "mutex",p_y, p_n - .pred.rel "mutex",p_yy, p_nn -(p_n) cmp.le p_yy, p_nn = 4, cnt -(p_y) cmp.le p_yy, p_nn = 5, cnt -(p_n) add ptr2 = 2, ptr1 -} { .mmi -(p_y) add ptr2 = 3, ptr1 -(p_y) st1 [ptr1] = r0, 1 // fill 1 (odd-aligned) byte -(p_y) add cnt = -1, cnt // [15, 14 (or less) left] -;; } -{ .mmi -(p_yy) cmp.le.unc p_y, p0 = 8, cnt - add ptr3 = ptr1, cnt // prepare last store - movi0 ar.lc = save_lc -} { .mmi -(p_yy) st2 [ptr1] = r0, 4 // fill 2 (aligned) bytes -(p_yy) st2 [ptr2] = r0, 4 // fill 2 (aligned) bytes -(p_yy) add cnt = -4, cnt // [11, 10 (o less) left] -;; } -{ .mmi -(p_y) cmp.le.unc p_yy, p0 = 8, cnt - add ptr3 = -1, ptr3 // last store - tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ? -} { .mmi -(p_y) st2 [ptr1] = r0, 4 // fill 2 (aligned) bytes -(p_y) st2 [ptr2] = r0, 4 // fill 2 (aligned) bytes -(p_y) add cnt = -4, cnt // [7, 6 (or less) left] -;; } -{ .mmi -(p_yy) st2 [ptr1] = r0, 4 // fill 2 (aligned) bytes -(p_yy) st2 [ptr2] = r0, 4 // fill 2 (aligned) bytes - // [3, 2 (or less) left] - tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ? -} { .mmi -(p_yy) add cnt = -4, cnt -;; } -{ .mmb -(p_scr) st2 [ptr1] = r0 // fill 2 (aligned) bytes -(p_y) st1 [ptr3] = r0 // fill last byte (using ptr3) - br.ret.sptk.many rp -;; } -END(bzero) -- 2.32.0