From mboxrd@z Thu Jan 1 00:00:00 1970
From: Noah Goldstein
To: libc-alpha@sourceware.org
Subject: [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2
Date: Wed, 20 Apr 2022 22:14:11 -0500
Message-Id: <20220421031410.2142238-4-goldstein.w.n@gmail.com>
In-Reply-To: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
References: <20220421031410.2142238-1-goldstein.w.n@gmail.com>

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr

Geometric Mean of N=30 runs.
Geometric Mean of all benchmarks New / Old: 0.832 Benchmarks performance on Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html len, align, pos, seek, max_char, freq, New Time / Old Time 2048, 0, 32, 0, 127, 1, 0.673 2048, 1, 32, 0, 127, 1, 0.68 2048, 0, 64, 0, 127, 1, 0.566 2048, 2, 64, 0, 127, 1, 0.574 2048, 0, 128, 0, 127, 1, 0.976 2048, 3, 128, 0, 127, 1, 0.967 2048, 0, 256, 0, 127, 1, 0.931 2048, 4, 256, 0, 127, 1, 0.921 2048, 0, 512, 0, 127, 1, 0.792 2048, 5, 512, 0, 127, 1, 0.78 2048, 0, 1024, 0, 127, 1, 0.733 2048, 6, 1024, 0, 127, 1, 0.729 2048, 0, 2048, 0, 127, 1, 0.795 2048, 7, 2048, 0, 127, 1, 0.805 2048, 0, 4096, 0, 127, 1, 0.803 2048, 8, 4096, 0, 127, 1, 0.794 256, 1, 64, 0, 127, 1, 0.584 256, 15, 64, 0, 127, 1, 0.587 256, 2, 64, 0, 127, 1, 0.586 256, 30, 64, 0, 127, 1, 0.592 256, 3, 64, 0, 127, 1, 0.586 256, 45, 64, 0, 127, 1, 0.505 256, 4, 64, 0, 127, 1, 0.59 256, 60, 64, 0, 127, 1, 0.501 256, 5, 64, 0, 127, 1, 0.595 256, 75, 64, 0, 127, 1, 0.588 256, 6, 64, 0, 127, 1, 0.593 256, 90, 64, 0, 127, 1, 0.594 256, 7, 64, 0, 127, 1, 0.596 256, 105, 64, 0, 127, 1, 0.506 1, 0, 0, 0, 127, 1, 0.872 2, 0, 1, 0, 127, 1, 0.861 3, 0, 2, 0, 127, 1, 0.862 4, 0, 3, 0, 127, 1, 0.884 5, 0, 4, 0, 127, 1, 0.869 6, 0, 5, 0, 127, 1, 0.861 7, 0, 6, 0, 127, 1, 0.865 8, 0, 7, 0, 127, 1, 0.884 9, 0, 8, 0, 127, 1, 0.862 10, 0, 9, 0, 127, 1, 0.889 11, 0, 10, 0, 127, 1, 0.9 12, 0, 11, 0, 127, 1, 0.897 13, 0, 12, 0, 127, 1, 0.909 14, 0, 13, 0, 127, 1, 0.885 15, 0, 14, 0, 127, 1, 0.929 16, 0, 15, 0, 127, 1, 0.871 17, 0, 16, 0, 127, 1, 0.875 18, 0, 17, 0, 127, 1, 0.878 19, 0, 18, 0, 127, 1, 0.889 20, 0, 19, 0, 127, 1, 0.89 21, 0, 20, 0, 127, 1, 0.901 22, 0, 21, 0, 127, 1, 0.91 23, 0, 22, 0, 127, 1, 0.912 24, 0, 23, 0, 127, 1, 0.907 25, 0, 24, 0, 127, 1, 0.947 26, 0, 25, 0, 127, 1, 0.904 27, 0, 26, 0, 127, 1, 0.921 28, 0, 27, 0, 127, 1, 0.899 29, 0, 28, 0, 127, 1, 0.923 30, 0, 29, 0, 127, 1, 0.918 31, 0, 30, 0, 127, 1, 0.943 32, 0, 31, 0, 127, 1, 0.914 2048, 0, 32, 23, 127, 1, 0.815 2048, 1, 32, 23, 127, 1, 0.829 2048, 0, 64, 23, 127, 1, 0.884 2048, 2, 64, 23, 127, 1, 0.882 2048, 0, 128, 23, 127, 1, 0.884 2048, 3, 128, 23, 127, 1, 0.851 2048, 0, 256, 23, 127, 1, 0.843 2048, 4, 256, 23, 127, 1, 0.867 2048, 0, 512, 23, 127, 1, 0.746 2048, 5, 512, 23, 127, 1, 0.863 2048, 0, 1024, 23, 127, 1, 0.662 2048, 6, 1024, 23, 127, 1, 0.683 2048, 0, 2048, 23, 127, 1, 0.852 2048, 7, 2048, 23, 127, 1, 0.837 2048, 0, 4096, 23, 127, 1, 0.837 2048, 8, 4096, 23, 127, 1, 0.829 256, 1, 64, 23, 127, 1, 0.934 256, 15, 64, 23, 127, 1, 0.936 256, 2, 64, 23, 127, 1, 0.931 256, 30, 64, 23, 127, 1, 0.938 256, 3, 64, 23, 127, 1, 0.927 256, 45, 64, 23, 127, 1, 0.863 256, 4, 64, 23, 127, 1, 0.939 256, 60, 64, 23, 127, 1, 0.871 256, 5, 64, 23, 127, 1, 0.94 256, 75, 64, 23, 127, 1, 0.933 256, 6, 64, 23, 127, 1, 0.915 256, 90, 64, 23, 127, 1, 0.934 256, 7, 64, 23, 127, 1, 0.938 256, 105, 64, 23, 127, 1, 0.871 1, 0, 0, 23, 127, 1, 0.865 2, 0, 1, 23, 127, 1, 0.87 3, 0, 2, 23, 127, 1, 0.882 4, 0, 3, 23, 127, 1, 0.901 5, 0, 4, 23, 127, 1, 0.879 6, 0, 5, 23, 127, 1, 0.934 7, 0, 6, 23, 127, 1, 0.874 8, 0, 7, 23, 127, 1, 0.895 9, 0, 8, 23, 127, 1, 0.873 10, 0, 9, 23, 127, 1, 0.861 11, 0, 10, 23, 127, 1, 0.865 12, 0, 11, 23, 127, 1, 0.875 13, 0, 12, 23, 127, 1, 0.878 14, 0, 13, 23, 127, 1, 0.86 15, 0, 14, 23, 127, 1, 0.889 16, 0, 15, 23, 127, 1, 0.875 17, 0, 16, 23, 127, 1, 0.911 18, 0, 17, 23, 127, 1, 0.891 19, 0, 18, 23, 127, 1, 0.921 20, 0, 19, 23, 127, 1, 
0.898 21, 0, 20, 23, 127, 1, 0.895 22, 0, 21, 23, 127, 1, 0.906 23, 0, 22, 23, 127, 1, 0.911 24, 0, 23, 23, 127, 1, 0.877 25, 0, 24, 23, 127, 1, 0.9 26, 0, 25, 23, 127, 1, 0.911 27, 0, 26, 23, 127, 1, 0.926 28, 0, 27, 23, 127, 1, 0.918 29, 0, 28, 23, 127, 1, 0.952 30, 0, 29, 23, 127, 1, 0.943 31, 0, 30, 23, 127, 1, 0.934 32, 0, 31, 23, 127, 1, 0.8 2048, 0, 32, 23, 127, 2, 0.872 2048, 1, 32, 23, 127, 2, 0.819 2048, 0, 64, 23, 127, 2, 0.815 2048, 2, 64, 23, 127, 2, 0.805 2048, 0, 128, 23, 127, 2, 0.884 2048, 3, 128, 23, 127, 2, 0.852 2048, 0, 256, 23, 127, 2, 0.873 2048, 4, 256, 23, 127, 2, 0.871 2048, 0, 512, 23, 127, 2, 0.654 2048, 5, 512, 23, 127, 2, 0.762 2048, 0, 1024, 23, 127, 2, 0.646 2048, 6, 1024, 23, 127, 2, 0.665 2048, 0, 2048, 23, 127, 2, 0.678 2048, 7, 2048, 23, 127, 2, 0.675 2048, 0, 4096, 23, 127, 2, 0.849 2048, 8, 4096, 23, 127, 2, 0.835 256, 1, 64, 23, 127, 2, 0.917 256, 15, 64, 23, 127, 2, 0.915 256, 2, 64, 23, 127, 2, 0.911 256, 30, 64, 23, 127, 2, 0.907 256, 3, 64, 23, 127, 2, 0.9 256, 45, 64, 23, 127, 2, 0.816 256, 4, 64, 23, 127, 2, 0.912 256, 60, 64, 23, 127, 2, 0.81 256, 5, 64, 23, 127, 2, 0.904 256, 75, 64, 23, 127, 2, 0.911 256, 6, 64, 23, 127, 2, 0.898 256, 90, 64, 23, 127, 2, 0.912 256, 7, 64, 23, 127, 2, 0.909 256, 105, 64, 23, 127, 2, 0.81 1, 0, 0, 23, 127, 2, 0.858 2, 0, 1, 23, 127, 2, 0.89 3, 0, 2, 23, 127, 2, 0.877 4, 0, 3, 23, 127, 2, 0.863 5, 0, 4, 23, 127, 2, 0.863 6, 0, 5, 23, 127, 2, 0.889 7, 0, 6, 23, 127, 2, 0.898 8, 0, 7, 23, 127, 2, 0.885 9, 0, 8, 23, 127, 2, 0.863 10, 0, 9, 23, 127, 2, 0.902 11, 0, 10, 23, 127, 2, 0.865 12, 0, 11, 23, 127, 2, 0.864 13, 0, 12, 23, 127, 2, 0.87 14, 0, 13, 23, 127, 2, 0.862 15, 0, 14, 23, 127, 2, 0.861 16, 0, 15, 23, 127, 2, 0.859 17, 0, 16, 23, 127, 2, 0.87 18, 0, 17, 23, 127, 2, 0.892 19, 0, 18, 23, 127, 2, 0.874 20, 0, 19, 23, 127, 2, 0.866 21, 0, 20, 23, 127, 2, 0.877 22, 0, 21, 23, 127, 2, 0.868 23, 0, 22, 23, 127, 2, 0.884 24, 0, 23, 23, 127, 2, 0.881 25, 0, 24, 23, 127, 2, 0.872 26, 0, 25, 23, 127, 2, 0.866 27, 0, 26, 23, 127, 2, 0.881 28, 0, 27, 23, 127, 2, 0.93 29, 0, 28, 23, 127, 2, 0.886 30, 0, 29, 23, 127, 2, 0.869 31, 0, 30, 23, 127, 2, 0.869 32, 0, 31, 23, 127, 2, 0.667 2048, 0, 32, 23, 127, 4, 0.858 2048, 1, 32, 23, 127, 4, 0.858 2048, 0, 64, 23, 127, 4, 0.838 2048, 2, 64, 23, 127, 4, 0.834 2048, 0, 128, 23, 127, 4, 0.85 2048, 3, 128, 23, 127, 4, 0.762 2048, 0, 256, 23, 127, 4, 0.874 2048, 4, 256, 23, 127, 4, 0.796 2048, 0, 512, 23, 127, 4, 0.691 2048, 5, 512, 23, 127, 4, 0.755 2048, 0, 1024, 23, 127, 4, 0.676 2048, 6, 1024, 23, 127, 4, 0.661 2048, 0, 2048, 23, 127, 4, 0.678 2048, 7, 2048, 23, 127, 4, 0.678 2048, 0, 4096, 23, 127, 4, 0.676 2048, 8, 4096, 23, 127, 4, 0.677 256, 1, 64, 23, 127, 4, 0.875 256, 15, 64, 23, 127, 4, 0.877 256, 2, 64, 23, 127, 4, 0.875 256, 30, 64, 23, 127, 4, 0.875 256, 3, 64, 23, 127, 4, 0.878 256, 45, 64, 23, 127, 4, 0.829 256, 4, 64, 23, 127, 4, 0.876 256, 60, 64, 23, 127, 4, 0.807 256, 5, 64, 23, 127, 4, 0.874 256, 75, 64, 23, 127, 4, 0.872 256, 6, 64, 23, 127, 4, 0.874 256, 90, 64, 23, 127, 4, 0.874 256, 7, 64, 23, 127, 4, 0.873 256, 105, 64, 23, 127, 4, 0.826 1, 0, 0, 23, 127, 4, 0.863 2, 0, 1, 23, 127, 4, 0.861 3, 0, 2, 23, 127, 4, 0.863 4, 0, 3, 23, 127, 4, 0.867 5, 0, 4, 23, 127, 4, 0.866 6, 0, 5, 23, 127, 4, 0.873 7, 0, 6, 23, 127, 4, 0.873 8, 0, 7, 23, 127, 4, 0.866 9, 0, 8, 23, 127, 4, 0.861 10, 0, 9, 23, 127, 4, 0.861 11, 0, 10, 23, 127, 4, 0.857 12, 0, 11, 23, 127, 4, 0.864 13, 0, 12, 23, 127, 4, 0.86 14, 0, 13, 23, 127, 4, 0.859 15, 0, 14, 23, 127, 4, 0.854 16, 
0, 15, 23, 127, 4, 0.857 17, 0, 16, 23, 127, 4, 0.881 18, 0, 17, 23, 127, 4, 0.863 19, 0, 18, 23, 127, 4, 0.86 20, 0, 19, 23, 127, 4, 0.906 21, 0, 20, 23, 127, 4, 0.924 22, 0, 21, 23, 127, 4, 0.885 23, 0, 22, 23, 127, 4, 0.861 24, 0, 23, 23, 127, 4, 0.907 25, 0, 24, 23, 127, 4, 0.909 26, 0, 25, 23, 127, 4, 0.863 27, 0, 26, 23, 127, 4, 0.862 28, 0, 27, 23, 127, 4, 0.887 29, 0, 28, 23, 127, 4, 0.879 30, 0, 29, 23, 127, 4, 0.932 31, 0, 30, 23, 127, 4, 0.895 32, 0, 31, 23, 127, 4, 0.666 2048, 0, 32, 23, 127, 8, 0.865 2048, 1, 32, 23, 127, 8, 0.892 2048, 0, 64, 23, 127, 8, 0.85 2048, 2, 64, 23, 127, 8, 0.834 2048, 0, 128, 23, 127, 8, 0.823 2048, 3, 128, 23, 127, 8, 0.809 2048, 0, 256, 23, 127, 8, 0.84 2048, 4, 256, 23, 127, 8, 0.738 2048, 0, 512, 23, 127, 8, 0.656 2048, 5, 512, 23, 127, 8, 0.644 2048, 0, 1024, 23, 127, 8, 0.705 2048, 6, 1024, 23, 127, 8, 0.708 2048, 0, 2048, 23, 127, 8, 0.701 2048, 7, 2048, 23, 127, 8, 0.7 2048, 0, 4096, 23, 127, 8, 0.68 2048, 8, 4096, 23, 127, 8, 0.678 256, 1, 64, 23, 127, 8, 0.881 256, 15, 64, 23, 127, 8, 0.879 256, 2, 64, 23, 127, 8, 0.878 256, 30, 64, 23, 127, 8, 0.877 256, 3, 64, 23, 127, 8, 0.88 256, 45, 64, 23, 127, 8, 0.829 256, 4, 64, 23, 127, 8, 0.883 256, 60, 64, 23, 127, 8, 0.808 256, 5, 64, 23, 127, 8, 0.875 256, 75, 64, 23, 127, 8, 0.877 256, 6, 64, 23, 127, 8, 0.874 256, 90, 64, 23, 127, 8, 0.874 256, 7, 64, 23, 127, 8, 0.874 256, 105, 64, 23, 127, 8, 0.83 1, 0, 0, 23, 127, 8, 0.862 2, 0, 1, 23, 127, 8, 0.865 3, 0, 2, 23, 127, 8, 0.866 4, 0, 3, 23, 127, 8, 0.863 5, 0, 4, 23, 127, 8, 0.874 6, 0, 5, 23, 127, 8, 0.87 7, 0, 6, 23, 127, 8, 0.87 8, 0, 7, 23, 127, 8, 0.864 9, 0, 8, 23, 127, 8, 0.87 10, 0, 9, 23, 127, 8, 0.861 11, 0, 10, 23, 127, 8, 0.862 12, 0, 11, 23, 127, 8, 0.87 13, 0, 12, 23, 127, 8, 0.858 14, 0, 13, 23, 127, 8, 0.86 15, 0, 14, 23, 127, 8, 0.863 16, 0, 15, 23, 127, 8, 0.866 17, 0, 16, 23, 127, 8, 0.86 18, 0, 17, 23, 127, 8, 0.887 19, 0, 18, 23, 127, 8, 0.858 20, 0, 19, 23, 127, 8, 0.891 21, 0, 20, 23, 127, 8, 0.874 22, 0, 21, 23, 127, 8, 0.891 23, 0, 22, 23, 127, 8, 0.873 24, 0, 23, 23, 127, 8, 0.895 25, 0, 24, 23, 127, 8, 0.884 26, 0, 25, 23, 127, 8, 0.878 27, 0, 26, 23, 127, 8, 0.878 28, 0, 27, 23, 127, 8, 0.891 29, 0, 28, 23, 127, 8, 0.91 30, 0, 29, 23, 127, 8, 0.881 31, 0, 30, 23, 127, 8, 0.917 32, 0, 31, 23, 127, 8, 0.667 2048, 0, 32, 23, 127, 16, 0.86 2048, 1, 32, 23, 127, 16, 0.847 2048, 0, 64, 23, 127, 16, 0.846 2048, 2, 64, 23, 127, 16, 0.852 2048, 0, 128, 23, 127, 16, 0.82 2048, 3, 128, 23, 127, 16, 0.751 2048, 0, 256, 23, 127, 16, 0.788 2048, 4, 256, 23, 127, 16, 0.712 2048, 0, 512, 23, 127, 16, 0.524 2048, 5, 512, 23, 127, 16, 0.517 2048, 0, 1024, 23, 127, 16, 0.583 2048, 6, 1024, 23, 127, 16, 0.682 2048, 0, 2048, 23, 127, 16, 0.77 2048, 7, 2048, 23, 127, 16, 0.659 2048, 0, 4096, 23, 127, 16, 0.7 2048, 8, 4096, 23, 127, 16, 0.7 256, 1, 64, 23, 127, 16, 0.798 256, 15, 64, 23, 127, 16, 0.873 256, 2, 64, 23, 127, 16, 0.875 256, 30, 64, 23, 127, 16, 0.877 256, 3, 64, 23, 127, 16, 0.875 256, 45, 64, 23, 127, 16, 0.834 256, 4, 64, 23, 127, 16, 0.873 256, 60, 64, 23, 127, 16, 0.809 256, 5, 64, 23, 127, 16, 0.879 256, 75, 64, 23, 127, 16, 0.884 256, 6, 64, 23, 127, 16, 0.874 256, 90, 64, 23, 127, 16, 0.876 256, 7, 64, 23, 127, 16, 0.876 256, 105, 64, 23, 127, 16, 0.827 1, 0, 0, 23, 127, 16, 0.859 2, 0, 1, 23, 127, 16, 0.864 3, 0, 2, 23, 127, 16, 0.871 4, 0, 3, 23, 127, 16, 0.869 5, 0, 4, 23, 127, 16, 0.881 6, 0, 5, 23, 127, 16, 0.869 7, 0, 6, 23, 127, 16, 0.867 8, 0, 7, 23, 127, 16, 0.877 9, 0, 8, 23, 127, 16, 0.862 10, 0, 9, 
23, 127, 16, 0.861 11, 0, 10, 23, 127, 16, 0.859 12, 0, 11, 23, 127, 16, 0.858 13, 0, 12, 23, 127, 16, 0.867 14, 0, 13, 23, 127, 16, 0.857 15, 0, 14, 23, 127, 16, 0.858 16, 0, 15, 23, 127, 16, 0.857 17, 0, 16, 23, 127, 16, 0.858 18, 0, 17, 23, 127, 16, 0.867 19, 0, 18, 23, 127, 16, 0.875 20, 0, 19, 23, 127, 16, 0.868 21, 0, 20, 23, 127, 16, 0.861 22, 0, 21, 23, 127, 16, 0.868 23, 0, 22, 23, 127, 16, 0.866 24, 0, 23, 23, 127, 16, 0.858 25, 0, 24, 23, 127, 16, 0.859 26, 0, 25, 23, 127, 16, 0.857 27, 0, 26, 23, 127, 16, 0.866 28, 0, 27, 23, 127, 16, 0.875 29, 0, 28, 23, 127, 16, 0.896 30, 0, 29, 23, 127, 16, 0.889 31, 0, 30, 23, 127, 16, 0.903 32, 0, 31, 23, 127, 16, 0.667 sysdeps/x86_64/multiarch/strrchr-avx2.S | 415 +++++++++++++++--------- 1 file changed, 258 insertions(+), 157 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index 1df2adfad0..9d1e45defc 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -27,9 +27,13 @@ # ifdef USE_AS_WCSRCHR # define VPBROADCAST vpbroadcastd # define VPCMPEQ vpcmpeqd +# define VPMIN vpminud +# define CHAR_SIZE 4 # else # define VPBROADCAST vpbroadcastb # define VPCMPEQ vpcmpeqb +# define VPMIN vpminub +# define CHAR_SIZE 1 # endif # ifndef VZEROUPPER @@ -41,196 +45,293 @@ # endif # define VEC_SIZE 32 +# define PAGE_SIZE 4096 - .section SECTION(.text),"ax",@progbits -ENTRY (STRRCHR) - movd %esi, %xmm4 - movl %edi, %ecx + .section SECTION(.text), "ax", @progbits +ENTRY(STRRCHR) + movd %esi, %xmm7 + movl %edi, %eax /* Broadcast CHAR to YMM4. */ - VPBROADCAST %xmm4, %ymm4 + VPBROADCAST %xmm7, %ymm7 vpxor %xmm0, %xmm0, %xmm0 - /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + /* Shift here instead of `andl` to save code size (saves a fetch + block). */ + sall $20, %eax + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax + ja L(cross_page) +L(page_cross_continue): vmovdqu (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - addq $VEC_SIZE, %rdi + /* Check end of string match. */ + VPCMPEQ %ymm1, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + testl %ecx, %ecx + jz L(aligned_more) + + /* Only check match with search CHAR if needed. */ + VPCMPEQ %ymm1, %ymm7, %ymm1 + vpmovmskb %ymm1, %eax + /* Check if match before first zero. */ + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax + addq %rdi, %rax + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If + search CHAR is zero we are correct. Either way `andq + -CHAR_SIZE, %rax` gets the correct result. */ +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret0): +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + /* Returns for first vec x1/x2 have hard coded backward search + path for earlier matches. 
*/ + .p2align 4,, 10 +L(first_vec_x1): + VPCMPEQ %ymm2, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jnz L(first_vec_x1_return) + + .p2align 4,, 4 +L(first_vec_x0_test): + VPCMPEQ %ymm1, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + testl %eax, %eax + jz L(ret1) + bsrl %eax, %eax + addq %r8, %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret1): + VZEROUPPER_RETURN + .p2align 4,, 10 +L(first_vec_x0_x1_test): + VPCMPEQ %ymm2, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax testl %eax, %eax - jnz L(first_vec) + jz L(first_vec_x0_test) + .p2align 4,, 4 +L(first_vec_x1_return): + bsrl %eax, %eax + leaq 1(%rdi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN - testl %ecx, %ecx - jnz L(return_null) - andq $-VEC_SIZE, %rdi - xorl %edx, %edx - jmp L(aligned_loop) + .p2align 4,, 10 +L(first_vec_x2): + VPCMPEQ %ymm3, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(first_vec_x0_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE + 1)(%rdi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN + .p2align 4 -L(first_vec): - /* Check if there is a nul CHAR. */ +L(aligned_more): + /* Save original pointer if match was in VEC 0. */ + movq %rdi, %r8 + + /* Align src. */ + orq $(VEC_SIZE - 1), %rdi + vmovdqu 1(%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx testl %ecx, %ecx - jnz L(char_and_nul_in_first_vec) + jnz L(first_vec_x1) - /* Remember the match and keep searching. */ - movl %eax, %edx - movq %rdi, %rsi - andq $-VEC_SIZE, %rdi - jmp L(aligned_loop) + vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 + VPCMPEQ %ymm3, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) + /* Save pointer again before realigning. */ + movq %rdi, %rsi + addq $(VEC_SIZE + 1), %rdi + andq $-(VEC_SIZE * 2), %rdi .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %edx - vpmovmskb %ymm3, %eax - shrl %cl, %edx - shrl %cl, %eax - addq $VEC_SIZE, %rdi - - /* Check if there is a CHAR. */ +L(first_aligned_loop): + /* Do 2x VEC at a time. Any more and the cost of finding the + match outweights loop benefit. */ + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 + + VPCMPEQ %ymm4, %ymm7, %ymm6 + VPMIN %ymm4, %ymm5, %ymm8 + VPCMPEQ %ymm5, %ymm7, %ymm10 + vpor %ymm6, %ymm10, %ymm5 + VPCMPEQ %ymm8, %ymm0, %ymm8 + vpor %ymm5, %ymm8, %ymm9 + + vpmovmskb %ymm9, %eax + addq $(VEC_SIZE * 2), %rdi + /* No zero or search CHAR. */ testl %eax, %eax - jnz L(found_char) - - testl %edx, %edx - jnz L(return_null) + jz L(first_aligned_loop) - jmp L(aligned_loop) - - .p2align 4 -L(found_char): - testl %edx, %edx - jnz L(char_and_nul) + /* If no zero CHAR then go to second loop (this allows us to + throw away all prior work). */ + vpmovmskb %ymm8, %ecx + testl %ecx, %ecx + jz L(second_aligned_loop_prep) - /* Remember the match and keep searching. */ - movl %eax, %edx - leaq (%rdi, %rcx), %rsi + /* Search char could be zero so we need to get the true match. 
+ */ + vpmovmskb %ymm5, %eax + testl %eax, %eax + jnz L(first_aligned_loop_return) - .p2align 4 -L(aligned_loop): - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - add $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx + .p2align 4,, 4 +L(first_vec_x1_or_x2): + VPCMPEQ %ymm3, %ymm7, %ymm3 + VPCMPEQ %ymm2, %ymm7, %ymm2 vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jz L(aligned_loop) - - .p2align 4 -L(char_nor_null): - /* Find a CHAR or a nul CHAR in a loop. */ - testl %eax, %eax - jnz L(match) -L(return_value): - testl %edx, %edx - jz L(return_null) - movl %edx, %eax - movq %rsi, %rdi + vpmovmskb %ymm2, %edx + /* Use add for macro-fusion. */ + addq %rax, %rdx + jz L(first_vec_x0_test) + /* NB: We could move this shift to before the branch and save a + bit of code size / performance on the fall through. The + branch leads to the null case which generally seems hotter + than char in first 3x VEC. */ + salq $32, %rax + addq %rdx, %rax + bsrq %rax, %rax + leaq 1(%rsi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN + .p2align 4,, 8 +L(first_aligned_loop_return): + VPCMPEQ %ymm4, %ymm0, %ymm4 + vpmovmskb %ymm4, %edx + salq $32, %rcx + orq %rdx, %rcx + + vpmovmskb %ymm10, %eax + vpmovmskb %ymm6, %edx + salq $32, %rax + orq %rdx, %rax + blsmskq %rcx, %rcx + andq %rcx, %rax + jz L(first_vec_x1_or_x2) + + bsrq %rax, %rax + leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax # ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %eax + andq $-CHAR_SIZE, %rax # endif - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN + VZEROUPPER_RETURN + /* Search char cannot be zero. */ .p2align 4 -L(match): - /* Find a CHAR. Check if there is a nul CHAR. */ - vpmovmskb %ymm2, %ecx - testl %ecx, %ecx - jnz L(find_nul) - - /* Remember the match and keep searching. */ - movl %eax, %edx +L(second_aligned_loop_set_furthest_match): + /* Save VEC and pointer from most recent match. */ +L(second_aligned_loop_prep): movq %rdi, %rsi - jmp L(aligned_loop) + vmovdqu %ymm6, %ymm2 + vmovdqu %ymm10, %ymm3 .p2align 4 -L(find_nul): -# ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %ecx - andl $0x11111111, %eax -# endif - /* Mask out any matching bits after the nul CHAR. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax +L(second_aligned_loop): + /* Search 2x at at time. */ + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 + + VPCMPEQ %ymm4, %ymm7, %ymm6 + VPMIN %ymm4, %ymm5, %ymm1 + VPCMPEQ %ymm5, %ymm7, %ymm10 + vpor %ymm6, %ymm10, %ymm5 + VPCMPEQ %ymm1, %ymm0, %ymm1 + vpor %ymm5, %ymm1, %ymm9 + + vpmovmskb %ymm9, %eax + addq $(VEC_SIZE * 2), %rdi testl %eax, %eax - /* If there is no CHAR here, return the remembered one. 
*/ - jz L(return_value) - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER_RETURN - - .p2align 4 -L(char_and_nul): - /* Find both a CHAR and a nul CHAR. */ - addq %rcx, %rdi - movl %edx, %ecx -L(char_and_nul_in_first_vec): -# ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %ecx - andl $0x11111111, %eax -# endif - /* Mask out any matching bits after the nul CHAR. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax + jz L(second_aligned_loop) + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jz L(second_aligned_loop_set_furthest_match) + vpmovmskb %ymm5, %eax testl %eax, %eax - /* Return null pointer if the nul CHAR comes first. */ - jz L(return_null) - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax + jnz L(return_new_match) + + /* This is the hot patch. We know CHAR is inbounds and that + ymm3/ymm2 have latest match. */ + .p2align 4,, 4 +L(return_old_match): + vpmovmskb %ymm3, %eax + vpmovmskb %ymm2, %edx + salq $32, %rax + orq %rdx, %rax + bsrq %rax, %rax + /* Search char cannot be zero so safe to just use lea for + wcsrchr. */ + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax VZEROUPPER_RETURN - .p2align 4 -L(return_null): - xorl %eax, %eax + /* Last iteration also potentially has a match. */ + .p2align 4,, 8 +L(return_new_match): + VPCMPEQ %ymm4, %ymm0, %ymm4 + vpmovmskb %ymm4, %edx + salq $32, %rcx + orq %rdx, %rcx + + vpmovmskb %ymm10, %eax + vpmovmskb %ymm6, %edx + salq $32, %rax + orq %rdx, %rax + blsmskq %rcx, %rcx + andq %rcx, %rax + jz L(return_old_match) + bsrq %rax, %rax + /* Search char cannot be zero so safe to just use lea for + wcsrchr. */ + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax VZEROUPPER_RETURN -END (STRRCHR) + .p2align 4,, 4 +L(cross_page): + movq %rdi, %rsi + andq $-VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + shrxl %edi, %ecx, %ecx + testl %ecx, %ecx + jz L(page_cross_continue) + VPCMPEQ %ymm1, %ymm7, %ymm1 + vpmovmskb %ymm1, %eax + shrxl %edi, %eax, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret2) + bsrl %eax, %eax + addq %rdi, %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret2): + VZEROUPPER_RETURN +END(STRRCHR) #endif -- 2.25.1
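For readers who don't want to trace the AVX2 code above, here is a rough
scalar C sketch of the strategy the commit message describes: scan forward in
fixed-size blocks, remember only the most recent block that contained the
search CHAR, and resolve the exact position of the last match once at the end
(the role played by L(second_aligned_loop_set_furthest_match) and
L(return_old_match) in the patch).  BLOCK_SIZE, strrchr_sketch, and the byte
loops are illustrative stand-ins, not glibc code, and the sketch ignores
alignment, page crossing, and the wcsrchr variant.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 32	/* plays the role of VEC_SIZE (one YMM register) */

static char *
strrchr_sketch (const char *s, int c)
{
  const char *last_block = NULL;	/* most recent block containing CHAR */
  size_t last_len = 0;			/* number of valid bytes in that block */
  const char ch = (char) c;

  for (;;)
    {
      /* Find how far this block is valid: BLOCK_SIZE bytes, or up to and
	 including the null terminator.  The real code gets this from a
	 VPCMPEQ-with-zero bitmask instead of a byte loop.  */
      size_t len = 0;
      int hit_nul = 0;
      while (len < BLOCK_SIZE)
	{
	  len++;
	  if (s[len - 1] == '\0')
	    {
	      hit_nul = 1;
	      break;
	    }
	}

      /* Cheap per-block test for CHAR; only the *last* block that tests
	 positive is remembered, so the backward search below runs once.  */
      if (memchr (s, ch, len) != NULL)
	{
	  last_block = s;
	  last_len = len;
	}

      if (hit_nul)
	break;
      s += BLOCK_SIZE;
    }

  if (last_block == NULL)
    return NULL;

  /* Resolve the exact last match only now (the bsr step in the asm).  */
  for (size_t i = last_len; i-- > 0; )
    if (last_block[i] == ch)
      return (char *) (last_block + i);
  return NULL;	/* not reached: the block was known to contain CH */
}

int
main (void)
{
  const char *s = "abcXdefXghi";
  printf ("%s\n", strrchr_sketch (s, 'X'));		/* "Xghi" */
  printf ("%d\n", strrchr_sketch (s, 'Q') == NULL);	/* 1 */
  return 0;
}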
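The `sall $20` / `cmpl` pair at the entry is a smaller-code way of asking
whether a full VEC_SIZE load from the unaligned pointer would cross a 4 KiB
page: shifting the low 32 bits of the pointer left by 20 moves the 12-bit
page offset into the top bits, so one unsigned compare answers the question.
A quick check of that equivalence (the function names here are mine, not from
the patch):

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* The obvious form: the load stays in one page iff the page offset is
   no greater than PAGE_SIZE - VEC_SIZE.  */
static int
cross_page_and (uintptr_t p)
{
  return (p & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* What the `sall $20; cmpl $((PAGE_SIZE - VEC_SIZE) << 20); ja` sequence
   computes on the low 32 bits of the pointer.  */
static int
cross_page_shift (uintptr_t p)
{
  uint32_t x = (uint32_t) p << 20;
  return x > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
}

int
main (void)
{
  for (uintptr_t p = 0; p < 2 * PAGE_SIZE; p++)
    assert (cross_page_and (p) == cross_page_shift (p));
  return 0;
}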
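Several return paths in the patch use `blsmsk` to keep only the CHAR matches
that occur at or before the first null terminator in the vector.  The
identity is blsmsk(x) = x ^ (x - 1): every bit up to and including the lowest
set bit.  A small illustration with made-up masks:

#include <stdint.h>
#include <stdio.h>

/* blsmskl %ecx, %ecx computes ecx ^ (ecx - 1): all bits up to and
   including the lowest set bit (here, the first null terminator).  */
static uint32_t
blsmsk32 (uint32_t x)
{
  return x ^ (x - 1);
}

int
main (void)
{
  uint32_t zero_mask = 1u << 9;	/* pretend the nul byte is at offset 9 */
  uint32_t char_mask = (1u << 3) | (1u << 7) | (1u << 20);	/* CHAR at 3, 7, 20 */
  uint32_t in_bounds = char_mask & blsmsk32 (zero_mask);

  printf ("%#x\n", (unsigned) in_bounds);	/* 0x88: offsets 3 and 7 survive */
  /* bsrl picks the highest surviving bit, i.e. the last in-bounds match;
     __builtin_clz (a GCC/Clang builtin) stands in for it here.  */
  printf ("%d\n", 31 - __builtin_clz (in_bounds));	/* 7 */
  return 0;
}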