public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] aarch64: MTE compatible strchrnul
@ 2020-06-03  9:43 Andrea Corallo
  2020-06-03 11:27 ` Florian Weimer
  2020-06-03 14:40 ` Adhemerval Zanella
  0 siblings, 2 replies; 19+ messages in thread
From: Andrea Corallo @ 2020-06-03  9:43 UTC (permalink / raw)
  To: libc-alpha; +Cc: Szabolcs.Nagy, Wilco.Dijkstra, nd

[-- Attachment #1: Type: text/plain, Size: 14621 bytes --]

Hi all,

I'd like to submit this patch introducing an Arm MTE compatible
strchrnul implementation.

Follows a performance comparison of the strchrnul benchmark run on 
Cortex-A72, Cortex-A53, Neoverse N1.

| length | alignment | perf-uplift A72 | perf-uplift A53 | perf-uplift N1 |
|--------+-----------+-----------------+-----------------+----------------|
|     32 |         0 |           1.16x |           1.07x |          1.35x |
|     32 |         1 |           1.25x |           1.16x |          1.15x |
|     64 |         0 |           1.26x |           0.97x |          1.20x |
|     64 |         2 |           1.35x |           1.04x |          1.30x |
|    128 |         0 |           1.12x |           0.84x |          1.22x |
|    128 |         3 |           1.25x |           0.87x |          1.30x |
|    256 |         0 |           1.14x |           0.84x |          1.16x |
|    256 |         4 |           1.24x |           0.81x |          1.16x |
|    512 |         0 |           1.15x |           0.80x |          1.13x |
|    512 |         5 |           1.17x |           0.81x |          1.14x |
|   1024 |         0 |           1.14x |           0.78x |          1.08x |
|   1024 |         6 |           1.03x |           0.78x |          1.10x |
|   2048 |         0 |           1.12x |           0.76x |          1.08x |
|   2048 |         7 |           1.14x |           0.77x |          1.09x |
|     64 |         1 |           1.35x |           1.04x |          1.37x |
|     64 |         1 |           1.36x |           1.04x |          1.37x |
|     64 |         2 |           1.36x |           1.04x |          1.37x |
|     64 |         2 |           1.37x |           1.04x |          1.38x |
|     64 |         3 |           1.38x |           1.04x |          1.36x |
|     64 |         3 |           1.40x |           1.04x |          1.36x |
|     64 |         4 |           1.41x |           1.04x |          1.36x |
|     64 |         4 |           1.36x |           1.04x |          1.36x |
|     64 |         5 |           1.34x |           1.04x |          1.40x |
|     64 |         5 |           1.35x |           1.04x |          1.36x |
|     64 |         6 |           1.34x |           1.04x |          1.37x |
|     64 |         6 |           1.41x |           1.04x |          1.37x |
|     64 |         7 |           1.39x |           1.04x |          1.36x |
|     64 |         7 |           1.34x |           1.04x |          1.37x |
|      0 |         0 |           1.18x |           1.63x |          1.66x |
|      0 |         0 |           1.18x |           1.63x |          1.66x |
|      1 |         0 |           1.18x |           1.63x |          1.66x |
|      1 |         0 |           1.18x |           1.63x |          1.67x |
|      2 |         0 |           1.18x |           1.63x |          1.66x |
|      2 |         0 |           1.18x |           1.63x |          1.65x |
|      3 |         0 |           1.18x |           1.63x |          1.66x |
|      3 |         0 |           1.18x |           1.63x |          1.66x |
|      4 |         0 |           1.18x |           1.63x |          1.65x |
|      4 |         0 |           1.18x |           1.63x |          1.66x |
|      5 |         0 |           1.18x |           1.63x |          1.66x |
|      5 |         0 |           1.18x |           1.63x |          1.66x |
|      6 |         0 |           1.18x |           1.63x |          1.66x |
|      6 |         0 |           1.18x |           1.63x |          1.66x |
|      7 |         0 |           1.18x |           1.63x |          1.66x |
|      7 |         0 |           1.18x |           1.63x |          1.64x |
|      8 |         0 |           1.18x |           1.63x |          1.66x |
|      8 |         0 |           1.18x |           1.63x |          1.66x |
|      9 |         0 |           1.18x |           1.63x |          1.65x |
|      9 |         0 |           1.18x |           1.63x |          1.66x |
|     10 |         0 |           1.18x |           1.63x |          1.66x |
|     10 |         0 |           1.18x |           1.63x |          1.66x |
|     11 |         0 |           1.18x |           1.63x |          1.64x |
|     11 |         0 |           1.18x |           1.63x |          1.63x |
|     12 |         0 |           1.18x |           1.63x |          1.63x |
|     12 |         0 |           1.18x |           1.63x |          1.66x |
|     13 |         0 |           1.18x |           1.63x |          1.63x |
|     13 |         0 |           1.18x |           1.63x |          1.63x |
|     14 |         0 |           1.18x |           1.63x |          1.63x |
|     14 |         0 |           1.18x |           1.63x |          1.22x |
|     15 |         0 |           1.19x |           1.63x |          1.22x |
|     15 |         0 |           1.18x |           1.63x |          1.63x |
|     16 |         0 |           1.03x |           0.96x |          1.15x |
|     16 |         0 |           1.03x |           0.96x |          1.13x |
|     17 |         0 |           1.03x |           0.96x |          0.98x |
|     17 |         0 |           1.03x |           0.96x |          0.98x |
|     18 |         0 |           1.03x |           0.96x |          0.98x |
|     18 |         0 |           1.03x |           0.96x |          0.98x |
|     19 |         0 |           1.04x |           0.96x |          0.98x |
|     19 |         0 |           1.04x |           0.96x |          0.98x |
|     20 |         0 |           1.04x |           0.96x |          1.00x |
|     20 |         0 |           1.03x |           0.96x |          0.99x |
|     21 |         0 |           1.04x |           0.96x |          0.99x |
|     21 |         0 |           1.03x |           0.96x |          1.14x |
|     22 |         0 |           1.04x |           0.96x |          1.14x |
|     22 |         0 |           1.03x |           0.96x |          1.14x |
|     23 |         0 |           1.03x |           0.96x |          1.13x |
|     23 |         0 |           1.03x |           0.96x |          1.15x |
|     24 |         0 |           1.04x |           0.96x |          1.13x |
|     24 |         0 |           1.04x |           0.95x |          1.13x |
|     25 |         0 |           1.03x |           0.96x |          1.15x |
|     25 |         0 |           1.04x |           0.96x |          1.12x |
|     26 |         0 |           1.04x |           0.96x |          1.13x |
|     26 |         0 |           1.02x |           0.96x |          1.13x |
|     27 |         0 |           1.04x |           0.96x |          1.13x |
|     27 |         0 |           1.03x |           0.96x |          1.13x |
|     28 |         0 |           1.03x |           0.96x |          0.98x |
|     28 |         0 |           1.04x |           0.96x |          1.05x |
|     29 |         0 |           1.02x |           0.96x |          1.00x |
|     29 |         0 |           1.03x |           0.96x |          1.00x |
|     30 |         0 |           1.04x |           0.96x |          1.00x |
|     30 |         0 |           1.04x |           0.96x |          1.00x |
|     31 |         0 |           1.04x |           0.96x |          0.99x |
|     31 |         0 |           1.03x |           0.96x |          0.99x |
|     32 |         0 |           1.09x |           1.07x |          1.09x |
|     32 |         1 |           1.25x |           1.15x |          1.38x |
|     64 |         0 |           1.27x |           0.98x |          1.20x |
|     64 |         2 |           1.41x |           1.04x |          1.30x |
|    128 |         0 |           1.15x |           0.84x |          1.22x |
|    128 |         3 |           1.23x |           0.87x |          1.30x |
|    256 |         0 |           1.16x |           0.84x |          1.16x |
|    256 |         4 |           1.23x |           0.81x |          1.17x |
|    512 |         0 |           1.14x |           0.80x |          1.12x |
|    512 |         5 |           1.18x |           0.81x |          1.14x |
|   1024 |         0 |           1.16x |           0.78x |          1.09x |
|   1024 |         6 |           1.03x |           0.78x |          1.11x |
|   2048 |         0 |           1.14x |           0.76x |          1.08x |
|   2048 |         7 |           1.14x |           0.77x |          1.09x |
|     64 |         1 |           1.40x |           1.04x |          1.37x |
|     64 |         1 |           1.40x |           1.04x |          1.37x |
|     64 |         2 |           1.35x |           1.04x |          1.37x |
|     64 |         2 |           1.38x |           1.04x |          1.37x |
|     64 |         3 |           1.36x |           1.04x |          1.37x |
|     64 |         3 |           1.34x |           1.04x |          1.37x |
|     64 |         4 |           1.41x |           1.04x |          1.37x |
|     64 |         4 |           1.38x |           1.04x |          1.37x |
|     64 |         5 |           1.36x |           1.04x |          1.37x |
|     64 |         5 |           1.36x |           1.04x |          1.37x |
|     64 |         6 |           1.35x |           1.04x |          1.37x |
|     64 |         6 |           1.40x |           1.04x |          1.37x |
|     64 |         7 |           1.35x |           1.04x |          1.37x |
|     64 |         7 |           1.40x |           1.04x |          1.37x |
|      0 |         0 |           1.19x |           1.63x |          1.66x |
|      0 |         0 |           1.19x |           1.63x |          1.66x |
|      1 |         0 |           1.19x |           1.63x |          1.66x |
|      1 |         0 |           1.19x |           1.63x |          1.66x |
|      2 |         0 |           1.18x |           1.63x |          1.63x |
|      2 |         0 |           1.18x |           1.63x |          1.66x |
|      3 |         0 |           1.18x |           1.63x |          1.66x |
|      3 |         0 |           1.20x |           1.63x |          1.63x |
|      4 |         0 |           1.18x |           1.63x |          1.63x |
|      4 |         0 |           1.18x |           1.63x |          1.66x |
|      5 |         0 |           1.18x |           1.63x |          1.66x |
|      5 |         0 |           1.18x |           1.63x |          1.66x |
|      6 |         0 |           1.18x |           1.63x |          1.66x |
|      6 |         0 |           1.18x |           1.63x |          1.66x |
|      7 |         0 |           1.18x |           1.63x |          1.66x |
|      7 |         0 |           1.18x |           1.63x |          1.66x |
|      8 |         0 |           1.18x |           1.63x |          1.25x |
|      8 |         0 |           1.18x |           1.63x |          1.66x |
|      9 |         0 |           1.18x |           1.63x |          1.66x |
|      9 |         0 |           1.18x |           1.63x |          1.66x |
|     10 |         0 |           1.18x |           1.63x |          1.66x |
|     10 |         0 |           1.18x |           1.63x |          1.66x |
|     11 |         0 |           1.18x |           1.63x |          1.66x |
|     11 |         0 |           1.18x |           1.63x |          1.66x |
|     12 |         0 |           1.18x |           1.63x |          1.66x |
|     12 |         0 |           1.19x |           1.63x |          1.66x |
|     13 |         0 |           1.18x |           1.63x |          1.66x |
|     13 |         0 |           1.18x |           1.63x |          1.66x |
|     14 |         0 |           1.19x |           1.63x |          1.66x |
|     14 |         0 |           1.19x |           1.63x |          1.66x |
|     15 |         0 |           1.18x |           1.63x |          1.66x |
|     15 |         0 |           1.18x |           1.63x |          1.66x |
|     16 |         0 |           1.03x |           0.96x |          1.00x |
|     16 |         0 |           1.03x |           0.96x |          1.00x |
|     17 |         0 |           1.03x |           0.96x |          1.00x |
|     17 |         0 |           1.03x |           0.96x |          1.15x |
|     18 |         0 |           1.03x |           0.96x |          1.14x |
|     18 |         0 |           1.04x |           0.96x |          1.15x |
|     19 |         0 |           1.04x |           0.96x |          1.15x |
|     19 |         0 |           1.04x |           0.96x |          1.15x |
|     20 |         0 |           1.04x |           0.96x |          1.15x |
|     20 |         0 |           1.03x |           0.96x |          1.15x |
|     21 |         0 |           1.04x |           0.96x |          1.15x |
|     21 |         0 |           1.03x |           0.96x |          1.15x |
|     22 |         0 |           1.02x |           0.96x |          1.15x |
|     22 |         0 |           1.03x |           0.96x |          1.15x |
|     23 |         0 |           1.03x |           0.96x |          1.15x |
|     23 |         0 |           1.03x |           0.96x |          1.15x |
|     24 |         0 |           1.03x |           0.96x |          1.00x |
|     24 |         0 |           1.02x |           0.96x |          1.00x |
|     25 |         0 |           1.04x |           0.96x |          1.00x |
|     25 |         0 |           1.03x |           0.96x |          1.16x |
|     26 |         0 |           1.04x |           0.96x |          1.15x |
|     26 |         0 |           1.03x |           0.96x |          1.15x |
|     27 |         0 |           1.04x |           0.96x |          1.00x |
|     27 |         0 |           1.03x |           0.96x |          1.00x |
|     28 |         0 |           1.04x |           0.96x |          1.00x |
|     28 |         0 |           1.04x |           0.96x |          1.00x |
|     29 |         0 |           1.03x |           0.96x |          1.00x |
|     29 |         0 |           1.04x |           0.96x |          1.15x |
|     30 |         0 |           1.04x |           0.96x |          1.15x |
|     30 |         0 |           1.03x |           0.95x |          1.00x |
|     31 |         0 |           1.03x |           0.96x |          1.00x |
|     31 |         0 |           1.04x |           0.96x |          1.00x |

This patch is passing GLIBC tests.

Regards

  Andrea

8< --- 8< --- 8<
Introduce an Arm MTE compatible strchrnul implementation.

Benchmarked on Cortex-A72, Cortex-A53, Neoverse N1 does not show
performance regressions.

Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: strchrnul-mte.patch --]
[-- Type: text/x-diff, Size: 5186 bytes --]

diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index a65be6cba8..1ae4598f82 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -22,109 +22,75 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
-/* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
-
 #define result		x0
 
-/* Locals and temporaries.  */
-
 #define src		x2
-#define tmp1		x3
-#define wtmp2		w4
-#define tmp3		x5
+#define tmp1		x1
+#define tmp2		x3
+#define tmp2w		w3
 
 #define vrepchr		v0
-#define vdata1		v1
-#define vdata2		v2
-#define vhas_nul1	v3
-#define vhas_nul2	v4
-#define vhas_chr1	v5
-#define vhas_chr2	v6
-#define vrepmask	v7
-#define vend1		v16
-
-/* Core algorithm.
-
-   For each 32-byte hunk we calculate a 64-bit syndrome value, with
-   two bits per byte (LSB is always in bits 0 and 1, for both big
-   and little-endian systems).  For each tuple, bit 0 is set iff
-   the relevant byte matched the requested character or nul.  Since the
-   bits in the syndrome reflect exactly the order in which things occur
-   in the original string a count_trailing_zeros() operation will
-   identify exactly which byte is causing the termination.  */
+#define vdata		v1
+#define qdata		q1
+#define vhas_nul	v2
+#define vhas_chr	v3
+#define vrepmask	v4
+#define vend		v5
+#define dend		d5
+
+/* Core algorithm:
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+   set likewise for odd bytes so that adjacent bytes can be merged. Since the
+   bits in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strchrnul)
 	DELOUSE (0)
-	/* Magic constant 0x40100401 to allow us to identify which lane
-	   matches the termination condition.  */
-	mov	wtmp2, #0x0401
-	movk	wtmp2, #0x4010, lsl #16
+	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
-	dup	vrepmask.4s, wtmp2
-	ands	tmp1, srcin, #31
-	b.eq	L(loop)
-
-	/* Input string is not 32-byte aligned.  Rather than forcing
-	   the padding bytes to a safe value, we calculate the syndrome
-	   for all the bytes, but then mask off those bits of the
-	   syndrome that are related to the padding.  */
-	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	neg	tmp1, tmp1
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
-	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
-	lsl	tmp1, tmp1, #1
-	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
-	mov	tmp3, #~0
-	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
-	lsr	tmp1, tmp3, tmp1
-
-	mov	tmp3, vend1.2d[0]
-	bic	tmp1, tmp3, tmp1	// Mask padding bits.
-	cbnz	tmp1, L(tail)
+	ld1	{vdata.16b}, [src]
+	mov	tmp2w, 0xf00f
+	dup	vrepmask.8h, tmp2w
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	lsl	tmp2, srcin, 2
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	tmp1, dend
+	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
+	cbz	tmp1, L(loop)
 
+	rbit	tmp1, tmp1
+	clz	tmp1, tmp1
+	add	result, srcin, tmp1, lsr 2
+	ret
+
+	.p2align 4
 L(loop):
-	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	/* Use a fast check for the termination condition.  */
-	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
-	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
-	addp	vend1.2d, vend1.2d, vend1.2d
-	mov	tmp1, vend1.2d[0]
+	ldr	qdata, [src, 16]!
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
+	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	/* Termination condition found.  Now need to establish exactly why
-	   we terminated.  */
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
-	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
-	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
-
-	mov	tmp1, vend1.2d[0]
-L(tail):
-	/* Count the trailing zeros, by bit reversing...  */
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	tmp1, dend
+#ifndef __AARCH64EB__
 	rbit	tmp1, tmp1
-	/* Re-bias source.  */
-	sub	src, src, #32
-	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
-	/* tmp1 is twice the offset into the fragment.  */
-	add	result, src, tmp1, lsr #1
+#endif
+	clz	tmp1, tmp1
+	add	result, src, tmp1, lsr 2
 	ret
 
 END(__strchrnul)

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2020-06-05 16:26 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-03  9:43 [PATCH] aarch64: MTE compatible strchrnul Andrea Corallo
2020-06-03 11:27 ` Florian Weimer
2020-06-03 14:14   ` Andrea Corallo
2020-06-03 14:24     ` Florian Weimer
2020-06-03 14:33       ` Adhemerval Zanella
2020-06-03 14:53         ` Szabolcs Nagy
2020-06-03 15:03           ` Szabolcs Nagy
2020-06-03 15:04           ` Adhemerval Zanella
2020-06-03 15:04         ` Florian Weimer
2020-06-03 15:18           ` Szabolcs Nagy
2020-06-03 15:24             ` Florian Weimer
2020-06-03 14:40 ` Adhemerval Zanella
2020-06-03 16:01   ` Andrea Corallo
2020-06-04 12:04     ` H.J. Lu
2020-06-04 19:48       ` Szabolcs Nagy
2020-06-04 20:03         ` H.J. Lu
2020-06-05  7:38           ` Szabolcs Nagy
2020-06-05  8:45             ` Wilco Dijkstra
2020-06-05 16:25               ` Szabolcs Nagy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).