* [Patch, mips] Faster strcmp for mips
@ 2013-11-14 21:25 Steve Ellcey
2013-11-14 23:14 ` Ondřej Bílka
` (2 more replies)
0 siblings, 3 replies; 11+ messages in thread
From: Steve Ellcey @ 2013-11-14 21:25 UTC (permalink / raw)
To: libc-ports
[-- Attachment #1: Type: text/plain, Size: 884 bytes --]
I have created an optimized strcmp routine for MIPS and would like to check
it in. I verified it by running 'make check' and I also ran 'make bench'
to check the performance. This function is slightly slower than the current
generic strcmp for strings that are not aligned, but much faster when strings
are aligned because it will then load and compare either 2, 4, or 8 bytes at a
time.
This means it could be loading bytes beyond the end of the strings being
compared but it looks like other architecture specific strcmp functions
are also doing this optimization and the newlib version of strcmp also does
this.
OK to checkin?
I have attached the original and new bench-strcmp.out files in case anyone
wants to compare the old and new strcmp performance results.
Steve Ellcey
sellcey@mips.com
2013-11-14 Steve Ellcey <sellcey@mips.com>
* sysdeps/mips/strcmp.c: New.
[-- Attachment #2: strcmp.patch --]
[-- Type: text/x-patch, Size: 5369 bytes --]
diff --git a/ports/sysdeps/mips/strcmp.c b/ports/sysdeps/mips/strcmp.c
new file mode 100644
index 0000000..8ce75cd
--- /dev/null
+++ b/ports/sysdeps/mips/strcmp.c
@@ -0,0 +1,192 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+
+#ifdef _LIBC
+# include <string.h>
+# include <memcopy.h>
+# undef strcmp
+#endif
+
+#ifdef __mips64
+# define ENABLE64
+#endif
+
+/* The UNALIGNED* macros return 0 if both addresses are aligned on
+   a 2, 4, or 8 byte boundary.  The DETECTNULL* macros return nonzero
+   if the 2, 4, or 8 byte value has a null byte in it.  If ENABLE64
+   is not set, we set UNALIGNED8 to '1' so that it always returns
+   TRUE and we never try to use 8 byte operations.  */
+
+#ifdef ENABLE64
+# define UNALIGNED8(X, Y) (((uintptr_t) X & (sizeof (uint64_t) - 1)) \
+                           | ((uintptr_t) Y & (sizeof (uint64_t) - 1)))
+# define DETECTNULL8(X) (((X) - 0x0101010101010101) \
+                         & ~((X) | 0x7f7f7f7f7f7f7f7f))
+#else
+# define UNALIGNED8(X, Y) (1)
+# define DETECTNULL8(X) (0)
+#endif
+
+
+typedef struct bits8
+{
+  unsigned int B0:8, B1:8, B2:8, B3:8, B4:8, B5:8, B6:8, B7:8;
+} bits8_t;
+typedef union
+{
+  uint64_t v;
+  bits8_t b;
+} bitfields8_t;
+
+
+#define UNALIGNED4(X, Y) (((uintptr_t) X & (sizeof (uint32_t) - 1)) \
+                          | ((uintptr_t) Y & (sizeof (uint32_t) - 1)))
+#define DETECTNULL4(X) (((X) - 0x01010101) & ~((X) | 0x7f7f7f7f))
+
+typedef struct bits4
+{
+  unsigned int B0:8, B1:8, B2:8, B3:8;
+} bits4_t;
+typedef union
+{
+  uint32_t v;
+  bits4_t b;
+} bitfields4_t;
+
+
+#define UNALIGNED2(X, Y) (((uintptr_t) X & (sizeof (uint16_t) - 1)) \
+                          | ((uintptr_t) Y & (sizeof (uint16_t) - 1)))
+#define DETECTNULL2(X) (((X) - 0x0101) & ~((X) | 0x7f7f))
+typedef struct bits2
+{
+  unsigned int B0:8, B1:8;
+} bits2_t;
+typedef union
+{
+  uint16_t v;
+  bits2_t b;
+} bitfields2_t;
+
+
+#ifndef STRCMP_NAME
+# define STRCMP_NAME strcmp
+#endif
+
+int
+STRCMP_NAME (const char *s1, const char *s2)
+{
+  if (__builtin_expect (UNALIGNED2 (s1, s2), 1))
+    {
+      /* s1 and s2 are unaligned, compare them one byte at a time.  */
+      while (*s1 != '\0' && *s1 == *s2)
+        {
+          s1++;
+          s2++;
+        }
+      return (*(unsigned char *) s1) - (*(unsigned char *) s2);
+    }
+
+  if (__builtin_expect (UNALIGNED4 (s1, s2), 1))
+    {
+      /* s1 and s2 are 2 byte aligned, compare them 2 bytes at a time.  */
+      uint16_t *a1 = (uint16_t *) s1;
+      uint16_t *a2 = (uint16_t *) s2;
+      uint16_t x, y;
+      bitfields2_t bx, by;
+      x = *(a1++);
+      y = *(a2++);
+      while (1)
+        {
+          if (x != y)
+            break;
+          if (DETECTNULL2 (x))
+            return 0;
+          x = *(a1++);
+          y = *(a2++);
+        }
+      bx.v = x; by.v = y;
+      if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
+        return bx.b.B0 - by.b.B0;
+      return bx.b.B1 - by.b.B1;
+    }
+
+  if (__builtin_expect (UNALIGNED8 (s1, s2), 1))
+    {
+      /* s1 and s2 are 4 byte aligned, compare them 4 bytes at a time.  */
+      uint32_t *a1 = (uint32_t *) s1;
+      uint32_t *a2 = (uint32_t *) s2;
+      uint32_t x, y;
+      bitfields4_t bx, by;
+      x = *(a1++);
+      y = *(a2++);
+      while (1)
+        {
+          if (x != y)
+            break;
+          if (DETECTNULL4 (x))
+            return 0;
+          x = *(a1++);
+          y = *(a2++);
+        }
+      bx.v = x; by.v = y;
+      if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
+        return bx.b.B0 - by.b.B0;
+      if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
+        return bx.b.B1 - by.b.B1;
+      if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
+        return bx.b.B2 - by.b.B2;
+      return bx.b.B3 - by.b.B3;
+    }
+
+  /* s1 and s2 are 8 byte aligned, compare them 8 bytes at a time.  */
+  uint64_t *a1 = (uint64_t *) s1;
+  uint64_t *a2 = (uint64_t *) s2;
+  uint64_t x, y;
+  bitfields8_t bx, by;
+  x = *(a1++);
+  y = *(a2++);
+  while (1)
+    {
+      if (x != y)
+        break;
+      if (DETECTNULL8 (x))
+        return 0;
+      x = *(a1++);
+      y = *(a2++);
+    }
+  bx.v = x; by.v = y;
+  if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
+    return bx.b.B0 - by.b.B0;
+  if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
+    return bx.b.B1 - by.b.B1;
+  if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
+    return bx.b.B2 - by.b.B2;
+  if (bx.b.B3 - by.b.B3 != 0 || bx.b.B3 == 0)
+    return bx.b.B3 - by.b.B3;
+  if (bx.b.B4 - by.b.B4 != 0 || bx.b.B4 == 0)
+    return bx.b.B4 - by.b.B4;
+  if (bx.b.B5 - by.b.B5 != 0 || bx.b.B5 == 0)
+    return bx.b.B5 - by.b.B5;
+  if (bx.b.B6 - by.b.B6 != 0 || bx.b.B6 == 0)
+    return bx.b.B6 - by.b.B6;
+  return bx.b.B7 - by.b.B7;
+}
+#ifdef _LIBC
+libc_hidden_builtin_def (STRCMP_NAME)
+#endif
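A side note on the zero-byte test used above: subtracting 0x01 from every
byte of X can only carry a borrow out of a byte that is zero, and masking
with ~(X | 0x7f..7f) (equivalently ~X & 0x80..80) keeps only the high bits
of bytes whose top bit was clear, so DETECTNULL* is nonzero precisely when
X contains a null byte. A minimal standalone check of the 4-byte variant,
with hypothetical test values:

#include <assert.h>
#include <stdint.h>

#define DETECTNULL4(X) (((X) - 0x01010101) & ~((X) | 0x7f7f7f7f))

int
main (void)
{
  /* "abcd" packed into a little-endian word: no zero byte, mask is 0.  */
  assert (DETECTNULL4 ((uint32_t) 0x64636261) == 0);
  /* The same word with its third byte zeroed: bit 23 of the mask fires.  */
  assert (DETECTNULL4 ((uint32_t) 0x64006261) != 0);
  return 0;
}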
[-- Attachment #3: bench-strcmp.out.orig --]
[-- Type: text/plain, Size: 11146 bytes --]
strcmp simple_strcmp stupid_strcmp
Length 1, alignment 1/ 1: 46.1875 41.375 124.203
Length 1, alignment 1/ 1: 22.375 26.25 68.5469
Length 1, alignment 1/ 1: 22.3281 24.125 72.8906
Length 2, alignment 2/ 2: 44.7656 45.625 108.781
Length 2, alignment 2/ 2: 32.0781 39.8906 107.375
Length 2, alignment 2/ 2: 31.7656 40.25 108.328
Length 3, alignment 3/ 3: 43.625 45.5156 107.094
Length 3, alignment 3/ 3: 50.2656 44.8125 105.703
Length 3, alignment 3/ 3: 50.8125 45.2656 106.625
Length 4, alignment 4/ 4: 49.625 49.1094 113.844
Length 4, alignment 4/ 4: 45.8438 44.1406 106.828
Length 4, alignment 4/ 4: 45.4688 43.7812 106.562
Length 5, alignment 5/ 5: 53.4844 54.4219 150.828
Length 5, alignment 5/ 5: 50.1875 48.1562 144.672
Length 5, alignment 5/ 5: 51.2344 48.3906 145.672
Length 6, alignment 6/ 6: 57.8281 57.8125 155.625
Length 6, alignment 6/ 6: 54.875 53.3125 149.531
Length 6, alignment 6/ 6: 54.9688 53.5156 148.156
Length 7, alignment 7/ 7: 63.2188 62.3438 147.422
Length 7, alignment 7/ 7: 60.2031 57.8594 142.656
Length 7, alignment 7/ 7: 60.1875 57.1562 140.984
Length 8, alignment 8/ 8: 67.2031 68.2188 155.672
Length 8, alignment 8/ 8: 66.7031 61.875 149.297
Length 8, alignment 8/ 8: 64.2344 62.8438 148.812
Length 9, alignment 9/ 9: 71.5 72.4375 191.797
Length 9, alignment 9/ 9: 68.8906 66.875 185.062
Length 9, alignment 9/ 9: 68.8906 66.4844 185.688
Length 10, alignment 10/10: 76.7812 77.375 195.672
Length 10, alignment 10/10: 74.3125 71.4688 189.203
Length 10, alignment 10/10: 73.2656 71.1406 189.547
Length 11, alignment 11/11: 80.875 81.6094 186.141
Length 11, alignment 11/11: 78.1875 76.0625 179.719
Length 11, alignment 11/11: 79.0625 75.5 181.125
Length 12, alignment 12/12: 85.5625 86.4062 175.469
Length 12, alignment 12/12: 83.2344 81.5469 169.344
Length 12, alignment 12/12: 83.9531 81.5312 169.375
Length 13, alignment 13/13: 90.5625 91.125 230.656
Length 13, alignment 13/13: 87.25 85.4062 224.688
Length 13, alignment 13/13: 87.4062 84.7656 224.75
Length 14, alignment 14/14: 94.8125 95.9219 212.281
Length 14, alignment 14/14: 92.2188 89.7812 206.172
Length 14, alignment 14/14: 92.25 90.625 206.516
Length 15, alignment 15/15: 98.8594 101.234 206.844
Length 15, alignment 15/15: 97.0312 94.8594 200.531
Length 15, alignment 15/15: 96.9844 94.8438 200.234
Length 16, alignment 16/16: 104.141 105.828 205.406
Length 16, alignment 16/16: 101.625 99.5781 199.531
Length 16, alignment 16/16: 102.938 99.8125 199.219
Length 17, alignment 17/17: 108.859 109.609 251.172
Length 17, alignment 17/17: 106.969 104.312 245.031
Length 17, alignment 17/17: 106.891 103.922 245.062
Length 18, alignment 18/18: 113.875 114.609 244.781
Length 18, alignment 18/18: 110.875 108.938 237.078
Length 18, alignment 18/18: 111.516 108.578 236.797
Length 19, alignment 19/19: 120.188 118.609 237.406
Length 19, alignment 19/19: 116.234 113.609 231.641
Length 19, alignment 19/19: 116.578 114.188 231.25
Length 20, alignment 20/20: 123.484 124.172 236.781
Length 20, alignment 20/20: 121.234 118.172 228.953
Length 20, alignment 20/20: 121.25 117.922 230.641
Length 21, alignment 21/21: 128.172 128.656 281.453
Length 21, alignment 21/21: 124.875 123.266 275.578
Length 21, alignment 21/21: 125.297 123.312 275.266
Length 22, alignment 22/22: 132.922 132.594 274.297
Length 22, alignment 22/22: 130.219 128.5 268.375
Length 22, alignment 22/22: 129.672 127.828 268.125
Length 23, alignment 23/23: 136.891 138.516 268.391
Length 23, alignment 23/23: 134.938 132.578 262.266
Length 23, alignment 23/23: 134.312 133.5 262.266
Length 24, alignment 24/24: 142.203 142.219 267.766
Length 24, alignment 24/24: 138.875 137.547 260.953
Length 24, alignment 24/24: 139.328 137.578 260.281
Length 25, alignment 25/25: 146.906 146.906 312.406
Length 25, alignment 25/25: 144.188 141.203 306.281
Length 25, alignment 25/25: 143.594 141.203 305.875
Length 26, alignment 26/26: 151.078 152.125 305.281
Length 26, alignment 26/26: 149.266 146.828 298.469
Length 26, alignment 26/26: 148.984 146.281 297.766
Length 27, alignment 27/27: 155.875 156.453 298.672
Length 27, alignment 27/27: 153.562 151.219 292.203
Length 27, alignment 27/27: 152.844 151.797 292.203
Length 28, alignment 28/28: 159.844 161.641 298.5
Length 28, alignment 28/28: 158.484 156.453 292.031
Length 28, alignment 28/28: 158.219 155.969 291.656
Length 29, alignment 29/29: 164.875 165.859 343.781
Length 29, alignment 29/29: 162.906 160.312 336.922
Length 29, alignment 29/29: 163.594 161.281 337.625
Length 30, alignment 30/30: 169.875 170.297 335.953
Length 30, alignment 30/30: 167.547 165.969 329.422
Length 30, alignment 30/30: 166.859 164.812 329.547
Length 31, alignment 31/31: 174.516 174.594 329.062
Length 31, alignment 31/31: 172.266 169.141 323.25
Length 31, alignment 31/31: 171.781 170.156 323.203
Length 4, alignment 0/ 0: 48.25 49.2031 113.25
Length 4, alignment 0/ 0: 48.8281 49.0781 111.969
Length 4, alignment 0/ 0: 46.3125 43.5469 106.125
Length 4, alignment 0/ 0: 45.5781 43.5 106.844
Length 4, alignment 0/ 0: 46.1875 43.4375 107.219
Length 4, alignment 0/ 0: 45.625 44.25 106.062
Length 4, alignment 0/ 1: 45.8594 44.9219 134
Length 4, alignment 1/ 2: 45.8281 43.4688 136.375
Length 8, alignment 0/ 0: 66.9375 67.4531 155.312
Length 8, alignment 0/ 0: 67.2812 67.0781 154.953
Length 8, alignment 0/ 0: 63.9375 62.1875 149.5
Length 8, alignment 0/ 0: 64.875 62.9062 148.078
Length 8, alignment 0/ 0: 64.9219 62.0469 148.578
Length 8, alignment 0/ 0: 64.6094 63.1406 149.25
Length 8, alignment 0/ 2: 65.1875 62.9219 168.969
Length 8, alignment 2/ 3: 64.8594 62.4062 163.953
Length 16, alignment 0/ 0: 105.125 104.797 205.422
Length 16, alignment 0/ 0: 104.266 104.938 205.422
Length 16, alignment 0/ 0: 102.406 100.516 200.375
Length 16, alignment 0/ 0: 101.938 99.625 199.656
Length 16, alignment 0/ 0: 102.562 99.4688 199.578
Length 16, alignment 0/ 0: 103.703 100.609 199.312
Length 16, alignment 0/ 3: 102.562 100.266 213.5
Length 16, alignment 3/ 4: 101.156 100.141 213.188
Length 32, alignment 0/ 0: 180.312 179.594 328.766
Length 32, alignment 0/ 0: 179.391 179.562 328.438
Length 32, alignment 0/ 0: 176.031 174.859 321.656
Length 32, alignment 0/ 0: 175.953 174.516 323.016
Length 32, alignment 0/ 0: 176.906 173.891 322.312
Length 32, alignment 0/ 0: 177.328 174.172 322.656
Length 32, alignment 0/ 4: 176.609 174.609 322
Length 32, alignment 4/ 5: 176.844 174.812 347.953
Length 64, alignment 0/ 0: 328.938 329.547 573.062
Length 64, alignment 0/ 0: 328.594 328.188 573.734
Length 64, alignment 0/ 0: 325.891 324.156 567.297
Length 64, alignment 0/ 0: 325.922 323.25 567.922
Length 64, alignment 0/ 0: 326.266 323.938 567.672
Length 64, alignment 0/ 0: 326.641 323.656 568.312
Length 64, alignment 0/ 5: 325.828 323.562 593.547
Length 64, alignment 5/ 6: 325.891 323.422 604.406
Length 128, alignment 0/ 0: 628 627.156 1064.38
Length 128, alignment 0/ 0: 626.594 626.781 1064.12
Length 128, alignment 0/ 0: 623.859 622.25 1063.12
Length 128, alignment 0/ 0: 624.453 622.25 1058.94
Length 128, alignment 0/ 0: 624.797 622.594 1059.06
Length 128, alignment 0/ 0: 624.516 622.25 1057.58
Length 128, alignment 0/ 6: 625.156 622.281 1075.31
Length 128, alignment 6/ 7: 624.484 621.562 1085.12
Length 256, alignment 0/ 0: 1362.39 1225.83 2046.11
Length 256, alignment 0/ 0: 1224.84 1225.56 2046.44
Length 256, alignment 0/ 0: 1221.45 1220.12 2039.59
Length 256, alignment 0/ 0: 1221.53 1219.48 2040.27
Length 256, alignment 0/ 0: 1222.94 1220.17 2040.31
Length 256, alignment 0/ 0: 1222.55 1219.78 2040.34
Length 256, alignment 0/ 7: 1222.16 1219.75 2053.31
Length 256, alignment 7/ 8: 1221.5 1220.16 2054.06
Length 512, alignment 0/ 0: 2421.22 2419.52 4009.11
Length 512, alignment 0/ 0: 2419.88 2420.09 4008.08
Length 512, alignment 0/ 0: 2416.55 2414.77 4003.3
Length 512, alignment 0/ 0: 2417.58 2413.47 4001.94
Length 512, alignment 0/ 0: 2417.2 2415.11 4004.34
Length 512, alignment 0/ 0: 2416.59 2414.73 4002.31
Length 512, alignment 0/ 8: 2416.45 2414.94 4002.61
Length 512, alignment 8/ 9: 2417.58 2414.09 4027.45
Length 1024, alignment 0/ 0: 4810.2 4809.06 7933.61
Length 1024, alignment 0/ 0: 4808.58 4809.42 8055.28
Length 1024, alignment 0/ 0: 4806.56 4803.86 7927.97
Length 1024, alignment 0/ 0: 4804.86 4804.42 7928.3
Length 1024, alignment 0/ 0: 4806.23 4803.39 7928.02
Length 1024, alignment 0/ 0: 4805.91 4803.56 7926.5
Length 1024, alignment 0/ 9: 4805.62 4803.2 7952.8
Length 1024, alignment 9/10: 4805.89 4803.16 7964.42
Length 16, alignment 1/ 2: 103.891 104.141 243.875
Length 16, alignment 2/ 1: 103.938 105.516 246.516
Length 16, alignment 1/ 2: 101.891 100.469 238.516
Length 16, alignment 2/ 1: 102.25 100.469 240.703
Length 16, alignment 1/ 2: 101.859 99.9219 237.312
Length 16, alignment 2/ 1: 101.5 99.8125 239.922
Length 32, alignment 2/ 4: 179.516 180.141 347.859
Length 32, alignment 4/ 2: 179.188 179.469 347.844
Length 32, alignment 2/ 4: 176.562 174.281 342.547
Length 32, alignment 4/ 2: 176.531 175.516 342.797
Length 32, alignment 2/ 4: 177.219 174.609 342.484
Length 32, alignment 4/ 2: 177.203 174.797 342.406
Length 64, alignment 3/ 6: 329.516 330.188 600.891
Length 64, alignment 6/ 3: 329.016 329.75 597.25
Length 64, alignment 3/ 6: 326.172 323.609 600.797
Length 64, alignment 6/ 3: 326.188 324.484 591.531
Length 64, alignment 3/ 6: 326.562 324.203 601.5
Length 64, alignment 6/ 3: 326.469 323.797 590.484
Length 128, alignment 4/ 8: 627.172 627.938 1064.44
Length 128, alignment 8/ 4: 626.125 731.984 1064.86
Length 128, alignment 4/ 8: 624.828 622.594 1063.17
Length 128, alignment 8/ 4: 624.828 622.609 1059.06
Length 128, alignment 4/ 8: 624.484 622.625 1058.72
Length 128, alignment 8/ 4: 624.109 622.641 1058.34
Length 256, alignment 5/10: 1225.45 1225.47 2083.92
Length 256, alignment 10/ 5: 1224.47 1226.22 2087.42
Length 256, alignment 5/10: 1222.14 1219.47 2077.34
Length 256, alignment 10/ 5: 1221.5 1219.91 2080.52
Length 256, alignment 5/10: 1221.81 1219.84 2076.56
Length 256, alignment 10/ 5: 1221.12 1220.78 2080.61
Length 512, alignment 6/12: 2419.5 2420.77 4028.08
Length 512, alignment 12/ 6: 2418.83 2420.06 4028.16
Length 512, alignment 6/12: 2416.11 2414.16 4021.72
Length 512, alignment 12/ 6: 2417.03 2415.11 4022.73
Length 512, alignment 6/12: 2417.12 2414.75 4022.12
Length 512, alignment 12/ 6: 2417.2 2415.16 4022.33
Length 1024, alignment 7/14: 4808.33 4809.05 7960.83
Length 1024, alignment 14/ 7: 4809.47 4809.05 7957.28
Length 1024, alignment 7/14: 4805.81 4951.92 7962
Length 1024, alignment 14/ 7: 4805.78 4803.53 7950.88
Length 1024, alignment 7/14: 4806.55 4803.22 7960.41
Length 1024, alignment 14/ 7: 4805.8 4802.88 7951.06
[-- Attachment #4: bench-strcmp.out.new --]
[-- Type: text/plain, Size: 11142 bytes --]
strcmp simple_strcmp stupid_strcmp
Length 1, alignment 1/ 1: 55.2812 41.8438 113.406
Length 1, alignment 1/ 1: 27.3438 25.2969 62.8906
Length 1, alignment 1/ 1: 27.5 25.1094 63.3906
Length 2, alignment 2/ 2: 38.0312 45.1406 108.219
Length 2, alignment 2/ 2: 30.4219 39.1406 107.109
Length 2, alignment 2/ 2: 30.9688 40.4688 108.469
Length 3, alignment 3/ 3: 50.0938 44.7656 106.375
Length 3, alignment 3/ 3: 51.2656 44.75 106.312
Length 3, alignment 3/ 3: 51.0625 44.8906 106.719
Length 4, alignment 4/ 4: 39.4219 49.8906 117.719
Length 4, alignment 4/ 4: 37.3125 43.1562 110.891
Length 4, alignment 4/ 4: 36.8906 43.6094 112.312
Length 5, alignment 5/ 5: 55.9844 52.7812 150.312
Length 5, alignment 5/ 5: 54.125 47.9062 144.844
Length 5, alignment 5/ 5: 55.0625 47.4219 143.781
Length 6, alignment 6/ 6: 58.4219 57.4375 154.719
Length 6, alignment 6/ 6: 45.5 53.1719 149.016
Length 6, alignment 6/ 6: 45.9688 53.8125 149.375
Length 7, alignment 7/ 7: 67.5938 62.4219 147.344
Length 7, alignment 7/ 7: 64.2188 57.4531 141.578
Length 7, alignment 7/ 7: 64.4688 57.0938 141.25
Length 8, alignment 8/ 8: 48.6875 66.9531 157.344
Length 8, alignment 8/ 8: 41.875 62.1406 151.938
Length 8, alignment 8/ 8: 42.7188 62.3906 152.25
Length 9, alignment 9/ 9: 77.4688 72.4688 192.094
Length 9, alignment 9/ 9: 80.4688 66.1094 185.609
Length 9, alignment 9/ 9: 75.4531 66.4531 185.578
Length 10, alignment 10/10: 64.7812 76.4375 195.203
Length 10, alignment 10/10: 59.25 71.1875 190.094
Length 10, alignment 10/10: 58.2344 71.1406 190.484
Length 11, alignment 11/11: 88.625 81.9375 187.469
Length 11, alignment 11/11: 86.1562 76.1094 181.047
Length 11, alignment 11/11: 91.4844 76.4688 182.156
Length 12, alignment 12/12: 60.5312 85.7969 175
Length 12, alignment 12/12: 58.1562 81.0469 169.906
Length 12, alignment 12/12: 58.3594 81.8594 169.625
Length 13, alignment 13/13: 98.5781 91.1406 230.172
Length 13, alignment 13/13: 97.4844 85.8125 225.062
Length 13, alignment 13/13: 96.5625 85.7656 225.031
Length 14, alignment 14/14: 76.1562 96.0469 213.531
Length 14, alignment 14/14: 69.7031 90.4844 207.406
Length 14, alignment 14/14: 68.7969 91.2344 206.75
Length 15, alignment 15/15: 109.188 100.203 206.219
Length 15, alignment 15/15: 106.953 94.5312 200.391
Length 15, alignment 15/15: 106.891 94.1094 200.094
Length 16, alignment 16/16: 65.8594 104.75 206.297
Length 16, alignment 16/16: 63.8281 99.2031 200.469
Length 16, alignment 16/16: 64.1094 99.5 199.812
Length 17, alignment 17/17: 120.031 109.219 251.234
Length 17, alignment 17/17: 118.375 104.109 244.812
Length 17, alignment 17/17: 117.484 104.547 244.797
Length 18, alignment 18/18: 86.9688 114.656 242.156
Length 18, alignment 18/18: 79.4688 109 237
Length 18, alignment 18/18: 78.9844 108.375 236.719
Length 19, alignment 19/19: 130.016 117.844 236.859
Length 19, alignment 19/19: 128.438 114.828 231.391
Length 19, alignment 19/19: 128.812 113.531 230.688
Length 20, alignment 20/20: 66.1875 123.094 236.031
Length 20, alignment 20/20: 64.5 118.109 229.516
Length 20, alignment 20/20: 64.375 117.984 230.234
Length 21, alignment 21/21: 142 127.859 280.531
Length 21, alignment 21/21: 138.906 123.562 274.703
Length 21, alignment 21/21: 139.266 122.797 275.047
Length 22, alignment 22/22: 97.6406 133.797 274.141
Length 22, alignment 22/22: 90.0781 128.359 268.438
Length 22, alignment 22/22: 89.75 127.078 267.297
Length 23, alignment 23/23: 152.266 137.812 268.234
Length 23, alignment 23/23: 150.125 131.797 262.078
Length 23, alignment 23/23: 150.516 132.125 261.734
Length 24, alignment 24/24: 72.125 142.766 267.281
Length 24, alignment 24/24: 70.1875 136.812 260.578
Length 24, alignment 24/24: 70.4844 136.812 260.172
Length 25, alignment 25/25: 163.188 146.5 311.484
Length 25, alignment 25/25: 160.766 141.188 306.391
Length 25, alignment 25/25: 161.109 142.906 306.375
Length 26, alignment 26/26: 108.141 153.156 303.953
Length 26, alignment 26/26: 101.594 146.234 298.984
Length 26, alignment 26/26: 101.562 146.453 298.625
Length 27, alignment 27/27: 173.906 155.875 298.5
Length 27, alignment 27/27: 171.172 150.906 292.688
Length 27, alignment 27/27: 171.547 152.344 293.109
Length 28, alignment 28/28: 78.5625 161.109 298.438
Length 28, alignment 28/28: 75.5938 156.234 291.938
Length 28, alignment 28/28: 76.4844 155.141 291.234
Length 29, alignment 29/29: 183.672 164.875 343.562
Length 29, alignment 29/29: 187.797 160.172 336.312
Length 29, alignment 29/29: 182.438 159.562 337.453
Length 30, alignment 30/30: 118.812 172.016 335.703
Length 30, alignment 30/30: 112.375 164.172 329.266
Length 30, alignment 30/30: 111.766 165.25 328.906
Length 31, alignment 31/31: 194.656 174.406 330.141
Length 31, alignment 31/31: 193.109 169.734 323.391
Length 31, alignment 31/31: 197.812 169.938 323.75
Length 4, alignment 0/ 0: 38.875 49.5938 118.312
Length 4, alignment 0/ 0: 38.6406 48.0781 117.594
Length 4, alignment 0/ 0: 38.3281 43.7344 112.297
Length 4, alignment 0/ 0: 37.7188 44.2031 112.281
Length 4, alignment 0/ 0: 38.375 43.6562 111.562
Length 4, alignment 0/ 0: 37.8281 44.25 111.688
Length 4, alignment 0/ 1: 49.0156 43.7969 132.766
Length 4, alignment 1/ 2: 48.7969 43.5156 137.234
Length 8, alignment 0/ 0: 47.7344 67.3906 159.422
Length 8, alignment 0/ 0: 48.2344 68.2031 158.438
Length 8, alignment 0/ 0: 41.25 63.5469 151.547
Length 8, alignment 0/ 0: 42.4531 62.1719 151.844
Length 8, alignment 0/ 0: 41.4688 62.1562 152.562
Length 8, alignment 0/ 0: 42.5469 62.1562 152.203
Length 8, alignment 0/ 2: 56.75 61.7969 168.031
Length 8, alignment 2/ 3: 75.2812 62.8125 164.594
Length 16, alignment 0/ 0: 65.5 106.109 206.344
Length 16, alignment 0/ 0: 65.625 104.938 206
Length 16, alignment 0/ 0: 64.5781 99.9062 200.25
Length 16, alignment 0/ 0: 63.3594 99.6094 199.812
Length 16, alignment 0/ 0: 64.2812 99.1562 199.172
Length 16, alignment 0/ 0: 63.4531 99.8281 198.75
Length 16, alignment 0/ 3: 112.844 100.438 213.266
Length 16, alignment 3/ 4: 113.141 99.3906 213.734
Length 32, alignment 0/ 0: 83.4688 178.984 329.078
Length 32, alignment 0/ 0: 82.9062 179.703 328.719
Length 32, alignment 0/ 0: 81.0156 174.141 321.469
Length 32, alignment 0/ 0: 81.0312 173.859 321.906
Length 32, alignment 0/ 0: 80.1719 174.5 425.094
Length 32, alignment 0/ 0: 82.1719 175.062 322.578
Length 32, alignment 0/ 4: 81.5625 174.844 322.266
Length 32, alignment 4/ 5: 197.641 174.484 348.094
Length 64, alignment 0/ 0: 124.781 328.891 573.891
Length 64, alignment 0/ 0: 125.703 329.562 574.234
Length 64, alignment 0/ 0: 124 323.562 568.922
Length 64, alignment 0/ 0: 123.469 322.922 568.438
Length 64, alignment 0/ 0: 123.391 324.016 567.812
Length 64, alignment 0/ 0: 124.141 323.969 568.109
Length 64, alignment 0/ 5: 368.688 323.5 593.297
Length 64, alignment 5/ 6: 369.141 323.75 604.578
Length 128, alignment 0/ 0: 211.188 627.578 1064.97
Length 128, alignment 0/ 0: 209.875 628.906 1064.69
Length 128, alignment 0/ 0: 209.312 622.891 1062.7
Length 128, alignment 0/ 0: 208.938 622.125 1058.53
Length 128, alignment 0/ 0: 208.938 622.547 1059.25
Length 128, alignment 0/ 0: 209 622.281 1058.88
Length 128, alignment 0/ 6: 372.406 622.109 1075.41
Length 128, alignment 6/ 7: 711.203 622.203 1086.06
Length 256, alignment 0/ 0: 381.953 1225.98 2045.91
Length 256, alignment 0/ 0: 381.203 1225.58 2046.22
Length 256, alignment 0/ 0: 380.703 1219.52 2040.84
Length 256, alignment 0/ 0: 379.312 1219.27 2040.48
Length 256, alignment 0/ 0: 379.281 1218.56 2040.47
Length 256, alignment 0/ 0: 379.953 1220.17 2040.84
Length 256, alignment 0/ 7: 1392.52 1219.81 2053.27
Length 256, alignment 7/ 8: 1393.08 1220.58 2053.02
Length 512, alignment 0/ 0: 724.828 2420.86 4009.06
Length 512, alignment 0/ 0: 722.562 2420.8 4008.62
Length 512, alignment 0/ 0: 721.953 2414.12 4002.53
Length 512, alignment 0/ 0: 720.938 2413.97 4002.19
Length 512, alignment 0/ 0: 721.422 2414.14 4002.23
Length 512, alignment 0/ 0: 721.016 2414.61 4002.89
Length 512, alignment 0/ 8: 720.969 2415.02 4001.16
Length 512, alignment 8/ 9: 2758.58 2414.36 4027.84
Length 1024, alignment 0/ 0: 1737.12 4809.94 7934.03
Length 1024, alignment 0/ 0: 1405.19 4808.14 7933.61
Length 1024, alignment 0/ 0: 1402.95 4803.48 7927.91
Length 1024, alignment 0/ 0: 1403.48 4803.72 7927.17
Length 1024, alignment 0/ 0: 1403.58 4804.3 7928.25
Length 1024, alignment 0/ 0: 1403.67 4802.84 7927.83
Length 1024, alignment 0/ 9: 5488.84 4803.41 7952.75
Length 1024, alignment 9/10: 5488.34 4803.36 7964.98
Length 16, alignment 1/ 2: 114.516 104.156 242.672
Length 16, alignment 2/ 1: 114.625 104.5 245.531
Length 16, alignment 1/ 2: 112.125 99.9062 236.859
Length 16, alignment 2/ 1: 112.469 100.516 240.578
Length 16, alignment 1/ 2: 113.234 99.5156 237.172
Length 16, alignment 2/ 1: 112.781 99.8906 240.547
Length 32, alignment 2/ 4: 124.156 179.531 347.328
Length 32, alignment 4/ 2: 123.797 179.766 348
Length 32, alignment 2/ 4: 116.953 175.125 341.688
Length 32, alignment 4/ 2: 117.516 174.422 341.141
Length 32, alignment 2/ 4: 116.781 173.172 341.25
Length 32, alignment 4/ 2: 116.672 174.203 341.172
Length 64, alignment 3/ 6: 371.234 329.906 599.672
Length 64, alignment 6/ 3: 370.562 329.531 596.078
Length 64, alignment 3/ 6: 368.891 322.844 601.031
Length 64, alignment 6/ 3: 374.469 323.875 600.531
Length 64, alignment 3/ 6: 374.281 324.594 591.125
Length 64, alignment 6/ 3: 373.844 323.875 600.578
Length 128, alignment 4/ 8: 210.625 627.438 1063.94
Length 128, alignment 8/ 4: 210.469 628.234 1065.33
Length 128, alignment 4/ 8: 209.859 622.984 1061.83
Length 128, alignment 8/ 4: 209.641 622.219 1057.14
Length 128, alignment 4/ 8: 209.078 622.609 1057.91
Length 128, alignment 8/ 4: 208.469 621.453 1058.62
Length 256, alignment 5/10: 1399.86 1225.31 2084.86
Length 256, alignment 10/ 5: 1394.7 1225.75 2086.88
Length 256, alignment 5/10: 1393.8 1220.42 2078.56
Length 256, alignment 10/ 5: 1397.73 1219.47 2080.16
Length 256, alignment 5/10: 1398.42 1220.52 2076.86
Length 256, alignment 10/ 5: 1392.72 1219.55 2080.2
Length 512, alignment 6/12: 1403.19 2419.84 4029.84
Length 512, alignment 12/ 6: 1403.45 2419.45 4028.77
Length 512, alignment 6/12: 1397.28 2414.45 4022.02
Length 512, alignment 12/ 6: 1396.98 2414.45 4021.92
Length 512, alignment 6/12: 1396.98 2414.42 4021.33
Length 512, alignment 12/ 6: 1396.44 2413.98 4021.95
Length 1024, alignment 7/14: 5491.94 4808.83 7959.73
Length 1024, alignment 14/ 7: 5490.12 4809.33 7956.56
Length 1024, alignment 7/14: 5627.53 4804.8 7952.16
Length 1024, alignment 14/ 7: 5488.05 4802.39 7960.66
Length 1024, alignment 7/14: 5488.42 4803.78 7951.44
Length 1024, alignment 14/ 7: 5488.7 4803.83 7959.84
* Re: [Patch, mips] Faster strcmp for mips
2013-11-14 21:25 [Patch, mips] Faster strcmp for mips Steve Ellcey
@ 2013-11-14 23:14 ` Ondřej Bílka
2013-11-15 18:21 ` Steve Ellcey
2013-11-15 18:48 ` Carlos O'Donell
2015-02-19 7:25 ` Matt Turner
2 siblings, 1 reply; 11+ messages in thread
From: Ondřej Bílka @ 2013-11-14 23:14 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Thu, Nov 14, 2013 at 01:23:41PM -0800, Steve Ellcey wrote:
>
> I have created an optimized strcmp routine for MIPS and would like to check
> it in. I verified it by running 'make check' and I also ran 'make bench'
> to check the performance. This function is slightly slower than the current
> generic strcmp for strings that are not aligned, but much faster when strings
> are aligned because it will then load and compare either 2, 4, or 8 bytes at a
> time.
>
The problem is that aligned accesses may account for only a third of total
calls. Also, around 90% of calls need to check only the first 16 bytes.
These numbers differ by application; on x64 they are the rule rather than
the exception. I collected data from running gcc+gnuplot here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/strcmp_profile/results_gcc/result.html
> This means it could be loading bytes beyond the end of the strings being
> compared but it looks like other architecture specific strcmp functions
> are also doing this optimization and the newlib version of strcmp also does
> this.
>
Also, the unaligned case can be made faster with a bit of extra effort.
How fast are unaligned loads on MIPS? They should be faster than
assembling the value manually with shifts from two loads, which in turn
should be faster than byte-by-byte checking.
What is most important for performance is to handle the first bytes
quickly. You need to check several alternatives: one is to unroll the
checking of the first 8-16 bytes; a second is to do a single vector check.
Then you need to implement a loop; what I did on x64 is to align one
argument and load the second as unaligned.
You need to precompute when an unaligned access would cross a page
boundary and switch to a byte-by-byte loop when that happens.
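For the record, a rough C sketch of the two-loads-plus-shifts idea
(little-endian assumed; illustrative only, not code from this thread, and
like everything word-based here it may read a few bytes past the string):

#include <stdint.h>

/* Emulate an unaligned 8-byte load with two aligned loads and shifts
   (little-endian byte order assumed).  */
static inline uint64_t
load64_shifted (const char *p)
{
  uintptr_t off = (uintptr_t) p & 7;
  const uint64_t *w = (const uint64_t *) ((uintptr_t) p - off);
  if (off == 0)
    return w[0];
  /* Low bytes come from the first word, high bytes from the second.  */
  return (w[0] >> (8 * off)) | (w[1] << (8 * (8 - off)));
}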
Here is modified code that I used to generate the x64 implementation; I
hope it will be useful.
You need to change TEST (a, b) into code that checks 8 bytes starting at a
and b.
You may also try to unroll these loops to do 16/32 bytes per iteration.
int
strcmp_new (char *a, char *b)
{
  int i;
  int or = (((unsigned long) a) | ((unsigned long) b)) & 4095;
  if (or <= 4096 - 8)
    {
      TEST (a, b);
    }
  else
    {
      for (i = 0; i < 8; i++)
        {
          if (!a[i] || (a[i] - b[i]))
            return a[i] - b[i];
        }
    }
  char *a_al = ALIGN_DOWN (a + 8, 8);
  int dif = a_al - a;
  a += dif;
  b += dif;
  unsigned long b_next = ((unsigned long) b) & (4096 - 1);
  unsigned long next_page = (4096 - b_next) / 8;
  while (1)
    {
      if (__builtin_expect (next_page-- == 0, 0))
        {
          next_page = 4096 / 8;
          next_page--;
          for (i = 0; i < 8; i++)
            {
              if (!a[i] || (a[i] - b[i]))
                return a[i] - b[i];
            }
        }
      TEST (a, b);
      a += 8;
      b += 8;
    }
}
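For illustration, one possible shape for such a TEST on a 64-bit
little-endian target — a sketch reusing the patch's DETECTNULL8 and GCC's
__builtin_ffsll, anticipating the branchless tail described just below;
this is not Ondřej's actual x64 code:

#include <stdint.h>
#include <string.h>

/* Check 8 bytes at a and b; return from the enclosing function once a
   difference or the terminating null is found (little-endian).  */
#define TEST(a, b)                                              \
  do                                                            \
    {                                                           \
      uint64_t x_, y_;                                          \
      memcpy (&x_, (a), 8);                                     \
      memcpy (&y_, (b), 8);                                     \
      uint64_t m_ = DETECTNULL8 (x_) | (x_ ^ y_);               \
      if (m_ != 0)                                              \
        {                                                       \
          /* Lowest set bit marks the first diff or null.  */   \
          int pos_ = (__builtin_ffsll (m_) - 1) / 8;            \
          return ((unsigned char *) (a))[pos_]                  \
                 - ((unsigned char *) (b))[pos_];               \
        }                                                       \
    }                                                           \
  while (0)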
Also, here:
> +  bx.v = x; by.v = y;
> +  if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
> +    return bx.b.B0 - by.b.B0;
> +  if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
> +    return bx.b.B1 - by.b.B1;
> +  if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
> +    return bx.b.B2 - by.b.B2;
> +  if (bx.b.B3 - by.b.B3 != 0 || bx.b.B3 == 0)
> +    return bx.b.B3 - by.b.B3;
> +  if (bx.b.B4 - by.b.B4 != 0 || bx.b.B4 == 0)
> +    return bx.b.B4 - by.b.B4;
> +  if (bx.b.B5 - by.b.B5 != 0 || bx.b.B5 == 0)
> +    return bx.b.B5 - by.b.B5;
> +  if (bx.b.B6 - by.b.B6 != 0 || bx.b.B6 == 0)
> +    return bx.b.B6 - by.b.B6;
> +  return bx.b.B7 - by.b.B7;
There is an alternative way to find the answer without branching.
You need to compute a number that has a nonzero i-th byte if the i-th
bytes of x and y differ or are 0. Then find the first nonzero bit, which
can be done quickly by a CLZ instruction.
This could be implemented as follows:
uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
uint64_t bitfirst = bitmask ^ (bitmask - 1);
int pos = (ffsl(bitfirst) - 1) / 8;
return a[pos] - b[pos];
* Re: [Patch, mips] Faster strcmp for mips
2013-11-14 23:14 ` Ondřej Bílka
@ 2013-11-15 18:21 ` Steve Ellcey
2013-11-15 19:02 ` Ondřej Bílka
0 siblings, 1 reply; 11+ messages in thread
From: Steve Ellcey @ 2013-11-15 18:21 UTC (permalink / raw)
To: Ondřej Bílka; +Cc: libc-ports
On Fri, 2013-11-15 at 00:14 +0100, Ondřej Bílka wrote:
> The problem is that aligned accesses may account for only a third of total
> calls. Also, around 90% of calls need to check only the first 16 bytes.
> These numbers differ by application; on x64 they are the rule rather than
> the exception. I collected data from running gcc+gnuplot here:
> http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/strcmp_profile/results_gcc/result.html
Very interesting; you have obviously spent a lot more time than I have
on strcmp.
> > This means it could be loading bytes beyond the end of the strings being
> > compared but it looks like other architecture specific strcmp functions
> > are also doing this optimization and the newlib version of strcmp also does
> > this.
> >
> Also, the unaligned case can be made faster with a bit of extra effort.
> How fast are unaligned loads on MIPS? They should be faster than
> assembling the value manually with shifts from two loads, which in turn
> should be faster than byte-by-byte checking.
MIPS doesn't have unaligned loads, but it has 'partial' loads where two
loads can give you four (or eight) bytes in two load operations
regardless of alignment. I may need to look at this some more, though
that will probably mean going to assembly language instead of C.
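As a possible intermediate step before assembly: GCC can often be persuaded
to emit the lwl/lwr (or ldl/ldr) pairs itself when the misaligned access
goes through a packed struct — a sketch relying on GCC extensions, not code
from this thread:

#include <stdint.h>

struct unaligned32
{
  uint32_t w;
} __attribute__ ((packed, may_alias));

/* On pre-R6 MIPS, GCC has typically lowered this access to an
   lwl/lwr pair.  */
static inline uint32_t
load32_partial (const char *p)
{
  return ((const struct unaligned32 *) p)->w;
}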
> Also, here:
> > +  bx.v = x; by.v = y;
> > +  if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
> > +    return bx.b.B0 - by.b.B0;
> > +  if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
> > +    return bx.b.B1 - by.b.B1;
> > +  if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
> > +    return bx.b.B2 - by.b.B2;
> > +  if (bx.b.B3 - by.b.B3 != 0 || bx.b.B3 == 0)
> > +    return bx.b.B3 - by.b.B3;
> > +  if (bx.b.B4 - by.b.B4 != 0 || bx.b.B4 == 0)
> > +    return bx.b.B4 - by.b.B4;
> > +  if (bx.b.B5 - by.b.B5 != 0 || bx.b.B5 == 0)
> > +    return bx.b.B5 - by.b.B5;
> > +  if (bx.b.B6 - by.b.B6 != 0 || bx.b.B6 == 0)
> > +    return bx.b.B6 - by.b.B6;
> > +  return bx.b.B7 - by.b.B7;
>
> There is an alternative way to find the answer without branching.
> You need to compute a number that has a nonzero i-th byte if the i-th
> bytes of x and y differ or are 0. Then find the first nonzero bit, which
> can be done quickly by a CLZ instruction.
>
> This could be implemented as follows:
>
> uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
> uint64_t bitfirst = bitmask ^ (bitmask - 1);
> int pos = (ffsl(bitfirst) - 1) / 8;
> return a[pos] - b[pos];
This looks very interesting. I tried just plugging it in, but it didn't
work in some cases. I need to stare at this some more to understand
exactly how it should work and what cases are failing for me.
Steve Ellcey
sellcey@mips.com
* Re: [Patch, mips] Faster strcmp for mips
2013-11-14 21:25 [Patch, mips] Faster strcmp for mips Steve Ellcey
2013-11-14 23:14 ` Ondřej Bílka
@ 2013-11-15 18:48 ` Carlos O'Donell
2013-11-15 18:53 ` Steve Ellcey
2015-02-19 7:25 ` Matt Turner
2 siblings, 1 reply; 11+ messages in thread
From: Carlos O'Donell @ 2013-11-15 18:48 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Thu, Nov 14, 2013 at 4:23 PM, Steve Ellcey <sellcey@mips.com> wrote:
> This means it could be loading bytes beyond the end of the strings being
> compared but it looks like other architecture specific strcmp functions
> are also doing this optimization and the newlib version of strcmp also does
> this.
I thought that doing so was dangerous? I'm pretty sure we've been trying
to fix such "load bytes beyond the end of the string" issues because you
could have a string that straddles a page boundary with the next page
unmapped and such an optimized routine would fault on a read from the
unmapped page.
How do you plan to fix that?
Cheers,
Carlos.
* Re: [Patch, mips] Faster strcmp for mips
2013-11-15 18:48 ` Carlos O'Donell
@ 2013-11-15 18:53 ` Steve Ellcey
2013-11-15 18:59 ` Carlos O'Donell
0 siblings, 1 reply; 11+ messages in thread
From: Steve Ellcey @ 2013-11-15 18:53 UTC (permalink / raw)
To: Carlos O'Donell; +Cc: libc-ports
On Fri, 2013-11-15 at 13:47 -0500, Carlos O'Donell wrote:
> On Thu, Nov 14, 2013 at 4:23 PM, Steve Ellcey <sellcey@mips.com> wrote:
> > This means it could be loading bytes beyond the end of the strings being
> > compared but it looks like other architecture specific strcmp functions
> > are also doing this optimization and the newlib version of strcmp also does
> > this.
>
> I thought that doing so was dangerous? I'm pretty sure we've been trying
> to fix such "load bytes beyond the end of the string" issues because you
> could have a string that straddles a page boundary with the next page
> unmapped and such an optimized routine would fault on a read from the
> unmapped page.
>
> How do you plan to fix that?
>
> Cheers,
> Carlos.
I think the assumption has been that if an aligned word contains at
least one byte of the string, it is OK to read that entire word. I
don't think a single aligned word (4 or 8 bytes depending on the
architecture) can straddle a page boundary. If the word was unaligned,
then I think it could straddle a page boundary and you could get into
trouble.
Steve Ellcey
sellcey@mips.com
* Re: [Patch, mips] Faster strcmp for mips
2013-11-15 18:53 ` Steve Ellcey
@ 2013-11-15 18:59 ` Carlos O'Donell
0 siblings, 0 replies; 11+ messages in thread
From: Carlos O'Donell @ 2013-11-15 18:59 UTC (permalink / raw)
To: Steve Ellcey, Carlos O'Donell; +Cc: libc-ports
On 11/15/2013 01:52 PM, Steve Ellcey wrote:
> On Fri, 2013-11-15 at 13:47 -0500, Carlos O'Donell wrote:
>> On Thu, Nov 14, 2013 at 4:23 PM, Steve Ellcey <sellcey@mips.com> wrote:
>>> This means it could be loading bytes beyond the end of the strings being
>>> compared but it looks like other architecture specific strcmp functions
>>> are also doing this optimization and the newlib version of strcmp also does
>>> this.
>>
>> I thought that doing so was dangerous? I'm pretty sure we've been trying
>> to fix such "load bytes beyond the end of the string" issues because you
>> could have a string that straddles a page boundary with the next page
>> unmapped and such an optimized routine would fault on a read from the
>> unmapped page.
>>
>> How do you plan to fix that?
>>
>> Cheers,
>> Carlos.
>
> I think the assumption has been that if an aligned word contains at
> least one byte of the string, it is OK to read that entire word. I
> don't think a single aligned word (4 or 8 bytes depending on the
> architecture) can straddle a page boundary. If the word was unaligned,
> then I think it could straddle a page boundary and you could get into
> trouble.
That is obviously correct because the aligned word itself wouldn't
cross the page boundary. My only comment here is that reading
past the end of the string is a dangerous operation unless you take
care to ensure it doesn't cross a page boundary. Given that you've
clarified that it's only ever an aligned word load, that can't
possibly cross a page boundary, so you're safe.
Sorry for the noise.
Cheers,
Carlos.
* Re: [Patch, mips] Faster strcmp for mips
2013-11-15 18:21 ` Steve Ellcey
@ 2013-11-15 19:02 ` Ondřej Bílka
2013-11-18 23:50 ` Steve Ellcey
0 siblings, 1 reply; 11+ messages in thread
From: Ondřej Bílka @ 2013-11-15 19:02 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Fri, Nov 15, 2013 at 10:20:04AM -0800, Steve Ellcey wrote:
On Fri, 2013-11-15 at 00:14 +0100, Ondřej Bílka wrote:
>
> > The problem is that aligned accesses may account for only a third of
> > total calls. Also, around 90% of calls need to check only the first 16
> > bytes. These numbers differ by application; on x64 they are the rule
> > rather than the exception. I collected data from running gcc+gnuplot here:
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/strcmp_profile/results_gcc/result.html
>
> Very interesting; you have obviously spent a lot more time than I have
> on strcmp.
>
> > > This means it could be loading bytes beyond the end of the strings being
> > > compared but it looks like other architecture specific strcmp functions
> > > are also doing this optimization and the newlib version of strcmp also does
> > > this.
> > >
> > Also, the unaligned case can be made faster with a bit of extra effort.
> > How fast are unaligned loads on MIPS? They should be faster than
> > assembling the value manually with shifts from two loads, which in turn
> > should be faster than byte-by-byte checking.
>
> MIPS doesn't have unaligned loads, but it has 'partial' loads where two
> loads can give you four (or eight) bytes in two load operations
> regardless of alignment. I may need to look at this some more, though
> that will probably mean going to assembly language instead of C.
>
You can also first try this in C by using shifts.
>
> > Also, here:
> > > +  bx.v = x; by.v = y;
> > > +  if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
> > > +    return bx.b.B0 - by.b.B0;
> > > +  if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
> > > +    return bx.b.B1 - by.b.B1;
> > > +  if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
> > > +    return bx.b.B2 - by.b.B2;
> > > +  if (bx.b.B3 - by.b.B3 != 0 || bx.b.B3 == 0)
> > > +    return bx.b.B3 - by.b.B3;
> > > +  if (bx.b.B4 - by.b.B4 != 0 || bx.b.B4 == 0)
> > > +    return bx.b.B4 - by.b.B4;
> > > +  if (bx.b.B5 - by.b.B5 != 0 || bx.b.B5 == 0)
> > > +    return bx.b.B5 - by.b.B5;
> > > +  if (bx.b.B6 - by.b.B6 != 0 || bx.b.B6 == 0)
> > > +    return bx.b.B6 - by.b.B6;
> > > +  return bx.b.B7 - by.b.B7;
> >
> > There is an alternative way to find the answer without branching.
> > You need to compute a number that has a nonzero i-th byte if the i-th
> > bytes of x and y differ or are 0. Then find the first nonzero bit,
> > which can be done quickly by a CLZ instruction.
> >
> > This could be implemented as follows:
> >
> > uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
> > uint64_t bitfirst = bitmask ^ (bitmask - 1);
> > int pos = (ffsl(bitfirst) - 1) / 8;
> > return a[pos] - b[pos];
>
> This looks very interesting. I tried just plugging it in, but it didn't
> work in some cases. I need to stare at this some more to understand
> exactly how it should work and what cases are failing for me.
>
I wrote that late at night and my first idea was to use __builtin_clz,
which would yield the following code.
uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
uint64_t bitfirst = bitmask ^ (bitmask - 1);
int pos = (63 - __builtin_clzl(bitfirst)) / 8;
return a[pos] - b[pos];
I decided that using ffsl was shorter, but for some reason I kept
bitfirst there. A correct version is:
uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
int pos = (ffsl(bitmask) - 1) / 8;
return a[pos] - b[pos];
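Assembled into a standalone helper, the corrected little-endian tail might
look like this (a sketch: DETECTNULL8 is the macro from the patch,
__builtin_ffsll stands in for the ffsl library call, and a/b point at the
bytes from which x and y were loaded):

#include <stdint.h>

#define DETECTNULL8(X) (((X) - 0x0101010101010101) \
                        & ~((X) | 0x7f7f7f7f7f7f7f7f))

static int
final_cmp_le (uint64_t x, uint64_t y,
              const unsigned char *a, const unsigned char *b)
{
  /* Nonzero byte lanes mark a difference or a terminating null.  */
  uint64_t bitmask = DETECTNULL8 (x) | (x ^ y);
  int pos = (__builtin_ffsll (bitmask) - 1) / 8;
  return a[pos] - b[pos];
}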
* Re: [Patch, mips] Faster strcmp for mips
2013-11-15 19:02 ` Ondřej Bílka
@ 2013-11-18 23:50 ` Steve Ellcey
2013-11-19 2:32 ` Ondřej Bílka
0 siblings, 1 reply; 11+ messages in thread
From: Steve Ellcey @ 2013-11-18 23:50 UTC (permalink / raw)
To: Ondřej Bílka; +Cc: libc-ports
On Fri, 2013-11-15 at 20:02 +0100, Ondřej Bílka wrote:
> I decided that using ffsl was shorter, but for some reason I kept
> bitfirst there. A correct version is:
>
> uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
> int pos = (ffsl(bitmask) - 1) / 8;
> return a[pos] - b[pos];
Yes, that works much better. But it only works in little-endian mode. I
think I would need an fls (find last set) or something similar for
big-endian, wouldn't I? Or else I would need to swap the bytes around
before using ffs/ffsl.
Steve Ellcey
sellcey@mips.com
* Re: [Patch, mips] Faster strcmp for mips
2013-11-18 23:50 ` Steve Ellcey
@ 2013-11-19 2:32 ` Ondřej Bílka
0 siblings, 0 replies; 11+ messages in thread
From: Ondřej Bílka @ 2013-11-19 2:32 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Mon, Nov 18, 2013 at 03:37:58PM -0800, Steve Ellcey wrote:
> On Fri, 2013-11-15 at 20:02 +0100, Ondřej Bílka wrote:
>
> > I decided that using ffsl was shorter, but for some reason I kept
> > bitfirst there. A correct version is:
> >
> > uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
> > int pos = (ffsl(bitmask) - 1) / 8;
> > return a[pos] - b[pos];
>
> Yes, that works much better. But it only works in little-endian mode. I
> think I would need an fls (find last set) or something similar for
> big-endian, wouldn't I? Or else I would need to swap the bytes around
> before using ffs/ffsl.
>
Yes, the correct function is __builtin_clzl. The difference from ffs is
that when you pass zero the result is undefined, which should not be a
problem here.
There are more builtins here:
http://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
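So for big-endian the same tail would count leading rather than trailing
zeros — a sketch along those lines (reusing DETECTNULL8 from above;
__builtin_clzll is spelled here for a guaranteed 64-bit operand, and
bitmask is nonzero at this point by construction, avoiding the undefined
zero case):

/* Big-endian: the first byte in memory occupies the most significant
   bit lane, so locate the highest set bit instead of the lowest.  */
static int
final_cmp_be (uint64_t x, uint64_t y,
              const unsigned char *a, const unsigned char *b)
{
  uint64_t bitmask = DETECTNULL8 (x) | (x ^ y);
  int pos = __builtin_clzll (bitmask) / 8;
  return a[pos] - b[pos];
}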
* Re: [Patch, mips] Faster strcmp for mips
2013-11-14 21:25 [Patch, mips] Faster strcmp for mips Steve Ellcey
2013-11-14 23:14 ` Ondřej Bílka
2013-11-15 18:48 ` Carlos O'Donell
@ 2015-02-19 7:25 ` Matt Turner
2015-02-19 18:12 ` Steve Ellcey
2 siblings, 1 reply; 11+ messages in thread
From: Matt Turner @ 2015-02-19 7:25 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Thu, Nov 14, 2013 at 1:23 PM, Steve Ellcey <sellcey@mips.com> wrote:
>
> I have created an optimized strcmp routine for MIPS and would like to check
> it in. I verified it by running 'make check' and I also ran 'make bench'
> to check the performance. This function is slightly slower than the current
> generic strcmp for strings that are not aligned, but much faster when strings
> are aligned because it will then load and compare either 2, 4, or 8 bytes at a
> time.
>
> This means it could be loading bytes beyond the end of the strings being
> compared but it looks like other architecture specific strcmp functions
> are also doing this optimization and the newlib version of strcmp also does
> this.
>
> OK to checkin?
>
> I have attached the original and new bench-strcmp.out files in case anyone
> wants to compare the old and new strcmp performance results.
>
> Steve Ellcey
> sellcey@mips.com
>
>
> 2013-11-14 Steve Ellcey <sellcey@mips.com>
>
> * sysdeps/mips/strcmp.c: New.
>
>
Doesn't seem that this was ever committed?
* Re: [Patch, mips] Faster strcmp for mips
2015-02-19 7:25 ` Matt Turner
@ 2015-02-19 18:12 ` Steve Ellcey
0 siblings, 0 replies; 11+ messages in thread
From: Steve Ellcey @ 2015-02-19 18:12 UTC (permalink / raw)
To: Matt Turner; +Cc: libc-ports
On Wed, 2015-02-18 at 23:24 -0800, Matt Turner wrote:
> > 2013-11-14 Steve Ellcey <sellcey@mips.com>
> >
> > * sysdeps/mips/strcmp.c: New.
> >
> >
>
> Doesn't seem that this was ever committed?
That specific patch wasn't checked in, but a later patch with an assembly
language version of MIPS strcmp was committed instead.
From glibc ChangeLog:
2014-10-01 Steve Ellcey <sellcey@mips.com>
* sysdeps/mips/strcmp.S: New.
Steve Ellcey
sellcey@imgtec.com
P.S. The libc-ports mailing list is no longer being used; the libc-alpha
or libc-help mailing lists should be used instead.