* [Patch, mips] Faster strcmp for mips
@ 2013-11-14 21:25 Steve Ellcey
2013-11-14 23:14 ` Ondřej Bílka
` (2 more replies)
0 siblings, 3 replies; 11+ messages in thread
From: Steve Ellcey @ 2013-11-14 21:25 UTC (permalink / raw)
To: libc-ports
[-- Attachment #1: Type: text/plain, Size: 884 bytes --]
I have created an optimized strcmp routine for MIPS and would like to check
it in. I verified it by running 'make check' and I also ran 'make bench'
to check the performance. This function is slightly slower than the current
generic strcmp for strings that are not aligned, but much faster when strings
are aligned because it will then load and compare either 2, 4, or 8 bytes at a
time.
This means it could be loading bytes beyond the end of the strings being
compared but it looks like other architecture specific strcmp functions
are also doing this optimization and the newlib version of strcmp also does
this.
OK to checkin?
I have attached the original and new bench-strcmp.out files in case anyone
wants to compare the old and new strcmp performance results.
Steve Ellcey
sellcey@mips.com
2013-11-14 Steve Ellcey <sellcey@mips.com>
* sysdeps/mips/strcmp.c: New.
[-- Attachment #2: strcmp.patch --]
[-- Type: text/x-patch, Size: 5369 bytes --]
diff --git a/ports/sysdeps/mips/strcmp.c b/ports/sysdeps/mips/strcmp.c
new file mode 100644
index 0000000..8ce75cd
--- /dev/null
+++ b/ports/sysdeps/mips/strcmp.c
@@ -0,0 +1,192 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+
+#ifdef _LIBC
+# include <string.h>
+# include <memcopy.h>
+# undef strcmp
+#endif
+
+#ifdef __mips64
+# define ENABLE64
+#endif
+
+/* The UNALIGNED* macros return 0 if both addresses are aligned on
+   a 2, 4, or 8 byte boundary.  The DETECTNULL* macros return nonzero
+   if the 2, 4, or 8 byte value has a null byte in it.  If ENABLE64
+   is not set, we set UNALIGNED8 to '1' so that it always returns
+   TRUE and we never try to use 8 byte operations.  */
+
+#ifdef ENABLE64
+# define UNALIGNED8(X, Y) (((uintptr_t) X & (sizeof (uint64_t) - 1)) \
+                           | ((uintptr_t) Y & (sizeof (uint64_t) - 1)))
+# define DETECTNULL8(X) (((X) - 0x0101010101010101) \
+                         & ~((X) | 0x7f7f7f7f7f7f7f7f))
+#else
+# define UNALIGNED8(X, Y) (1)
+# define DETECTNULL8(X) (0)
+#endif
+
+
+typedef struct bits8
+{
+  unsigned int B0:8, B1:8, B2:8, B3:8, B4:8, B5:8, B6:8, B7:8;
+} bits8_t;
+typedef union
+{
+  uint64_t v;
+  bits8_t b;
+} bitfields8_t;
+
+
+#define UNALIGNED4(X, Y) (((uintptr_t) X & (sizeof (uint32_t) - 1)) \
+                          | ((uintptr_t) Y & (sizeof (uint32_t) - 1)))
+#define DETECTNULL4(X) (((X) - 0x01010101) & ~((X) | 0x7f7f7f7f))
+
+typedef struct bits4
+{
+  unsigned int B0:8, B1:8, B2:8, B3:8;
+} bits4_t;
+typedef union
+{
+  uint32_t v;
+  bits4_t b;
+} bitfields4_t;
+
+
+#define UNALIGNED2(X, Y) (((uintptr_t) X & (sizeof (uint16_t) - 1)) \
+                          | ((uintptr_t) Y & (sizeof (uint16_t) - 1)))
+#define DETECTNULL2(X) (((X) - 0x0101) & ~((X) | 0x7f7f))
+typedef struct bits2
+{
+  unsigned int B0:8, B1:8;
+} bits2_t;
+typedef union
+{
+  uint16_t v;
+  bits2_t b;
+} bitfields2_t;
+
+
+#ifndef STRCMP_NAME
+# define STRCMP_NAME strcmp
+#endif
+
+int
+STRCMP_NAME (const char *s1, const char *s2)
+{
+  if (__builtin_expect (UNALIGNED2 (s1, s2), 1))
+    {
+      /* s1 and s2 are unaligned, compare them one byte at a time.  */
+      while (*s1 != '\0' && *s1 == *s2)
+        {
+          s1++;
+          s2++;
+        }
+      return (*(unsigned char *) s1) - (*(unsigned char *) s2);
+    }
+
+  if (__builtin_expect (UNALIGNED4 (s1, s2), 1))
+    {
+      /* s1 and s2 are 2 byte aligned, compare them 2 bytes at a time.  */
+      uint16_t *a1 = (uint16_t *) s1;
+      uint16_t *a2 = (uint16_t *) s2;
+      uint16_t x, y;
+      bitfields2_t bx, by;
+      x = *(a1++);
+      y = *(a2++);
+      while (1)
+        {
+          if (x != y)
+            break;
+          if (DETECTNULL2 (x))
+            return 0;
+          x = *(a1++);
+          y = *(a2++);
+        }
+      bx.v = x; by.v = y;
+      if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
+        return bx.b.B0 - by.b.B0;
+      return bx.b.B1 - by.b.B1;
+    }
+
+  if (__builtin_expect (UNALIGNED8 (s1, s2), 1))
+    {
+      /* s1 and s2 are 4 byte aligned, compare them 4 bytes at a time.  */
+      uint32_t *a1 = (uint32_t *) s1;
+      uint32_t *a2 = (uint32_t *) s2;
+      uint32_t x, y;
+      bitfields4_t bx, by;
+      x = *(a1++);
+      y = *(a2++);
+      while (1)
+        {
+          if (x != y)
+            break;
+          if (DETECTNULL4 (x))
+            return 0;
+          x = *(a1++);
+          y = *(a2++);
+        }
+      bx.v = x; by.v = y;
+      if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
+        return bx.b.B0 - by.b.B0;
+      if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
+        return bx.b.B1 - by.b.B1;
+      if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
+        return bx.b.B2 - by.b.B2;
+      return bx.b.B3 - by.b.B3;
+    }
+
+  /* s1 and s2 are 8 byte aligned, compare them 8 bytes at a time.  */
+  uint64_t *a1 = (uint64_t *) s1;
+  uint64_t *a2 = (uint64_t *) s2;
+  uint64_t x, y;
+  bitfields8_t bx, by;
+  x = *(a1++);
+  y = *(a2++);
+  while (1)
+    {
+      if (x != y)
+        break;
+      if (DETECTNULL8 (x))
+        return 0;
+      x = *(a1++);
+      y = *(a2++);
+    }
+  bx.v = x; by.v = y;
+  if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
+    return bx.b.B0 - by.b.B0;
+  if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
+    return bx.b.B1 - by.b.B1;
+  if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
+    return bx.b.B2 - by.b.B2;
+  if (bx.b.B3 - by.b.B3 != 0 || bx.b.B3 == 0)
+    return bx.b.B3 - by.b.B3;
+  if (bx.b.B4 - by.b.B4 != 0 || bx.b.B4 == 0)
+    return bx.b.B4 - by.b.B4;
+  if (bx.b.B5 - by.b.B5 != 0 || bx.b.B5 == 0)
+    return bx.b.B5 - by.b.B5;
+  if (bx.b.B6 - by.b.B6 != 0 || bx.b.B6 == 0)
+    return bx.b.B6 - by.b.B6;
+  return bx.b.B7 - by.b.B7;
+}
+#ifdef _LIBC
+libc_hidden_builtin_def (STRCMP_NAME)
+#endif
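A side note on the zero-byte test used above: subtracting 0x01 from every
byte of X can only carry a borrow out of a byte that is zero, and masking
with ~(X | 0x7f..7f) (equivalently ~X & 0x80..80) keeps only the high bits
of bytes whose top bit was clear, so DETECTNULL* is nonzero precisely when
X contains a null byte. A minimal standalone check of the 4-byte variant,
with hypothetical test values:

#include <assert.h>
#include <stdint.h>

#define DETECTNULL4(X) (((X) - 0x01010101) & ~((X) | 0x7f7f7f7f))

int
main (void)
{
  /* "abcd" packed into a little-endian word: no zero byte, mask is 0.  */
  assert (DETECTNULL4 ((uint32_t) 0x64636261) == 0);
  /* The same word with its third byte zeroed: bit 23 of the mask fires.  */
  assert (DETECTNULL4 ((uint32_t) 0x64006261) != 0);
  return 0;
}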
[-- Attachment #3: bench-strcmp.out.orig --]
[-- Type: text/plain, Size: 11146 bytes --]
strcmp simple_strcmp stupid_strcmp
Length 1, alignment 1/ 1: 46.1875 41.375 124.203
Length 1, alignment 1/ 1: 22.375 26.25 68.5469
Length 1, alignment 1/ 1: 22.3281 24.125 72.8906
Length 2, alignment 2/ 2: 44.7656 45.625 108.781
Length 2, alignment 2/ 2: 32.0781 39.8906 107.375
Length 2, alignment 2/ 2: 31.7656 40.25 108.328
Length 3, alignment 3/ 3: 43.625 45.5156 107.094
Length 3, alignment 3/ 3: 50.2656 44.8125 105.703
Length 3, alignment 3/ 3: 50.8125 45.2656 106.625
Length 4, alignment 4/ 4: 49.625 49.1094 113.844
Length 4, alignment 4/ 4: 45.8438 44.1406 106.828
Length 4, alignment 4/ 4: 45.4688 43.7812 106.562
Length 5, alignment 5/ 5: 53.4844 54.4219 150.828
Length 5, alignment 5/ 5: 50.1875 48.1562 144.672
Length 5, alignment 5/ 5: 51.2344 48.3906 145.672
Length 6, alignment 6/ 6: 57.8281 57.8125 155.625
Length 6, alignment 6/ 6: 54.875 53.3125 149.531
Length 6, alignment 6/ 6: 54.9688 53.5156 148.156
Length 7, alignment 7/ 7: 63.2188 62.3438 147.422
Length 7, alignment 7/ 7: 60.2031 57.8594 142.656
Length 7, alignment 7/ 7: 60.1875 57.1562 140.984
Length 8, alignment 8/ 8: 67.2031 68.2188 155.672
Length 8, alignment 8/ 8: 66.7031 61.875 149.297
Length 8, alignment 8/ 8: 64.2344 62.8438 148.812
Length 9, alignment 9/ 9: 71.5 72.4375 191.797
Length 9, alignment 9/ 9: 68.8906 66.875 185.062
Length 9, alignment 9/ 9: 68.8906 66.4844 185.688
Length 10, alignment 10/10: 76.7812 77.375 195.672
Length 10, alignment 10/10: 74.3125 71.4688 189.203
Length 10, alignment 10/10: 73.2656 71.1406 189.547
Length 11, alignment 11/11: 80.875 81.6094 186.141
Length 11, alignment 11/11: 78.1875 76.0625 179.719
Length 11, alignment 11/11: 79.0625 75.5 181.125
Length 12, alignment 12/12: 85.5625 86.4062 175.469
Length 12, alignment 12/12: 83.2344 81.5469 169.344
Length 12, alignment 12/12: 83.9531 81.5312 169.375
Length 13, alignment 13/13: 90.5625 91.125 230.656
Length 13, alignment 13/13: 87.25 85.4062 224.688
Length 13, alignment 13/13: 87.4062 84.7656 224.75
Length 14, alignment 14/14: 94.8125 95.9219 212.281
Length 14, alignment 14/14: 92.2188 89.7812 206.172
Length 14, alignment 14/14: 92.25 90.625 206.516
Length 15, alignment 15/15: 98.8594 101.234 206.844
Length 15, alignment 15/15: 97.0312 94.8594 200.531
Length 15, alignment 15/15: 96.9844 94.8438 200.234
Length 16, alignment 16/16: 104.141 105.828 205.406
Length 16, alignment 16/16: 101.625 99.5781 199.531
Length 16, alignment 16/16: 102.938 99.8125 199.219
Length 17, alignment 17/17: 108.859 109.609 251.172
Length 17, alignment 17/17: 106.969 104.312 245.031
Length 17, alignment 17/17: 106.891 103.922 245.062
Length 18, alignment 18/18: 113.875 114.609 244.781
Length 18, alignment 18/18: 110.875 108.938 237.078
Length 18, alignment 18/18: 111.516 108.578 236.797
Length 19, alignment 19/19: 120.188 118.609 237.406
Length 19, alignment 19/19: 116.234 113.609 231.641
Length 19, alignment 19/19: 116.578 114.188 231.25
Length 20, alignment 20/20: 123.484 124.172 236.781
Length 20, alignment 20/20: 121.234 118.172 228.953
Length 20, alignment 20/20: 121.25 117.922 230.641
Length 21, alignment 21/21: 128.172 128.656 281.453
Length 21, alignment 21/21: 124.875 123.266 275.578
Length 21, alignment 21/21: 125.297 123.312 275.266
Length 22, alignment 22/22: 132.922 132.594 274.297
Length 22, alignment 22/22: 130.219 128.5 268.375
Length 22, alignment 22/22: 129.672 127.828 268.125
Length 23, alignment 23/23: 136.891 138.516 268.391
Length 23, alignment 23/23: 134.938 132.578 262.266
Length 23, alignment 23/23: 134.312 133.5 262.266
Length 24, alignment 24/24: 142.203 142.219 267.766
Length 24, alignment 24/24: 138.875 137.547 260.953
Length 24, alignment 24/24: 139.328 137.578 260.281
Length 25, alignment 25/25: 146.906 146.906 312.406
Length 25, alignment 25/25: 144.188 141.203 306.281
Length 25, alignment 25/25: 143.594 141.203 305.875
Length 26, alignment 26/26: 151.078 152.125 305.281
Length 26, alignment 26/26: 149.266 146.828 298.469
Length 26, alignment 26/26: 148.984 146.281 297.766
Length 27, alignment 27/27: 155.875 156.453 298.672
Length 27, alignment 27/27: 153.562 151.219 292.203
Length 27, alignment 27/27: 152.844 151.797 292.203
Length 28, alignment 28/28: 159.844 161.641 298.5
Length 28, alignment 28/28: 158.484 156.453 292.031
Length 28, alignment 28/28: 158.219 155.969 291.656
Length 29, alignment 29/29: 164.875 165.859 343.781
Length 29, alignment 29/29: 162.906 160.312 336.922
Length 29, alignment 29/29: 163.594 161.281 337.625
Length 30, alignment 30/30: 169.875 170.297 335.953
Length 30, alignment 30/30: 167.547 165.969 329.422
Length 30, alignment 30/30: 166.859 164.812 329.547
Length 31, alignment 31/31: 174.516 174.594 329.062
Length 31, alignment 31/31: 172.266 169.141 323.25
Length 31, alignment 31/31: 171.781 170.156 323.203
Length 4, alignment 0/ 0: 48.25 49.2031 113.25
Length 4, alignment 0/ 0: 48.8281 49.0781 111.969
Length 4, alignment 0/ 0: 46.3125 43.5469 106.125
Length 4, alignment 0/ 0: 45.5781 43.5 106.844
Length 4, alignment 0/ 0: 46.1875 43.4375 107.219
Length 4, alignment 0/ 0: 45.625 44.25 106.062
Length 4, alignment 0/ 1: 45.8594 44.9219 134
Length 4, alignment 1/ 2: 45.8281 43.4688 136.375
Length 8, alignment 0/ 0: 66.9375 67.4531 155.312
Length 8, alignment 0/ 0: 67.2812 67.0781 154.953
Length 8, alignment 0/ 0: 63.9375 62.1875 149.5
Length 8, alignment 0/ 0: 64.875 62.9062 148.078
Length 8, alignment 0/ 0: 64.9219 62.0469 148.578
Length 8, alignment 0/ 0: 64.6094 63.1406 149.25
Length 8, alignment 0/ 2: 65.1875 62.9219 168.969
Length 8, alignment 2/ 3: 64.8594 62.4062 163.953
Length 16, alignment 0/ 0: 105.125 104.797 205.422
Length 16, alignment 0/ 0: 104.266 104.938 205.422
Length 16, alignment 0/ 0: 102.406 100.516 200.375
Length 16, alignment 0/ 0: 101.938 99.625 199.656
Length 16, alignment 0/ 0: 102.562 99.4688 199.578
Length 16, alignment 0/ 0: 103.703 100.609 199.312
Length 16, alignment 0/ 3: 102.562 100.266 213.5
Length 16, alignment 3/ 4: 101.156 100.141 213.188
Length 32, alignment 0/ 0: 180.312 179.594 328.766
Length 32, alignment 0/ 0: 179.391 179.562 328.438
Length 32, alignment 0/ 0: 176.031 174.859 321.656
Length 32, alignment 0/ 0: 175.953 174.516 323.016
Length 32, alignment 0/ 0: 176.906 173.891 322.312
Length 32, alignment 0/ 0: 177.328 174.172 322.656
Length 32, alignment 0/ 4: 176.609 174.609 322
Length 32, alignment 4/ 5: 176.844 174.812 347.953
Length 64, alignment 0/ 0: 328.938 329.547 573.062
Length 64, alignment 0/ 0: 328.594 328.188 573.734
Length 64, alignment 0/ 0: 325.891 324.156 567.297
Length 64, alignment 0/ 0: 325.922 323.25 567.922
Length 64, alignment 0/ 0: 326.266 323.938 567.672
Length 64, alignment 0/ 0: 326.641 323.656 568.312
Length 64, alignment 0/ 5: 325.828 323.562 593.547
Length 64, alignment 5/ 6: 325.891 323.422 604.406
Length 128, alignment 0/ 0: 628 627.156 1064.38
Length 128, alignment 0/ 0: 626.594 626.781 1064.12
Length 128, alignment 0/ 0: 623.859 622.25 1063.12
Length 128, alignment 0/ 0: 624.453 622.25 1058.94
Length 128, alignment 0/ 0: 624.797 622.594 1059.06
Length 128, alignment 0/ 0: 624.516 622.25 1057.58
Length 128, alignment 0/ 6: 625.156 622.281 1075.31
Length 128, alignment 6/ 7: 624.484 621.562 1085.12
Length 256, alignment 0/ 0: 1362.39 1225.83 2046.11
Length 256, alignment 0/ 0: 1224.84 1225.56 2046.44
Length 256, alignment 0/ 0: 1221.45 1220.12 2039.59
Length 256, alignment 0/ 0: 1221.53 1219.48 2040.27
Length 256, alignment 0/ 0: 1222.94 1220.17 2040.31
Length 256, alignment 0/ 0: 1222.55 1219.78 2040.34
Length 256, alignment 0/ 7: 1222.16 1219.75 2053.31
Length 256, alignment 7/ 8: 1221.5 1220.16 2054.06
Length 512, alignment 0/ 0: 2421.22 2419.52 4009.11
Length 512, alignment 0/ 0: 2419.88 2420.09 4008.08
Length 512, alignment 0/ 0: 2416.55 2414.77 4003.3
Length 512, alignment 0/ 0: 2417.58 2413.47 4001.94
Length 512, alignment 0/ 0: 2417.2 2415.11 4004.34
Length 512, alignment 0/ 0: 2416.59 2414.73 4002.31
Length 512, alignment 0/ 8: 2416.45 2414.94 4002.61
Length 512, alignment 8/ 9: 2417.58 2414.09 4027.45
Length 1024, alignment 0/ 0: 4810.2 4809.06 7933.61
Length 1024, alignment 0/ 0: 4808.58 4809.42 8055.28
Length 1024, alignment 0/ 0: 4806.56 4803.86 7927.97
Length 1024, alignment 0/ 0: 4804.86 4804.42 7928.3
Length 1024, alignment 0/ 0: 4806.23 4803.39 7928.02
Length 1024, alignment 0/ 0: 4805.91 4803.56 7926.5
Length 1024, alignment 0/ 9: 4805.62 4803.2 7952.8
Length 1024, alignment 9/10: 4805.89 4803.16 7964.42
Length 16, alignment 1/ 2: 103.891 104.141 243.875
Length 16, alignment 2/ 1: 103.938 105.516 246.516
Length 16, alignment 1/ 2: 101.891 100.469 238.516
Length 16, alignment 2/ 1: 102.25 100.469 240.703
Length 16, alignment 1/ 2: 101.859 99.9219 237.312
Length 16, alignment 2/ 1: 101.5 99.8125 239.922
Length 32, alignment 2/ 4: 179.516 180.141 347.859
Length 32, alignment 4/ 2: 179.188 179.469 347.844
Length 32, alignment 2/ 4: 176.562 174.281 342.547
Length 32, alignment 4/ 2: 176.531 175.516 342.797
Length 32, alignment 2/ 4: 177.219 174.609 342.484
Length 32, alignment 4/ 2: 177.203 174.797 342.406
Length 64, alignment 3/ 6: 329.516 330.188 600.891
Length 64, alignment 6/ 3: 329.016 329.75 597.25
Length 64, alignment 3/ 6: 326.172 323.609 600.797
Length 64, alignment 6/ 3: 326.188 324.484 591.531
Length 64, alignment 3/ 6: 326.562 324.203 601.5
Length 64, alignment 6/ 3: 326.469 323.797 590.484
Length 128, alignment 4/ 8: 627.172 627.938 1064.44
Length 128, alignment 8/ 4: 626.125 731.984 1064.86
Length 128, alignment 4/ 8: 624.828 622.594 1063.17
Length 128, alignment 8/ 4: 624.828 622.609 1059.06
Length 128, alignment 4/ 8: 624.484 622.625 1058.72
Length 128, alignment 8/ 4: 624.109 622.641 1058.34
Length 256, alignment 5/10: 1225.45 1225.47 2083.92
Length 256, alignment 10/ 5: 1224.47 1226.22 2087.42
Length 256, alignment 5/10: 1222.14 1219.47 2077.34
Length 256, alignment 10/ 5: 1221.5 1219.91 2080.52
Length 256, alignment 5/10: 1221.81 1219.84 2076.56
Length 256, alignment 10/ 5: 1221.12 1220.78 2080.61
Length 512, alignment 6/12: 2419.5 2420.77 4028.08
Length 512, alignment 12/ 6: 2418.83 2420.06 4028.16
Length 512, alignment 6/12: 2416.11 2414.16 4021.72
Length 512, alignment 12/ 6: 2417.03 2415.11 4022.73
Length 512, alignment 6/12: 2417.12 2414.75 4022.12
Length 512, alignment 12/ 6: 2417.2 2415.16 4022.33
Length 1024, alignment 7/14: 4808.33 4809.05 7960.83
Length 1024, alignment 14/ 7: 4809.47 4809.05 7957.28
Length 1024, alignment 7/14: 4805.81 4951.92 7962
Length 1024, alignment 14/ 7: 4805.78 4803.53 7950.88
Length 1024, alignment 7/14: 4806.55 4803.22 7960.41
Length 1024, alignment 14/ 7: 4805.8 4802.88 7951.06
[-- Attachment #4: bench-strcmp.out.new --]
[-- Type: text/plain, Size: 11142 bytes --]
strcmp simple_strcmp stupid_strcmp
Length 1, alignment 1/ 1: 55.2812 41.8438 113.406
Length 1, alignment 1/ 1: 27.3438 25.2969 62.8906
Length 1, alignment 1/ 1: 27.5 25.1094 63.3906
Length 2, alignment 2/ 2: 38.0312 45.1406 108.219
Length 2, alignment 2/ 2: 30.4219 39.1406 107.109
Length 2, alignment 2/ 2: 30.9688 40.4688 108.469
Length 3, alignment 3/ 3: 50.0938 44.7656 106.375
Length 3, alignment 3/ 3: 51.2656 44.75 106.312
Length 3, alignment 3/ 3: 51.0625 44.8906 106.719
Length 4, alignment 4/ 4: 39.4219 49.8906 117.719
Length 4, alignment 4/ 4: 37.3125 43.1562 110.891
Length 4, alignment 4/ 4: 36.8906 43.6094 112.312
Length 5, alignment 5/ 5: 55.9844 52.7812 150.312
Length 5, alignment 5/ 5: 54.125 47.9062 144.844
Length 5, alignment 5/ 5: 55.0625 47.4219 143.781
Length 6, alignment 6/ 6: 58.4219 57.4375 154.719
Length 6, alignment 6/ 6: 45.5 53.1719 149.016
Length 6, alignment 6/ 6: 45.9688 53.8125 149.375
Length 7, alignment 7/ 7: 67.5938 62.4219 147.344
Length 7, alignment 7/ 7: 64.2188 57.4531 141.578
Length 7, alignment 7/ 7: 64.4688 57.0938 141.25
Length 8, alignment 8/ 8: 48.6875 66.9531 157.344
Length 8, alignment 8/ 8: 41.875 62.1406 151.938
Length 8, alignment 8/ 8: 42.7188 62.3906 152.25
Length 9, alignment 9/ 9: 77.4688 72.4688 192.094
Length 9, alignment 9/ 9: 80.4688 66.1094 185.609
Length 9, alignment 9/ 9: 75.4531 66.4531 185.578
Length 10, alignment 10/10: 64.7812 76.4375 195.203
Length 10, alignment 10/10: 59.25 71.1875 190.094
Length 10, alignment 10/10: 58.2344 71.1406 190.484
Length 11, alignment 11/11: 88.625 81.9375 187.469
Length 11, alignment 11/11: 86.1562 76.1094 181.047
Length 11, alignment 11/11: 91.4844 76.4688 182.156
Length 12, alignment 12/12: 60.5312 85.7969 175
Length 12, alignment 12/12: 58.1562 81.0469 169.906
Length 12, alignment 12/12: 58.3594 81.8594 169.625
Length 13, alignment 13/13: 98.5781 91.1406 230.172
Length 13, alignment 13/13: 97.4844 85.8125 225.062
Length 13, alignment 13/13: 96.5625 85.7656 225.031
Length 14, alignment 14/14: 76.1562 96.0469 213.531
Length 14, alignment 14/14: 69.7031 90.4844 207.406
Length 14, alignment 14/14: 68.7969 91.2344 206.75
Length 15, alignment 15/15: 109.188 100.203 206.219
Length 15, alignment 15/15: 106.953 94.5312 200.391
Length 15, alignment 15/15: 106.891 94.1094 200.094
Length 16, alignment 16/16: 65.8594 104.75 206.297
Length 16, alignment 16/16: 63.8281 99.2031 200.469
Length 16, alignment 16/16: 64.1094 99.5 199.812
Length 17, alignment 17/17: 120.031 109.219 251.234
Length 17, alignment 17/17: 118.375 104.109 244.812
Length 17, alignment 17/17: 117.484 104.547 244.797
Length 18, alignment 18/18: 86.9688 114.656 242.156
Length 18, alignment 18/18: 79.4688 109 237
Length 18, alignment 18/18: 78.9844 108.375 236.719
Length 19, alignment 19/19: 130.016 117.844 236.859
Length 19, alignment 19/19: 128.438 114.828 231.391
Length 19, alignment 19/19: 128.812 113.531 230.688
Length 20, alignment 20/20: 66.1875 123.094 236.031
Length 20, alignment 20/20: 64.5 118.109 229.516
Length 20, alignment 20/20: 64.375 117.984 230.234
Length 21, alignment 21/21: 142 127.859 280.531
Length 21, alignment 21/21: 138.906 123.562 274.703
Length 21, alignment 21/21: 139.266 122.797 275.047
Length 22, alignment 22/22: 97.6406 133.797 274.141
Length 22, alignment 22/22: 90.0781 128.359 268.438
Length 22, alignment 22/22: 89.75 127.078 267.297
Length 23, alignment 23/23: 152.266 137.812 268.234
Length 23, alignment 23/23: 150.125 131.797 262.078
Length 23, alignment 23/23: 150.516 132.125 261.734
Length 24, alignment 24/24: 72.125 142.766 267.281
Length 24, alignment 24/24: 70.1875 136.812 260.578
Length 24, alignment 24/24: 70.4844 136.812 260.172
Length 25, alignment 25/25: 163.188 146.5 311.484
Length 25, alignment 25/25: 160.766 141.188 306.391
Length 25, alignment 25/25: 161.109 142.906 306.375
Length 26, alignment 26/26: 108.141 153.156 303.953
Length 26, alignment 26/26: 101.594 146.234 298.984
Length 26, alignment 26/26: 101.562 146.453 298.625
Length 27, alignment 27/27: 173.906 155.875 298.5
Length 27, alignment 27/27: 171.172 150.906 292.688
Length 27, alignment 27/27: 171.547 152.344 293.109
Length 28, alignment 28/28: 78.5625 161.109 298.438
Length 28, alignment 28/28: 75.5938 156.234 291.938
Length 28, alignment 28/28: 76.4844 155.141 291.234
Length 29, alignment 29/29: 183.672 164.875 343.562
Length 29, alignment 29/29: 187.797 160.172 336.312
Length 29, alignment 29/29: 182.438 159.562 337.453
Length 30, alignment 30/30: 118.812 172.016 335.703
Length 30, alignment 30/30: 112.375 164.172 329.266
Length 30, alignment 30/30: 111.766 165.25 328.906
Length 31, alignment 31/31: 194.656 174.406 330.141
Length 31, alignment 31/31: 193.109 169.734 323.391
Length 31, alignment 31/31: 197.812 169.938 323.75
Length 4, alignment 0/ 0: 38.875 49.5938 118.312
Length 4, alignment 0/ 0: 38.6406 48.0781 117.594
Length 4, alignment 0/ 0: 38.3281 43.7344 112.297
Length 4, alignment 0/ 0: 37.7188 44.2031 112.281
Length 4, alignment 0/ 0: 38.375 43.6562 111.562
Length 4, alignment 0/ 0: 37.8281 44.25 111.688
Length 4, alignment 0/ 1: 49.0156 43.7969 132.766
Length 4, alignment 1/ 2: 48.7969 43.5156 137.234
Length 8, alignment 0/ 0: 47.7344 67.3906 159.422
Length 8, alignment 0/ 0: 48.2344 68.2031 158.438
Length 8, alignment 0/ 0: 41.25 63.5469 151.547
Length 8, alignment 0/ 0: 42.4531 62.1719 151.844
Length 8, alignment 0/ 0: 41.4688 62.1562 152.562
Length 8, alignment 0/ 0: 42.5469 62.1562 152.203
Length 8, alignment 0/ 2: 56.75 61.7969 168.031
Length 8, alignment 2/ 3: 75.2812 62.8125 164.594
Length 16, alignment 0/ 0: 65.5 106.109 206.344
Length 16, alignment 0/ 0: 65.625 104.938 206
Length 16, alignment 0/ 0: 64.5781 99.9062 200.25
Length 16, alignment 0/ 0: 63.3594 99.6094 199.812
Length 16, alignment 0/ 0: 64.2812 99.1562 199.172
Length 16, alignment 0/ 0: 63.4531 99.8281 198.75
Length 16, alignment 0/ 3: 112.844 100.438 213.266
Length 16, alignment 3/ 4: 113.141 99.3906 213.734
Length 32, alignment 0/ 0: 83.4688 178.984 329.078
Length 32, alignment 0/ 0: 82.9062 179.703 328.719
Length 32, alignment 0/ 0: 81.0156 174.141 321.469
Length 32, alignment 0/ 0: 81.0312 173.859 321.906
Length 32, alignment 0/ 0: 80.1719 174.5 425.094
Length 32, alignment 0/ 0: 82.1719 175.062 322.578
Length 32, alignment 0/ 4: 81.5625 174.844 322.266
Length 32, alignment 4/ 5: 197.641 174.484 348.094
Length 64, alignment 0/ 0: 124.781 328.891 573.891
Length 64, alignment 0/ 0: 125.703 329.562 574.234
Length 64, alignment 0/ 0: 124 323.562 568.922
Length 64, alignment 0/ 0: 123.469 322.922 568.438
Length 64, alignment 0/ 0: 123.391 324.016 567.812
Length 64, alignment 0/ 0: 124.141 323.969 568.109
Length 64, alignment 0/ 5: 368.688 323.5 593.297
Length 64, alignment 5/ 6: 369.141 323.75 604.578
Length 128, alignment 0/ 0: 211.188 627.578 1064.97
Length 128, alignment 0/ 0: 209.875 628.906 1064.69
Length 128, alignment 0/ 0: 209.312 622.891 1062.7
Length 128, alignment 0/ 0: 208.938 622.125 1058.53
Length 128, alignment 0/ 0: 208.938 622.547 1059.25
Length 128, alignment 0/ 0: 209 622.281 1058.88
Length 128, alignment 0/ 6: 372.406 622.109 1075.41
Length 128, alignment 6/ 7: 711.203 622.203 1086.06
Length 256, alignment 0/ 0: 381.953 1225.98 2045.91
Length 256, alignment 0/ 0: 381.203 1225.58 2046.22
Length 256, alignment 0/ 0: 380.703 1219.52 2040.84
Length 256, alignment 0/ 0: 379.312 1219.27 2040.48
Length 256, alignment 0/ 0: 379.281 1218.56 2040.47
Length 256, alignment 0/ 0: 379.953 1220.17 2040.84
Length 256, alignment 0/ 7: 1392.52 1219.81 2053.27
Length 256, alignment 7/ 8: 1393.08 1220.58 2053.02
Length 512, alignment 0/ 0: 724.828 2420.86 4009.06
Length 512, alignment 0/ 0: 722.562 2420.8 4008.62
Length 512, alignment 0/ 0: 721.953 2414.12 4002.53
Length 512, alignment 0/ 0: 720.938 2413.97 4002.19
Length 512, alignment 0/ 0: 721.422 2414.14 4002.23
Length 512, alignment 0/ 0: 721.016 2414.61 4002.89
Length 512, alignment 0/ 8: 720.969 2415.02 4001.16
Length 512, alignment 8/ 9: 2758.58 2414.36 4027.84
Length 1024, alignment 0/ 0: 1737.12 4809.94 7934.03
Length 1024, alignment 0/ 0: 1405.19 4808.14 7933.61
Length 1024, alignment 0/ 0: 1402.95 4803.48 7927.91
Length 1024, alignment 0/ 0: 1403.48 4803.72 7927.17
Length 1024, alignment 0/ 0: 1403.58 4804.3 7928.25
Length 1024, alignment 0/ 0: 1403.67 4802.84 7927.83
Length 1024, alignment 0/ 9: 5488.84 4803.41 7952.75
Length 1024, alignment 9/10: 5488.34 4803.36 7964.98
Length 16, alignment 1/ 2: 114.516 104.156 242.672
Length 16, alignment 2/ 1: 114.625 104.5 245.531
Length 16, alignment 1/ 2: 112.125 99.9062 236.859
Length 16, alignment 2/ 1: 112.469 100.516 240.578
Length 16, alignment 1/ 2: 113.234 99.5156 237.172
Length 16, alignment 2/ 1: 112.781 99.8906 240.547
Length 32, alignment 2/ 4: 124.156 179.531 347.328
Length 32, alignment 4/ 2: 123.797 179.766 348
Length 32, alignment 2/ 4: 116.953 175.125 341.688
Length 32, alignment 4/ 2: 117.516 174.422 341.141
Length 32, alignment 2/ 4: 116.781 173.172 341.25
Length 32, alignment 4/ 2: 116.672 174.203 341.172
Length 64, alignment 3/ 6: 371.234 329.906 599.672
Length 64, alignment 6/ 3: 370.562 329.531 596.078
Length 64, alignment 3/ 6: 368.891 322.844 601.031
Length 64, alignment 6/ 3: 374.469 323.875 600.531
Length 64, alignment 3/ 6: 374.281 324.594 591.125
Length 64, alignment 6/ 3: 373.844 323.875 600.578
Length 128, alignment 4/ 8: 210.625 627.438 1063.94
Length 128, alignment 8/ 4: 210.469 628.234 1065.33
Length 128, alignment 4/ 8: 209.859 622.984 1061.83
Length 128, alignment 8/ 4: 209.641 622.219 1057.14
Length 128, alignment 4/ 8: 209.078 622.609 1057.91
Length 128, alignment 8/ 4: 208.469 621.453 1058.62
Length 256, alignment 5/10: 1399.86 1225.31 2084.86
Length 256, alignment 10/ 5: 1394.7 1225.75 2086.88
Length 256, alignment 5/10: 1393.8 1220.42 2078.56
Length 256, alignment 10/ 5: 1397.73 1219.47 2080.16
Length 256, alignment 5/10: 1398.42 1220.52 2076.86
Length 256, alignment 10/ 5: 1392.72 1219.55 2080.2
Length 512, alignment 6/12: 1403.19 2419.84 4029.84
Length 512, alignment 12/ 6: 1403.45 2419.45 4028.77
Length 512, alignment 6/12: 1397.28 2414.45 4022.02
Length 512, alignment 12/ 6: 1396.98 2414.45 4021.92
Length 512, alignment 6/12: 1396.98 2414.42 4021.33
Length 512, alignment 12/ 6: 1396.44 2413.98 4021.95
Length 1024, alignment 7/14: 5491.94 4808.83 7959.73
Length 1024, alignment 14/ 7: 5490.12 4809.33 7956.56
Length 1024, alignment 7/14: 5627.53 4804.8 7952.16
Length 1024, alignment 14/ 7: 5488.05 4802.39 7960.66
Length 1024, alignment 7/14: 5488.42 4803.78 7951.44
Length 1024, alignment 14/ 7: 5488.7 4803.83 7959.84
* Re: [Patch, mips] Faster strcmp for mips
2013-11-14 21:25 [Patch, mips] Faster strcmp for mips Steve Ellcey
@ 2013-11-14 23:14 ` Ondřej Bílka
2013-11-15 18:21 ` Steve Ellcey
2013-11-15 18:48 ` Carlos O'Donell
2015-02-19 7:25 ` Matt Turner
2 siblings, 1 reply; 11+ messages in thread
From: Ondřej Bílka @ 2013-11-14 23:14 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Thu, Nov 14, 2013 at 01:23:41PM -0800, Steve Ellcey wrote:
>
> I have created an optimized strcmp routine for MIPS and would like to check
> it in. I verified it by running 'make check' and I also ran 'make bench'
> to check the performance. This function is slightly slower than the current
> generic strcmp for strings that are not aligned, but much faster when strings
> are aligned because it will then load and compare either 2, 4, or 8 bytes at a
> time.
>
The problem is that aligned accesses may account for only a third of total
calls. Also, around 90% of calls need to check only the first 16 bytes.
These numbers differ by application; on x64 they are the rule rather than
the exception. I collected data from running gcc+gnuplot here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/strcmp_profile/results_gcc/result.html
> This means it could be loading bytes beyond the end of the strings being
> compared but it looks like other architecture specific strcmp functions
> are also doing this optimization and the newlib version of strcmp also does
> this.
>
Also, the unaligned case can be made faster with a bit of extra effort.
How fast are unaligned loads on MIPS? They should be faster than
assembling the value manually with shifts from two loads, which in turn
should be faster than byte-by-byte checking.
What is most important for performance is to handle the first bytes
quickly. You need to check several alternatives: one is to unroll the
checking of the first 8-16 bytes; a second is to do a single vector check.
Then you need to implement a loop; what I did on x64 is to align one
argument and load the second as unaligned.
You need to precompute when an unaligned access would cross a page
boundary and switch to a byte-by-byte loop when that happens.
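For the record, a rough C sketch of the two-loads-plus-shifts idea
(little-endian assumed; illustrative only, not code from this thread, and
like everything word-based here it may read a few bytes past the string):

#include <stdint.h>

/* Emulate an unaligned 8-byte load with two aligned loads and shifts
   (little-endian byte order assumed).  */
static inline uint64_t
load64_shifted (const char *p)
{
  uintptr_t off = (uintptr_t) p & 7;
  const uint64_t *w = (const uint64_t *) ((uintptr_t) p - off);
  if (off == 0)
    return w[0];
  /* Low bytes come from the first word, high bytes from the second.  */
  return (w[0] >> (8 * off)) | (w[1] << (8 * (8 - off)));
}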
Here is modified code that I used to generate the x64 implementation; I
hope it will be useful.
You need to change TEST (a, b) into code that checks 8 bytes starting at a
and b.
You may also try to unroll these loops to do 16/32 bytes per iteration.
int
strcmp_new (char *a, char *b)
{
  int i;
  int or = (((unsigned long) a) | ((unsigned long) b)) & 4095;
  if (or <= 4096 - 8)
    {
      TEST (a, b);
    }
  else
    {
      for (i = 0; i < 8; i++)
        {
          if (!a[i] || (a[i] - b[i]))
            return a[i] - b[i];
        }
    }
  char *a_al = ALIGN_DOWN (a + 8, 8);
  int dif = a_al - a;
  a += dif;
  b += dif;
  unsigned long b_next = ((unsigned long) b) & (4096 - 1);
  unsigned long next_page = (4096 - b_next) / 8;
  while (1)
    {
      if (__builtin_expect (next_page-- == 0, 0))
        {
          next_page = 4096 / 8;
          next_page--;
          for (i = 0; i < 8; i++)
            {
              if (!a[i] || (a[i] - b[i]))
                return a[i] - b[i];
            }
        }
      TEST (a, b);
      a += 8;
      b += 8;
    }
}
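For illustration, one possible shape for such a TEST on a 64-bit
little-endian target — a sketch reusing the patch's DETECTNULL8 and GCC's
__builtin_ffsll, anticipating the branchless tail described just below;
this is not Ondřej's actual x64 code:

#include <stdint.h>
#include <string.h>

/* Check 8 bytes at a and b; return from the enclosing function once a
   difference or the terminating null is found (little-endian).  */
#define TEST(a, b)                                              \
  do                                                            \
    {                                                           \
      uint64_t x_, y_;                                          \
      memcpy (&x_, (a), 8);                                     \
      memcpy (&y_, (b), 8);                                     \
      uint64_t m_ = DETECTNULL8 (x_) | (x_ ^ y_);               \
      if (m_ != 0)                                              \
        {                                                       \
          /* Lowest set bit marks the first diff or null.  */   \
          int pos_ = (__builtin_ffsll (m_) - 1) / 8;            \
          return ((unsigned char *) (a))[pos_]                  \
                 - ((unsigned char *) (b))[pos_];               \
        }                                                       \
    }                                                           \
  while (0)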
Also, here:
> +  bx.v = x; by.v = y;
> +  if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
> +    return bx.b.B0 - by.b.B0;
> +  if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
> +    return bx.b.B1 - by.b.B1;
> +  if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
> +    return bx.b.B2 - by.b.B2;
> +  if (bx.b.B3 - by.b.B3 != 0 || bx.b.B3 == 0)
> +    return bx.b.B3 - by.b.B3;
> +  if (bx.b.B4 - by.b.B4 != 0 || bx.b.B4 == 0)
> +    return bx.b.B4 - by.b.B4;
> +  if (bx.b.B5 - by.b.B5 != 0 || bx.b.B5 == 0)
> +    return bx.b.B5 - by.b.B5;
> +  if (bx.b.B6 - by.b.B6 != 0 || bx.b.B6 == 0)
> +    return bx.b.B6 - by.b.B6;
> +  return bx.b.B7 - by.b.B7;
There is an alternative way to find the answer without branching.
You need to compute a number that has a nonzero i-th byte if the i-th
bytes of x and y differ or are 0. Then find the first nonzero bit, which
can be done quickly by a CLZ instruction.
This could be implemented as follows:
uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
uint64_t bitfirst = bitmask ^ (bitmask - 1);
int pos = (ffsl(bitfirst) - 1) / 8;
return a[pos] - b[pos];
* Re: [Patch, mips] Faster strcmp for mips
2013-11-14 23:14 ` Ondřej Bílka
@ 2013-11-15 18:21 ` Steve Ellcey
2013-11-15 19:02 ` Ondřej Bílka
0 siblings, 1 reply; 11+ messages in thread
From: Steve Ellcey @ 2013-11-15 18:21 UTC (permalink / raw)
To: Ondřej Bílka; +Cc: libc-ports
On Fri, 2013-11-15 at 00:14 +0100, Ondřej Bílka wrote:
> The problem is that aligned accesses may account for only a third of total
> calls. Also, around 90% of calls need to check only the first 16 bytes.
> These numbers differ by application; on x64 they are the rule rather than
> the exception. I collected data from running gcc+gnuplot here:
> http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/strcmp_profile/results_gcc/result.html
Very interesting; you have obviously spent a lot more time than I have
on strcmp.
> > This means it could be loading bytes beyond the end of the strings being
> > compared but it looks like other architecture specific strcmp functions
> > are also doing this optimization and the newlib version of strcmp also does
> > this.
> >
> Also, the unaligned case can be made faster with a bit of extra effort.
> How fast are unaligned loads on MIPS? They should be faster than
> assembling the value manually with shifts from two loads, which in turn
> should be faster than byte-by-byte checking.
MIPS doesn't have unaligned loads, but it has 'partial' loads where two
loads can give you four (or eight) bytes in two load operations
regardless of alignment. I may need to look at this some more, though
that will probably mean going to assembly language instead of C.
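As a possible intermediate step before assembly: GCC can often be persuaded
to emit the lwl/lwr (or ldl/ldr) pairs itself when the misaligned access
goes through a packed struct — a sketch relying on GCC extensions, not code
from this thread:

#include <stdint.h>

struct unaligned32
{
  uint32_t w;
} __attribute__ ((packed, may_alias));

/* On pre-R6 MIPS, GCC has typically lowered this access to an
   lwl/lwr pair.  */
static inline uint32_t
load32_partial (const char *p)
{
  return ((const struct unaligned32 *) p)->w;
}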
> Also, here:
> > +  bx.v = x; by.v = y;
> > +  if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
> > +    return bx.b.B0 - by.b.B0;
> > +  if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
> > +    return bx.b.B1 - by.b.B1;
> > +  if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
> > +    return bx.b.B2 - by.b.B2;
> > +  if (bx.b.B3 - by.b.B3 != 0 || bx.b.B3 == 0)
> > +    return bx.b.B3 - by.b.B3;
> > +  if (bx.b.B4 - by.b.B4 != 0 || bx.b.B4 == 0)
> > +    return bx.b.B4 - by.b.B4;
> > +  if (bx.b.B5 - by.b.B5 != 0 || bx.b.B5 == 0)
> > +    return bx.b.B5 - by.b.B5;
> > +  if (bx.b.B6 - by.b.B6 != 0 || bx.b.B6 == 0)
> > +    return bx.b.B6 - by.b.B6;
> > +  return bx.b.B7 - by.b.B7;
>
> There is an alternative way to find the answer without branching.
> You need to compute a number that has a nonzero i-th byte if the i-th
> bytes of x and y differ or are 0. Then find the first nonzero bit, which
> can be done quickly by a CLZ instruction.
>
> This could be implemented as follows:
>
> uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
> uint64_t bitfirst = bitmask ^ (bitmask - 1);
> int pos = (ffsl(bitfirst) - 1) / 8;
> return a[pos] - b[pos];
This looks very interesting. I tried just plugging it in, but it didn't
work in some cases. I need to stare at this some more to understand
exactly how it should work and what cases are failing for me.
Steve Ellcey
sellcey@mips.com
* Re: [Patch, mips] Faster strcmp for mips
2013-11-14 21:25 [Patch, mips] Faster strcmp for mips Steve Ellcey
2013-11-14 23:14 ` Ondřej Bílka
@ 2013-11-15 18:48 ` Carlos O'Donell
2013-11-15 18:53 ` Steve Ellcey
2015-02-19 7:25 ` Matt Turner
2 siblings, 1 reply; 11+ messages in thread
From: Carlos O'Donell @ 2013-11-15 18:48 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Thu, Nov 14, 2013 at 4:23 PM, Steve Ellcey <sellcey@mips.com> wrote:
> This means it could be loading bytes beyond the end of the strings being
> compared but it looks like other architecture specific strcmp functions
> are also doing this optimization and the newlib version of strcmp also does
> this.
I thought that doing so was dangerous? I'm pretty sure we've been trying
to fix such "load bytes beyond the end of the string" issues because you
could have a string that straddles a page boundary with the next page
unmapped and such an optimized routine would fault on a read from the
unmapped page.
How do you plan to fix that?
Cheers,
Carlos.
* Re: [Patch, mips] Faster strcmp for mips
2013-11-15 18:48 ` Carlos O'Donell
@ 2013-11-15 18:53 ` Steve Ellcey
2013-11-15 18:59 ` Carlos O'Donell
0 siblings, 1 reply; 11+ messages in thread
From: Steve Ellcey @ 2013-11-15 18:53 UTC (permalink / raw)
To: Carlos O'Donell; +Cc: libc-ports
On Fri, 2013-11-15 at 13:47 -0500, Carlos O'Donell wrote:
> On Thu, Nov 14, 2013 at 4:23 PM, Steve Ellcey <sellcey@mips.com> wrote:
> > This means it could be loading bytes beyond the end of the strings being
> > compared but it looks like other architecture specific strcmp functions
> > are also doing this optimization and the newlib version of strcmp also does
> > this.
>
> I thought that doing so was dangerous? I'm pretty sure we've been trying
> to fix such "load bytes beyond the end of the string" issues because you
> could have a string that straddles a page boundary with the next page
> unmapped and such an optimized routine would fault on a read from the
> unmapped page.
>
> How do you plan to fix that?
>
> Cheers,
> Carlos.
I think the assumption has been that if an aligned word contains at
least one byte of the string, it is OK to read that entire word. I
don't think a single aligned word (4 or 8 bytes depending on the
architecture) can straddle a page boundary. If the word was unaligned,
then I think it could straddle a page boundary and you could get into
trouble.
Steve Ellcey
sellcey@mips.com
* Re: [Patch, mips] Faster strcmp for mips
2013-11-15 18:53 ` Steve Ellcey
@ 2013-11-15 18:59 ` Carlos O'Donell
0 siblings, 0 replies; 11+ messages in thread
From: Carlos O'Donell @ 2013-11-15 18:59 UTC (permalink / raw)
To: Steve Ellcey, Carlos O'Donell; +Cc: libc-ports
On 11/15/2013 01:52 PM, Steve Ellcey wrote:
> On Fri, 2013-11-15 at 13:47 -0500, Carlos O'Donell wrote:
>> On Thu, Nov 14, 2013 at 4:23 PM, Steve Ellcey <sellcey@mips.com> wrote:
>>> This means it could be loading bytes beyond the end of the strings being
>>> compared but it looks like other architecture specific strcmp functions
>>> are also doing this optimization and the newlib version of strcmp also does
>>> this.
>>
>> I thought that doing so was dangerous? I'm pretty sure we've been trying
>> to fix such "load bytes beyond the end of the string" issues because you
>> could have a string that straddles a page boundary with the next page
>> unmapped and such an optimized routine would fault on a read from the
>> unmapped page.
>>
>> How do you plan to fix that?
>>
>> Cheers,
>> Carlos.
>
> I think the assumption has been that if an aligned word contains at
> least one byte of the string, it is OK to read that entire word. I
> don't think a single aligned word (4 or 8 bytes depending on the
> architecture) can straddle a page boundary. If the word was unaligned,
> then I think it could straddle a page boundary and you could get into
> trouble.
That is obviously correct because the aligned word itself wouldn't
cross the page boundary. My only comment here is that reading
past the end of the string is a dangerous operation unless you take
care to ensure it doesn't cross a page boundary. Given that you've
clarified that it's only ever an aligned word load, that can't
possibly cross a page boundary, so you're safe.
Sorry for the noise.
Cheers,
Carlos.
* Re: [Patch, mips] Faster strcmp for mips
2013-11-15 18:21 ` Steve Ellcey
@ 2013-11-15 19:02 ` Ondřej Bílka
2013-11-18 23:50 ` Steve Ellcey
0 siblings, 1 reply; 11+ messages in thread
From: Ondřej Bílka @ 2013-11-15 19:02 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Fri, Nov 15, 2013 at 10:20:04AM -0800, Steve Ellcey wrote:
On Fri, 2013-11-15 at 00:14 +0100, Ondřej Bílka wrote:
>
> > The problem is that aligned accesses may account for only a third of
> > total calls. Also, around 90% of calls need to check only the first 16
> > bytes. These numbers differ by application; on x64 they are the rule
> > rather than the exception. I collected data from running gcc+gnuplot here:
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/strcmp_profile/results_gcc/result.html
>
> Very interesting; you have obviously spent a lot more time than I have
> on strcmp.
>
> > > This means it could be loading bytes beyond the end of the strings being
> > > compared but it looks like other architecture specific strcmp functions
> > > are also doing this optimization and the newlib version of strcmp also does
> > > this.
> > >
> > Also, the unaligned case can be made faster with a bit of extra effort.
> > How fast are unaligned loads on MIPS? They should be faster than
> > assembling the value manually with shifts from two loads, which in turn
> > should be faster than byte-by-byte checking.
>
> MIPS doesn't have unaligned loads, but it has 'partial' loads where two
> loads can give you four (or eight) bytes in two load operations
> regardless of alignment. I may need to look at this some more, though
> that will probably mean going to assembly language instead of C.
>
You can also first try this in C by using shifts.
>
> > Also, here:
> > > +  bx.v = x; by.v = y;
> > > +  if (bx.b.B0 - by.b.B0 != 0 || bx.b.B0 == 0)
> > > +    return bx.b.B0 - by.b.B0;
> > > +  if (bx.b.B1 - by.b.B1 != 0 || bx.b.B1 == 0)
> > > +    return bx.b.B1 - by.b.B1;
> > > +  if (bx.b.B2 - by.b.B2 != 0 || bx.b.B2 == 0)
> > > +    return bx.b.B2 - by.b.B2;
> > > +  if (bx.b.B3 - by.b.B3 != 0 || bx.b.B3 == 0)
> > > +    return bx.b.B3 - by.b.B3;
> > > +  if (bx.b.B4 - by.b.B4 != 0 || bx.b.B4 == 0)
> > > +    return bx.b.B4 - by.b.B4;
> > > +  if (bx.b.B5 - by.b.B5 != 0 || bx.b.B5 == 0)
> > > +    return bx.b.B5 - by.b.B5;
> > > +  if (bx.b.B6 - by.b.B6 != 0 || bx.b.B6 == 0)
> > > +    return bx.b.B6 - by.b.B6;
> > > +  return bx.b.B7 - by.b.B7;
> >
> > There is an alternative way to find the answer without branching.
> > You need to compute a number that has a nonzero i-th byte if the i-th
> > bytes of x and y differ or are 0. Then find the first nonzero bit,
> > which can be done quickly by a CLZ instruction.
> >
> > This could be implemented as follows:
> >
> > uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
> > uint64_t bitfirst = bitmask ^ (bitmask - 1);
> > int pos = (ffsl(bitfirst) - 1) / 8;
> > return a[pos] - b[pos];
>
> This looks very interesting. I tried just plugging it in, but it didn't
> work in some cases. I need to stare at this some more to understand
> exactly how it should work and what cases are failing for me.
>
I wrote that late at night and my first idea was to use __builtin_clz,
which would yield the following code.
uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
uint64_t bitfirst = bitmask ^ (bitmask - 1);
int pos = (63 - __builtin_clzl(bitfirst)) / 8;
return a[pos] - b[pos];
I decided that using ffsl was shorter, but for some reason I kept
bitfirst there. A correct version is:
uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
int pos = (ffsl(bitmask) - 1) / 8;
return a[pos] - b[pos];
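Assembled into a standalone helper, the corrected little-endian tail might
look like this (a sketch: DETECTNULL8 is the macro from the patch,
__builtin_ffsll stands in for the ffsl library call, and a/b point at the
bytes from which x and y were loaded):

#include <stdint.h>

#define DETECTNULL8(X) (((X) - 0x0101010101010101) \
                        & ~((X) | 0x7f7f7f7f7f7f7f7f))

static int
final_cmp_le (uint64_t x, uint64_t y,
              const unsigned char *a, const unsigned char *b)
{
  /* Nonzero byte lanes mark a difference or a terminating null.  */
  uint64_t bitmask = DETECTNULL8 (x) | (x ^ y);
  int pos = (__builtin_ffsll (bitmask) - 1) / 8;
  return a[pos] - b[pos];
}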
* Re: [Patch, mips] Faster strcmp for mips
2013-11-15 19:02 ` Ondřej Bílka
@ 2013-11-18 23:50 ` Steve Ellcey
2013-11-19 2:32 ` Ondřej Bílka
0 siblings, 1 reply; 11+ messages in thread
From: Steve Ellcey @ 2013-11-18 23:50 UTC (permalink / raw)
To: Ondřej Bílka; +Cc: libc-ports
On Fri, 2013-11-15 at 20:02 +0100, Ondřej Bílka wrote:
> I decided that using ffsl was shorter, but for some reason I kept
> bitfirst there. A correct version is:
>
> uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
> int pos = (ffsl(bitmask) - 1) / 8;
> return a[pos] - b[pos];
Yes, that works much better. But it only works in little-endian mode. I
think I would need an fls (find last set) or something similar for
big-endian, wouldn't I? Or else I would need to swap the bytes around
before using ffs/ffsl.
Steve Ellcey
sellcey@mips.com
* Re: [Patch, mips] Faster strcmp for mips
2013-11-18 23:50 ` Steve Ellcey
@ 2013-11-19 2:32 ` Ondřej Bílka
0 siblings, 0 replies; 11+ messages in thread
From: Ondřej Bílka @ 2013-11-19 2:32 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Mon, Nov 18, 2013 at 03:37:58PM -0800, Steve Ellcey wrote:
> On Fri, 2013-11-15 at 20:02 +0100, Ondřej Bílka wrote:
>
> > I decided that using ffsl was shorter, but for some reason I kept
> > bitfirst there. A correct version is:
> >
> > uint64_t bitmask = DETECTNULL8(x) | (x ^ y);
> > int pos = (ffsl(bitmask) - 1) / 8;
> > return a[pos] - b[pos];
>
> Yes, that works much better. But it only works in little-endian mode. I
> think I would need an fls (find last set) or something similar for
> big-endian, wouldn't I? Or else I would need to swap the bytes around
> before using ffs/ffsl.
>
Yes, the correct function is __builtin_clzl. The difference from ffs is
that when you pass zero the result is undefined, which should not be a
problem here.
There are more builtins here:
http://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
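So for big-endian the same tail would count leading rather than trailing
zeros — a sketch along those lines (reusing DETECTNULL8 from above;
__builtin_clzll is spelled here for a guaranteed 64-bit operand, and
bitmask is nonzero at this point by construction, avoiding the undefined
zero case):

/* Big-endian: the first byte in memory occupies the most significant
   bit lane, so locate the highest set bit instead of the lowest.  */
static int
final_cmp_be (uint64_t x, uint64_t y,
              const unsigned char *a, const unsigned char *b)
{
  uint64_t bitmask = DETECTNULL8 (x) | (x ^ y);
  int pos = __builtin_clzll (bitmask) / 8;
  return a[pos] - b[pos];
}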
* Re: [Patch, mips] Faster strcmp for mips
2013-11-14 21:25 [Patch, mips] Faster strcmp for mips Steve Ellcey
2013-11-14 23:14 ` Ondřej Bílka
2013-11-15 18:48 ` Carlos O'Donell
@ 2015-02-19 7:25 ` Matt Turner
2015-02-19 18:12 ` Steve Ellcey
2 siblings, 1 reply; 11+ messages in thread
From: Matt Turner @ 2015-02-19 7:25 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On Thu, Nov 14, 2013 at 1:23 PM, Steve Ellcey <sellcey@mips.com> wrote:
>
> I have created an optimized strcmp routine for MIPS and would like to check
> it in. I verified it by running 'make check' and I also ran 'make bench'
> to check the performance. This function is slightly slower than the current
> generic strcmp for strings that are not aligned, but much faster when strings
> are aligned because it will then load and compare either 2, 4, or 8 bytes at a
> time.
>
> This means it could be loading bytes beyond the end of the strings being
> compared but it looks like other architecture specific strcmp functions
> are also doing this optimization and the newlib version of strcmp also does
> this.
>
> OK to checkin?
>
> I have attached the original and new bench-strcmp.out files in case anyone
> wants to compare the old and new strcmp performance results.
>
> Steve Ellcey
> sellcey@mips.com
>
>
> 2013-11-14 Steve Ellcey <sellcey@mips.com>
>
> * sysdeps/mips/strcmp.c: New.
>
>
Doesn't seem that this was ever committed?
* Re: [Patch, mips] Faster strcmp for mips
2015-02-19 7:25 ` Matt Turner
@ 2015-02-19 18:12 ` Steve Ellcey
0 siblings, 0 replies; 11+ messages in thread
From: Steve Ellcey @ 2015-02-19 18:12 UTC (permalink / raw)
To: Matt Turner; +Cc: libc-ports
On Wed, 2015-02-18 at 23:24 -0800, Matt Turner wrote:
> > 2013-11-14 Steve Ellcey <sellcey@mips.com>
> >
> > * sysdeps/mips/strcmp.c: New.
> >
> >
>
> Doesn't seem that this was ever committed?
That specific patch wasn't checked in, but a later patch with an assembly
language version of MIPS strcmp was committed instead.
From glibc ChangeLog:
2014-10-01 Steve Ellcey <sellcey@mips.com>
* sysdeps/mips/strcmp.S: New.
Steve Ellcey
sellcey@imgtec.com
P.S. The libc-ports mailing list is no longer being used; the libc-alpha
or libc-help mailing lists should be used instead.