public inbox for libc-ports@sourceware.org
 help / color / mirror / Atom feed
* [patch, mips] Improved memset for MIPS
@ 2013-09-05 17:06 Steve Ellcey 
  2013-09-06  0:40 ` Mike Frysinger
                   ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Steve Ellcey  @ 2013-09-05 17:06 UTC (permalink / raw)
  To: libc-ports

I would like to update the MIPS memset routine to include many of the
improvements I made to memcpy earlier.  These include better prefetching
and more loop unrolling for better performance.  Like with memcpy I use
ifdefs so it can be compiled in 32 or 64 bit modes and so I also remove
the old 64bit specific version of memset.S with this patch.

Tested with the glibc and gcc testsuites and by doing some standalone
performance measurements.

OK to checkin?

Steve Ellcey
sellcey@mips.com



2013-09-05  Steve Ellcey  <sellcey@mips.com>

	* sysdeps/mips/memset.S: Change prefetching and add loop unrolling. 
	* sysdeps/mips/mips64/memset.S: Remove.



diff --git a/ports/sysdeps/mips/memset.S b/ports/sysdeps/mips/memset.S
index 85062fe..c7e5507 100644
--- a/ports/sysdeps/mips/memset.S
+++ b/ports/sysdeps/mips/memset.S
@@ -1,6 +1,5 @@
-/* Copyright (C) 2002-2013 Free Software Foundation, Inc.
+/* Copyright (C) 2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -16,70 +15,357 @@
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifdef ANDROID_CHANGES
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _LIBC
 #include <sysdep.h>
+#include <regdef.h>
+#include <sys/asm.h>
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _COMPILING_NEWLIB
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#else
+#include <regdef.h>
+#include <sys/asm.h>
+#endif
 
-	.set	nomips16
+/* Check to see if the MIPS architecture we are compiling for supports
+ * prefetching.
+ */
+
+#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
+#ifndef DISABLE_PREFETCH
+#define USE_PREFETCH
+#endif
+#endif
+
+#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
+#ifndef DISABLE_DOUBLE
+#define USE_DOUBLE
+#endif
+#endif
+
+#ifndef USE_DOUBLE
+#ifndef DISABLE_DOUBLE_ALIGN
+#define DOUBLE_ALIGN
+#endif
+#endif
+
+/* Some asm.h files do not have the L macro definition.  */
+#ifndef L
+#if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+#else
+# define L(label) .L ## label
+#endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
+#ifndef PTR_ADDIU
+#ifdef USE_DOUBLE
+#define PTR_ADDIU	daddiu
+#else
+#define PTR_ADDIU	addiu
+#endif
+#endif
+
+/*
+ * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_HINT_STORE
+ * or PREFETCH_HINT_STORE_STREAMED offers a large performance advantage
+ * but PREPAREFORSTORE has some special restrictions to consider.
+ *
+ * Prefetch with the 'prepare for store' hint does not copy a memory
+ * location into the cache, it just allocates a cache line and zeros
+ * it out.  This means that if you do not write to the entire cache
+ * line before writing it out to memory some data will get zero'ed out
+ * when the cache line is written back to memory and data will be lost.
+ *
+ * There are ifdef'ed sections of this memset to make sure that it does not
+ * do prefetches on cache lines that are not going to be completely written.
+ * This code is only needed and only used when PREFETCH_STORE_HINT is set to
+ * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
+ * less than MAX_PREFETCH_SIZE bytes and if the cache line is larger it will
+ * not work correctly.
+ */
+
+#ifdef USE_PREFETCH
+# define PREFETCH_HINT_STORE		1
+# define PREFETCH_HINT_STORE_STREAMED	5
+# define PREFETCH_HINT_STORE_RETAINED	7
+# define PREFETCH_HINT_PREPAREFORSTORE	30
+
+/*
+ * If we have not picked out what hints to use at this point use the
+ * standard load and store prefetch hints.
+ */
+#ifndef PREFETCH_STORE_HINT
+# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
+#endif
+
+/*
+ * We double everything when USE_DOUBLE is true so we do 2 prefetches to
+ * get 64 bytes in that case.  The assumption is that each individual
+ * prefetch brings in 32 bytes.
+ */
+#ifdef USE_DOUBLE
+# define PREFETCH_CHUNK 64
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
+ pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
+#else
+# define PREFETCH_CHUNK 32
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*32(reg)
+#endif
+
+/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
+ * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
+ * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
+ * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
+ * used then MAX_PREFETCH_SIZE does not matter.  */
+#define MAX_PREFETCH_SIZE 128
+/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
+ * than 5 on a STORE prefetch and that a single prefetch can never be larger
+ * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
+ * we actually do two prefetches in that case, one 32 bytes after the other.  */
+#ifdef USE_DOUBLE
+# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
+#else
+# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
+#endif
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
+    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
+/* We cannot handle this because the initial prefetches may fetch bytes that
+ * are before the buffer being copied.  We start copies with an offset
+ * of 4 so avoid this situation when using PREPAREFORSTORE.  */
+#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
+#endif
+#else /* USE_PREFETCH not defined */
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef MEMSET_NAME
+#define MEMSET_NAME memset
+#endif
 
-/* void *memset(void *s, int c, size_t n).  */
+/* We load/store 64 bits at a time when USE_DOUBLE is true.
+ * The C_ prefix stands for CHUNK and is used to avoid macro name
+ * conflicts with system header files.  */
 
+#ifdef USE_DOUBLE
+#  define C_ST	sd
 #if __MIPSEB
-# define SWHI	swl		/* high part is left in big-endian	*/
+#  define C_STHI	sdl	/* high part is left in big-endian	*/
 #else
-# define SWHI	swr		/* high part is right in little-endian	*/
+#  define C_STHI	sdr	/* high part is right in little-endian	*/
+#endif
+#else
+#  define C_ST	sw
+#if __MIPSEB
+#  define C_STHI	swl	/* high part is left in big-endian	*/
+#else
+#  define C_STHI	swr	/* high part is right in little-endian	*/
+#endif
 #endif
 
-ENTRY (memset)
+/* Bookkeeping values for 32 vs. 64 bit mode.  */
+#ifdef USE_DOUBLE
+#  define NSIZE 8
+#  define NSIZEMASK 0x3f
+#  define NSIZEDMASK 0x7f
+#else
+#  define NSIZE 4
+#  define NSIZEMASK 0x1f
+#  define NSIZEDMASK 0x3f
+#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
+
+#ifdef ANDROID_CHANGES
+LEAF(MEMSET_NAME,0)
+#else
+LEAF(MEMSET_NAME)
+#endif
+
+	.set	nomips16
 	.set	noreorder
+/*
+ * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
+ * size, copy dst pointer to v0 for the return value.
+ */
+	slti	t2,a2,(2 * NSIZE)
+	bne	t2,zero,L(lastb)
+	move	v0,a0
 
-	slti	t1, a2, 8		# Less than 8?
-	bne	t1, zero, L(last8)
-	move	v0, a0			# Setup exit value before too late
-
-	beq	a1, zero, L(ueven)	# If zero pattern, no need to extend
-	andi	a1, 0xff		# Avoid problems with bogus arguments
-	sll	t0, a1, 8
-	or	a1, t0
-	sll	t0, a1, 16
-	or	a1, t0			# a1 is now pattern in full word
-
-L(ueven):
-	subu	t0, zero, a0		# Unaligned address?
-	andi	t0, 0x3
-	beq	t0, zero, L(chkw)
-	subu	a2, t0
-	SWHI	a1, 0(a0)		# Yes, handle first unaligned part
-	addu	a0, t0			# Now both a0 and a2 are updated
+/*
+ * If memset value is not zero, we copy it to all the bytes in a 32 or 64
+ * bit word.
+ */
+	beq	a1,zero,L(set0)		/* If memset value is zero no smear  */
+	PTR_SUBU a3,zero,a0
+	nop
 
+	/* smear byte into 32 or 64 bit word */
+#if (__mips==32) && (__mips_isa_rev>=2)
+	ins     a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
+	ins     a1, a1, 16, 16      /* Replicate fill byte into word.       */
+#ifdef USE_DOUBLE
+	dins	a1, a1, 32, 32      /* Replicate fill byte into dbl word.   */
+#endif
+#else
+	sll	t2,a1,8
+	or	a1,t2
+	sll	t2,a1,16
+	or	a1,t2
+#ifdef USE_DOUBLE
+	dsll	t2,a1,32
+	or	a1,t2
+#endif
+#endif
+
+/*
+ * If the destination address is not aligned do a partial store to get it
+ * aligned.  If it is already aligned just jump to L(aligned).
+ */
+L(set0):
+	andi	t2,a3,(NSIZE-1)		/* word-unaligned address?          */
+	beq	t2,zero,L(aligned)	/* t2 is the unalignment count      */
+	PTR_SUBU a2,a2,t2
+	C_STHI	a1,0(a0)
+	PTR_ADDU a0,a0,t2
+
+L(aligned):
+/*
+ * If USE_DOUBLE is not set we may still want to align the data on an 8
+ * byte boundary instead of a 4 byte boundary to maximize the opportunity
+ * of proAptive chips to do memory bonding (combining two sequential 4
+ * byte stores into one 8 byte store).  We know there are at least 4 bytes
+ * left to store or we would have jumped to L(lastb) earlier in the code.
+ */
+#ifdef DOUBLE_ALIGN
+	andi	t2,a3,4
+	beq	t2,zero,L(double_aligned)
+	PTR_SUBU a2,a2,t2
+	sw	a1,0(a0)
+	PTR_ADDU a0,a0,t2
+L(double_aligned):
+#endif
+
+/*
+ * Now the destination is aligned to (word or double word) aligned address
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dest pointer after all the 64/128 byte
+ * chunks have been copied.  We will loop, incrementing a0 until it equals a3.
+ */
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
+
+/* When in the loop we may prefetch with the 'prepare for store' hint.  In
+ * that case a prefetch must not touch a cache line that extends past the
+ * end of the buffer, so t9 is set to t0 - PREFETCH_LIMIT, where t0 is the
+ * "past the end" address.  Once a0 is greater than t9 the loop skips the
+ * prefetches at chunk offsets 4 and 5.
+ */
+#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
+	PREFETCH_FOR_STORE (1, a0)
+	PREFETCH_FOR_STORE (2, a0)
+	PREFETCH_FOR_STORE (3, a0)
+#endif
+L(loop16w):
+#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
+	bgtz	v1,L(skip_pref)
+	nop
+#endif
+	PREFETCH_FOR_STORE (4, a0)
+	PREFETCH_FOR_STORE (5, a0)
+L(skip_pref):
+	C_ST	a1,UNIT(0)(a0)
+	C_ST	a1,UNIT(1)(a0)
+	C_ST	a1,UNIT(2)(a0)
+	C_ST	a1,UNIT(3)(a0)
+	C_ST	a1,UNIT(4)(a0)
+	C_ST	a1,UNIT(5)(a0)
+	C_ST	a1,UNIT(6)(a0)
+	C_ST	a1,UNIT(7)(a0)
+	C_ST	a1,UNIT(8)(a0)
+	C_ST	a1,UNIT(9)(a0)
+	C_ST	a1,UNIT(10)(a0)
+	C_ST	a1,UNIT(11)(a0)
+	C_ST	a1,UNIT(12)(a0)
+	C_ST	a1,UNIT(13)(a0)
+	C_ST	a1,UNIT(14)(a0)
+	C_ST	a1,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+	bne	a0,a3,L(loop16w)
+	nop
+	move	a2,t8
+
+/*
+ * Here we have dest word-aligned but less than 64-bytes or 128 bytes to go.
+ * Check for a 32(64) byte chunk and copy it if there is one.  Otherwise
+ * jump down to L(chk1w) to handle the tail end of the copy.
+ */
 L(chkw):
-	andi	t0, a2, 0x7		# Enough left for one loop iteration?
-	beq	t0, a2, L(chkl)
-	subu	a3, a2, t0
-	addu	a3, a0			# a3 is last loop address +1
-	move	a2, t0			# a2 is now # of bytes left after loop
-L(loopw):
-	addiu	a0, 8			# Handle 2 words pr. iteration
-	sw	a1, -8(a0)
-	bne	a0, a3, L(loopw)
-	sw	a1, -4(a0)
-
-L(chkl):
-	andi	t0, a2, 0x4		# Check if there is at least a full
-	beq	t0, zero, L(last8)	#  word remaining after the loop
-	subu	a2, t0
-	sw	a1, 0(a0)		# Yes...
-	addiu	a0, 4
-
-L(last8):
-	blez	a2, L(exit)		# Handle last 8 bytes (if cnt>0)
-	addu	a3, a2, a0		# a3 is last address +1
-L(lst8l):
-	addiu	a0, 1
-	bne	a0, a3, L(lst8l)
-	sb	a1, -1(a0)
-L(exit):
-	j	ra			# Bye, bye
+	andi	t8,a2,NSIZEMASK	/* is there a 32-byte/64-byte chunk.  */
+				/* the t8 is the reminder count past 32-bytes */
+	beq	a2,t8,L(chk1w)/* when a2==t8, no 32-byte chunk */
+	nop
+	C_ST	a1,UNIT(0)(a0)
+	C_ST	a1,UNIT(1)(a0)
+	C_ST	a1,UNIT(2)(a0)
+	C_ST	a1,UNIT(3)(a0)
+	C_ST	a1,UNIT(4)(a0)
+	C_ST	a1,UNIT(5)(a0)
+	C_ST	a1,UNIT(6)(a0)
+	C_ST	a1,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+
+/*
+ * Here we have less than 32(64) bytes to set.  Set up for a loop to
+ * copy one word (or double word) at a time.  Set a2 to count how many
+ * bytes we have to copy after all the word (or double word) chunks are
+ * copied and a3 to the dest pointer after all the (d)word chunks have
+ * been copied.  We will loop, incrementing a0 until a0 equals a3.
+ */
+L(chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
+	beq	a2,t8,L(lastb)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8 byte chunks) */
+L(wordCopy_loop):
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(wordCopy_loop)
+	C_ST	a1,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+	blez	a2,L(leave)
+	PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
+L(lastbloop):
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(lastbloop)
+	sb	a1,-1(a0)
+L(leave):
+	j	ra
 	nop
 
+	.set	at
 	.set	reorder
-END (memset)
-libc_hidden_builtin_def (memset)
+END(MEMSET_NAME)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMSET_NAME)
+#endif
+#endif

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-05 17:06 [patch, mips] Improved memset for MIPS Steve Ellcey 
@ 2013-09-06  0:40 ` Mike Frysinger
  2013-09-06 15:42   ` Steve Ellcey
  2013-09-06  4:18 ` Carlos O'Donell
  2013-09-06 14:31 ` Joseph S. Myers
  2 siblings, 1 reply; 32+ messages in thread
From: Mike Frysinger @ 2013-09-06  0:40 UTC (permalink / raw)
  To: libc-ports; +Cc: Steve Ellcey 

[-- Attachment #1: Type: Text/Plain, Size: 621 bytes --]

On Thursday 05 September 2013 13:05:43 Steve Ellcey wrote:
> --- a/ports/sysdeps/mips/memset.S
> +++ b/ports/sysdeps/mips/memset.S
> @@ -1,6 +1,5 @@
> -/* Copyright (C) 2002-2013 Free Software Foundation, Inc.
> +/* Copyright (C) 2013 Free Software Foundation, Inc.

err, that's not generally how it works ... we extend the years, but don't 
delete them.

> -   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.

what'd he ever do to you ? :p

> +#ifdef ANDROID_CHANGES

i wouldn't think we'd normally accept this kind of stuff, but i guess it's 
already been done with the mips memcpy ...
-mike

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-05 17:06 [patch, mips] Improved memset for MIPS Steve Ellcey 
  2013-09-06  0:40 ` Mike Frysinger
@ 2013-09-06  4:18 ` Carlos O'Donell
  2013-09-06 16:03   ` Steve Ellcey
  2013-09-06 14:31 ` Joseph S. Myers
  2 siblings, 1 reply; 32+ messages in thread
From: Carlos O'Donell @ 2013-09-06  4:18 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On 09/05/2013 01:05 PM, Steve Ellcey wrote:
> I would like to update the MIPS memset routine to include many of the
> improvements I made to memcpy earlier.  These include better prefetching
> and more loop unrolling for better performance.  Like with memset I use
> ifdefs so it can be compiled in 32 or 64 bit modes and so I also remove
> the old 64bit specific version of memset.S with this patch.
> 
> Tested with the glibc and gcc testsuites and by doing some standalone
> performance measurements.
> OK to checkin?

Two things really: 

(a) Testing details?

Could you please elaborate more on "some standalone performance 
measurements?"

What specific benchmarks did you run?

What does the glibc microbenchmark show about your changes? Do they
show a benefit?

Steve, I trust your experience with MIPS, but I'd like to see all 
of us drive a little more detail into these performance related
patches. I'm also curious if the microbenchmark shows a performance
progression. The glibc community is trying hard to add some objectivity
to our performance measurements, prevent performance regressions, and
use the tests to experiment with new implementations.

(b) the code formatting isn't in line with the project requirements.

...

> 2013-09-05  Steve Ellcey  <sellcey@mips.com>
> 
> 	* sysdeps/mips/memset.S: Change prefetching and add loop unrolling. 
> 	* sysdeps/mips/mips64/memset.S: Remove.
> 
> 
> 
> diff --git a/ports/sysdeps/mips/memset.S b/ports/sysdeps/mips/memset.S
> index 85062fe..c7e5507 100644
> --- a/ports/sysdeps/mips/memset.S
> +++ b/ports/sysdeps/mips/memset.S
> @@ -1,6 +1,5 @@
> -/* Copyright (C) 2002-2013 Free Software Foundation, Inc.
> +/* Copyright (C) 2013 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
> -   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
>  
>     The GNU C Library is free software; you can redistribute it and/or
>     modify it under the terms of the GNU Lesser General Public
> @@ -16,70 +15,357 @@
>     License along with the GNU C Library.  If not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> +#ifdef ANDROID_CHANGES
> +#include "machine/asm.h"
> +#include "machine/regdef.h"
> +#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#elif _LIBC
>  #include <sysdep.h>
> +#include <regdef.h>
> +#include <sys/asm.h>
> +#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#elif _COMPILING_NEWLIB
> +#include "machine/asm.h"
> +#include "machine/regdef.h"
> +#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#else
> +#include <regdef.h>
> +#include <sys/asm.h>
> +#endif

This doesn't meet glibc's coding standards and you didn't provide any
rationale for not meeting the standards e.g. shared file amongst multiple
implementations.

See:
https://sourceware.org/glibc/wiki/Style_and_Conventions

Particularly:
https://sourceware.org/glibc/wiki/Style_and_Conventions#Nested_C_Preprocessor_Directives

I won't repeat this review comment for the other similarly formatted cpp defines.

> -	.set	nomips16
> +/* Check to see if the MIPS architecture we are compiling for supports
> + * prefetching.
> + */

These comments are not GNU style, they should be:

/* Line one.
   Line two.
   Line three.  */

I won't repeat this review comment for the other similarly formatted comments.

> +
> +#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
> +#ifndef DISABLE_PREFETCH
> +#define USE_PREFETCH
> +#endif
> +#endif
> +
> +#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
> +#ifndef DISABLE_DOUBLE
> +#define USE_DOUBLE
> +#endif
> +#endif
> +
> +#ifndef USE_DOUBLE
> +#ifndef DISABLE_DOUBLE_ALIGN
> +#define DOUBLE_ALIGN
> +#endif
> +#endif
> +
> +/* Some asm.h files do not have the L macro definition.  */
> +#ifndef L
> +#if _MIPS_SIM == _ABIO32
> +# define L(label) $L ## label
> +#else
> +# define L(label) .L ## label
> +#endif
> +#endif
> +
> +/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
> +#ifndef PTR_ADDIU
> +#ifdef USE_DOUBLE
> +#define PTR_ADDIU	daddiu
> +#else
> +#define PTR_ADDIU	addiu
> +#endif
> +#endif
> +
> +/*
> + * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
> + * or PREFETCH_STORE_STREAMED offers a large performance advantage
> + * but PREPAREFORSTORE has some special restrictions to consider.
> + *
> + * Prefetch with the 'prepare for store' hint does not copy a memory
> + * location into the cache, it just allocates a cache line and zeros
> + * it out.  This means that if you do not write to the entire cache
> + * line before writing it out to memory some data will get zero'ed out
> + * when the cache line is written back to memory and data will be lost.
> + *
> + * There are ifdef'ed sections of this memcpy to make sure that it does not
> + * do prefetches on cache lines that are not going to be completely written.
> + * This code is only needed and only used when PREFETCH_STORE_HINT is set to
> + * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
> + * less than MAX_PREFETCH_SIZE bytes and if the cache line is larger it will
> + * not work correctly.
> + */
> +
> +#ifdef USE_PREFETCH
> +# define PREFETCH_HINT_STORE		1
> +# define PREFETCH_HINT_STORE_STREAMED	5
> +# define PREFETCH_HINT_STORE_RETAINED	7
> +# define PREFETCH_HINT_PREPAREFORSTORE	30

Not obvious that the endif for this is much later, which is why
cpp nesting is useful. You started using nesting here, but not
consistently.

> +
> +/*
> + * If we have not picked out what hints to use at this point use the
> + * standard load and store prefetch hints.
> + */
> +#ifndef PREFETCH_STORE_HINT
> +# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
> +#endif
> +
> +/*
> + * We double everything when USE_DOUBLE is true so we do 2 prefetches to
> + * get 64 bytes in that case.  The assumption is that each individual
> + * prefetch brings in 32 bytes.
> + */
> +#ifdef USE_DOUBLE
> +# define PREFETCH_CHUNK 64
> +# define PREFETCH_FOR_STORE(chunk, reg) \
> + pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
> + pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
> +#else
> +# define PREFETCH_CHUNK 32
> +# define PREFETCH_FOR_STORE(chunk, reg) \
> + pref PREFETCH_STORE_HINT, (chunk)*32(reg)
> +#endif
> +
> +/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
> + * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
> + * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
> + * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
> + * used then MAX_PREFETCH_SIZE does not matter.  */
> +#define MAX_PREFETCH_SIZE 128
> +/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
> + * than 5 on a STORE prefetch and that a single prefetch can never be larger
> + * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
> + * we actually do two prefetches in that case, one 32 bytes after the other.  */
> +#ifdef USE_DOUBLE
> +# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
> +#else
> +# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
> +#endif
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
> +    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
> +/* We cannot handle this because the initial prefetches may fetch bytes that
> + * are before the buffer being copied.  We start copies with an offset
> + * of 4 so avoid this situation when using PREPAREFORSTORE.  */
> +#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
> +#endif
> +#else /* USE_PREFETCH not defined */
> +# define PREFETCH_FOR_STORE(offset, reg)
> +#endif
> +
> +/* Allow the routine to be named something else if desired.  */
> +#ifndef MEMSET_NAME
> +#define MEMSET_NAME memset
> +#endif
>  
> -/* void *memset(void *s, int c, size_t n).  */
> +/* We load/store 64 bits at a time when USE_DOUBLE is true.
> + * The C_ prefix stands for CHUNK and is used to avoid macro name
> + * conflicts with system header files.  */
>  
> +#ifdef USE_DOUBLE
> +#  define C_ST	sd

It should be one space for the nesting.

>  #if __MIPSEB
> -# define SWHI	swl		/* high part is left in big-endian	*/
> +#  define C_STHI	sdl	/* high part is left in big-endian	*/
>  #else
> -# define SWHI	swr		/* high part is right in little-endian	*/
> +#  define C_STHI	sdr	/* high part is right in little-endian	*/
> +#endif
> +#else
> +#  define C_ST	sw
> +#if __MIPSEB
> +#  define C_STHI	swl	/* high part is left in big-endian	*/
> +#else
> +#  define C_STHI	swr	/* high part is right in little-endian	*/
> +#endif
>  #endif
>  
> -ENTRY (memset)
> +/* Bookkeeping values for 32 vs. 64 bit mode.  */
> +#ifdef USE_DOUBLE
> +#  define NSIZE 8
> +#  define NSIZEMASK 0x3f
> +#  define NSIZEDMASK 0x7f
> +#else
> +#  define NSIZE 4
> +#  define NSIZEMASK 0x1f
> +#  define NSIZEDMASK 0x3f
> +#endif
> +#define UNIT(unit) ((unit)*NSIZE)
> +#define UNITM1(unit) (((unit)*NSIZE)-1)
> +
> +#ifdef ANDROID_CHANGES
> +LEAF(MEMSET_NAME,0)
> +#else
> +LEAF(MEMSET_NAME)
> +#endif
> +
> +	.set	nomips16
>  	.set	noreorder
> +/*
> + * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
> + * size, copy dst pointer to v0 for the return value.
> + */
> +	slti	t2,a2,(2 * NSIZE)
> +	bne	t2,zero,L(lastb)
> +	move	v0,a0
>  
> -	slti	t1, a2, 8		# Less than 8?
> -	bne	t1, zero, L(last8)
> -	move	v0, a0			# Setup exit value before too late
> -
> -	beq	a1, zero, L(ueven)	# If zero pattern, no need to extend
> -	andi	a1, 0xff		# Avoid problems with bogus arguments
> -	sll	t0, a1, 8
> -	or	a1, t0
> -	sll	t0, a1, 16
> -	or	a1, t0			# a1 is now pattern in full word
> -
> -L(ueven):
> -	subu	t0, zero, a0		# Unaligned address?
> -	andi	t0, 0x3
> -	beq	t0, zero, L(chkw)
> -	subu	a2, t0
> -	SWHI	a1, 0(a0)		# Yes, handle first unaligned part
> -	addu	a0, t0			# Now both a0 and a2 are updated
> +/*
> + * If memset value is not zero, we copy it to all the bytes in a 32 or 64
> + * bit word.
> + */
> +	beq	a1,zero,L(set0)		/* If memset value is zero no smear  */
> +	PTR_SUBU a3,zero,a0
> +	nop
>  
> +	/* smear byte into 32 or 64 bit word */
> +#if (__mips==32) && (__mips_isa_rev>=2)
> +	ins     a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
> +	ins     a1, a1, 16, 16      /* Replicate fill byte into word.       */
> +#ifdef USE_DOUBLE
> +	dins	a1, a1, 32, 32      /* Replicate fill byte into dbl word.   */
> +#endif
> +#else
> +	sll	t2,a1,8
> +	or	a1,t2
> +	sll	t2,a1,16
> +	or	a1,t2
> +#ifdef USE_DOUBLE
> +	dsll	t2,a1,32
> +	or	a1,t2
> +#endif
> +#endif
> +
> +/*
> + * If the destination address is not aligned do a partial store to get it
> + * aligned.  If it is already aligned just jump to L(aligned).
> + */
> +L(set0):
> +	andi	t2,a3,(NSIZE-1)		/* word-unaligned address?          */
> +	beq	t2,zero,L(aligned)	/* t2 is the unalignment count      */
> +	PTR_SUBU a2,a2,t2
> +	C_STHI	a1,0(a0)
> +	PTR_ADDU a0,a0,t2
> +
> +L(aligned):
> +/*
> + * If USE_DOUBLE is not set we may still want to align the data on a 16
> + * byte boundry instead of an 8 byte boundry to maximize the opportunity
> + * of proAptive chips to do memory bonding (combining two sequential 4
> + * byte stores into one 8 byte store).  We know there are at least 4 bytes
> + * left to store or we would have jumped to L(lastb) earlier in the code.
> + */
> +#ifdef DOUBLE_ALIGN
> +	andi	t2,a3,4
> +	beq	t2,zero,L(double_aligned)
> +	PTR_SUBU a2,a2,t2
> +	sw	a1,0(a0)
> +	PTR_ADDU a0,a0,t2
> +L(double_aligned):
> +#endif
> +
> +/*
> + * Now the destination is aligned to (word or double word) aligned address
> + * Set a2 to count how many bytes we have to copy after all the 64/128 byte
> + * chunks are copied and a3 to the dest pointer after all the 64/128 byte
> + * chunks have been copied.  We will loop, incrementing a0 until it equals a3.
> + */
> +	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
> +	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
> +	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
> +	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
> +
> +/* When in the loop we may prefetch with the 'prepare to store' hint,
> + * in this case the a0+x should not be past the "t0-32" address.  This
> + * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
> + * for x=64 the last "safe" a0 address is "t0-96" In the current version we
> + * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
> + */
> +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
> +	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
> +	PREFETCH_FOR_STORE (1, a0)
> +	PREFETCH_FOR_STORE (2, a0)
> +	PREFETCH_FOR_STORE (3, a0)
> +#endif
> +L(loop16w):
> +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
> +	bgtz	v1,L(skip_pref)
> +	nop
> +#endif
> +	PREFETCH_FOR_STORE (4, a0)
> +	PREFETCH_FOR_STORE (5, a0)
> +L(skip_pref):
> +	C_ST	a1,UNIT(0)(a0)
> +	C_ST	a1,UNIT(1)(a0)
> +	C_ST	a1,UNIT(2)(a0)
> +	C_ST	a1,UNIT(3)(a0)
> +	C_ST	a1,UNIT(4)(a0)
> +	C_ST	a1,UNIT(5)(a0)
> +	C_ST	a1,UNIT(6)(a0)
> +	C_ST	a1,UNIT(7)(a0)
> +	C_ST	a1,UNIT(8)(a0)
> +	C_ST	a1,UNIT(9)(a0)
> +	C_ST	a1,UNIT(10)(a0)
> +	C_ST	a1,UNIT(11)(a0)
> +	C_ST	a1,UNIT(12)(a0)
> +	C_ST	a1,UNIT(13)(a0)
> +	C_ST	a1,UNIT(14)(a0)
> +	C_ST	a1,UNIT(15)(a0)
> +	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
> +	bne	a0,a3,L(loop16w)
> +	nop
> +	move	a2,t8
> +
> +/*
> + * Here we have dest word-aligned but less than 64-bytes or 128 bytes to go.
> + * Check for a 32(64) byte chunk and copy if if there is one.  Otherwise
> + * jump down to L(chk1w) to handle the tail end of the copy.
> + */
>  L(chkw):
> -	andi	t0, a2, 0x7		# Enough left for one loop iteration?
> -	beq	t0, a2, L(chkl)
> -	subu	a3, a2, t0
> -	addu	a3, a0			# a3 is last loop address +1
> -	move	a2, t0			# a2 is now # of bytes left after loop
> -L(loopw):
> -	addiu	a0, 8			# Handle 2 words pr. iteration
> -	sw	a1, -8(a0)
> -	bne	a0, a3, L(loopw)
> -	sw	a1, -4(a0)
> -
> -L(chkl):
> -	andi	t0, a2, 0x4		# Check if there is at least a full
> -	beq	t0, zero, L(last8)	#  word remaining after the loop
> -	subu	a2, t0
> -	sw	a1, 0(a0)		# Yes...
> -	addiu	a0, 4
> -
> -L(last8):
> -	blez	a2, L(exit)		# Handle last 8 bytes (if cnt>0)
> -	addu	a3, a2, a0		# a3 is last address +1
> -L(lst8l):
> -	addiu	a0, 1
> -	bne	a0, a3, L(lst8l)
> -	sb	a1, -1(a0)
> -L(exit):
> -	j	ra			# Bye, bye
> +	andi	t8,a2,NSIZEMASK	/* is there a 32-byte/64-byte chunk.  */
> +				/* the t8 is the reminder count past 32-bytes */
> +	beq	a2,t8,L(chk1w)/* when a2==t8, no 32-byte chunk */
> +	nop
> +	C_ST	a1,UNIT(0)(a0)
> +	C_ST	a1,UNIT(1)(a0)
> +	C_ST	a1,UNIT(2)(a0)
> +	C_ST	a1,UNIT(3)(a0)
> +	C_ST	a1,UNIT(4)(a0)
> +	C_ST	a1,UNIT(5)(a0)
> +	C_ST	a1,UNIT(6)(a0)
> +	C_ST	a1,UNIT(7)(a0)
> +	PTR_ADDIU a0,a0,UNIT(8)
> +
> +/*
> + * Here we have less than 32(64) bytes to set.  Set up for a loop to
> + * copy one word (or double word) at a time.  Set a2 to count how many
> + * bytes we have to copy after all the word (or double word) chunks are
> + * copied and a3 to the dest pointer after all the (d)word chunks have
> + * been copied.  We will loop, incrementing a0 until a0 equals a3.
> + */
> +L(chk1w):
> +	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
> +	beq	a2,t8,L(lastb)
> +	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
> +	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
> +
> +/* copying in words (4-byte or 8 byte chunks) */
> +L(wordCopy_loop):
> +	PTR_ADDIU a0,a0,UNIT(1)
> +	bne	a0,a3,L(wordCopy_loop)
> +	C_ST	a1,UNIT(-1)(a0)
> +
> +/* Copy the last 8 (or 16) bytes */
> +L(lastb):
> +	blez	a2,L(leave)
> +	PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
> +L(lastbloop):
> +	PTR_ADDIU a0,a0,1
> +	bne	a0,a3,L(lastbloop)
> +	sb	a1,-1(a0)
> +L(leave):
> +	j	ra
>  	nop
>  
> +	.set	at
>  	.set	reorder
> -END (memset)
> -libc_hidden_builtin_def (memset)
> +END(MEMSET_NAME)
> +#ifndef ANDROID_CHANGES
> +#ifdef _LIBC
> +libc_hidden_builtin_def (MEMSET_NAME)
> +#endif
> +#endif
> 

Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-05 17:06 [patch, mips] Improved memset for MIPS Steve Ellcey 
  2013-09-06  0:40 ` Mike Frysinger
  2013-09-06  4:18 ` Carlos O'Donell
@ 2013-09-06 14:31 ` Joseph S. Myers
  2013-09-06 15:58   ` Steve Ellcey
  2 siblings, 1 reply; 32+ messages in thread
From: Joseph S. Myers @ 2013-09-06 14:31 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On Thu, 5 Sep 2013, Steve Ellcey  wrote:

> Tested with the glibc and gcc testsuites and by doing some standalone
> performance measurements.

Has the glibc testsuite been run without regressions for all six 
combinations of (o32, n32, n64) with (big-endian, little-endian)?

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06  0:40 ` Mike Frysinger
@ 2013-09-06 15:42   ` Steve Ellcey
  0 siblings, 0 replies; 32+ messages in thread
From: Steve Ellcey @ 2013-09-06 15:42 UTC (permalink / raw)
  To: Mike Frysinger; +Cc: libc-ports

On Thu, 2013-09-05 at 20:40 -0400, Mike Frysinger wrote:
> On Thursday 05 September 2013 13:05:43 Steve Ellcey wrote:
> > --- a/ports/sysdeps/mips/memset.S
> > +++ b/ports/sysdeps/mips/memset.S
> > @@ -1,6 +1,5 @@
> > -/* Copyright (C) 2002-2013 Free Software Foundation, Inc.
> > +/* Copyright (C) 2013 Free Software Foundation, Inc.
> 
> err, that's not generally how it works ... we extend the years, but don't 
> delete them.

I replaced the entire file.  While the diff makes it look like some
stuff is not replaced, the standard copyright text is about the only
thing not completely changed.  There are some blank lines and maybe a
couple of assembly language pseudo-ops that happen to match up by chance
but that is pure coincidence.

> 
> > -   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
> 
> what'd he ever do to you ? :p

Nothing, but I didn't use any of the old code.

> > +#ifdef ANDROID_CHANGES
> 
> i wouldn't think we'd normally accept this kind of stuff, but i guess it's 
> already been done with the mips memcpy ...
> -mike

Yes, I am trying to have a common memset, memcpy, and maybe some other
routines between glibc, newlib, and android/bionic.

Steve Ellcey
sellcey@mips.com



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 14:31 ` Joseph S. Myers
@ 2013-09-06 15:58   ` Steve Ellcey
  2013-09-06 16:09     ` Joseph S. Myers
  0 siblings, 1 reply; 32+ messages in thread
From: Steve Ellcey @ 2013-09-06 15:58 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: libc-ports

On Fri, 2013-09-06 at 14:30 +0000, Joseph S. Myers wrote:
> On Thu, 5 Sep 2013, Steve Ellcey  wrote:
> 
> > Tested with the glibc and gcc testsuites and by doing some standalone
> > performance measurements.
> 
> Has the glibc testsuite been run without regressions for all six 
> combinations of (o32, n32, n64) with (big-endian, little-endian)?

No.  I did most of my testing outside of the glibc testsuite because I
find the glibc testsuite difficult to run, see
https://sourceware.org/ml/libc-help/2013-08/msg00040.html for some of my
problems/questions.  I don't believe I have ever managed to do a clean
glibc testsuite run even in a clean tree with no local changes.  I did a
glibc testsuite run in o32 little-endian mode and did not see any
failures that looked like regressions from what I saw with a clean tree.
I did the rest of my testing outside of that.

Steve Ellcey
sellcey@mips.com


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06  4:18 ` Carlos O'Donell
@ 2013-09-06 16:03   ` Steve Ellcey
  2013-09-06 17:12     ` Carlos O'Donell
  0 siblings, 1 reply; 32+ messages in thread
From: Steve Ellcey @ 2013-09-06 16:03 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: libc-ports

On Fri, 2013-09-06 at 00:18 -0400, Carlos O'Donell wrote:

> Two things really: 
> 
> (a) Testing details?
> 
> Could you please elaborate more on "some standalone performance 
> measurements?"
> 
> What specific benchmarks did you run?

Basically, I just wrote and used a test program that does a bunch of
memset's.  Nothing fancy or very intricate.

> What does the glibc microbenchmark show about your changes? Do they
> show a benefit?

I didn't try this, but I can.  Is there anything on the glibc web page
about how to run this benchmark?  Does it happen as part of the standard
'make check'?

> 
> Steve, I trust your experience with MIPS, but I'd like to see all 
> of us drive a little more detail into these performance related
> patches. I'm also curious if the microbenchmark shows a performance
> progression. The glibc community is trying hard to add some objectivity
> to our performance measurements, prevent performance regressions, and
> use the tests to experiment with new implementations.

That sounds reasonable.  I just need a bit of help on where this is and
how to run it.

> (b) the code formatting isn't in line with the project requirements.

I'll fix these up and resubmit when I have the changes (and some more
performance data).

Steve Ellcey
sellcey@mips.com


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 15:58   ` Steve Ellcey
@ 2013-09-06 16:09     ` Joseph S. Myers
  2013-09-06 16:50       ` Steve Ellcey
  2013-09-06 16:59       ` Steve Ellcey
  0 siblings, 2 replies; 32+ messages in thread
From: Joseph S. Myers @ 2013-09-06 16:09 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On Fri, 6 Sep 2013, Steve Ellcey wrote:

> On Fri, 2013-09-06 at 14:30 +0000, Joseph S. Myers wrote:
> > On Thu, 5 Sep 2013, Steve Ellcey  wrote:
> > 
> > > Tested with the glibc and gcc testsuites and by doing some standalone
> > > performance measurements.
> > 
> > Has the glibc testsuite been run without regressions for all six 
> > combinations of (o32, n32, n64) with (big-endian, little-endian)?
> 
> No.  I did most of my testing outside of the glibc testsuite because I
> find the glibc testsuite difficult to run, see
> https://sourceware.org/ml/libc-help/2013-08/msg00040.html for some of my
> problems/questions.  I don't believe I have ever managed to do a clean

You'll need to debug the problems as they indicate something wrong with 
your build environment.  It's always advised to configure glibc with 
--prefix=/usr rather than some other prefix (but there is no requirement 
that the dynamic linker actually be installed during testing, you can 
ignore the -dynamic-linker= path), and your other error indicates some 
inconsistency regarding NO_CTORS_DTORS_SECTIONS.

If you see more failures than are described at 
<https://sourceware.org/glibc/wiki/Release/2.18>, you should investigate 
them as well.

The expectation is that the glibc testsuite is the normal way to test 
patches before submission, and string function patches like this need it 
to be run for all six relevant ABI variants.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 16:09     ` Joseph S. Myers
@ 2013-09-06 16:50       ` Steve Ellcey
  2013-09-06 16:59         ` Joseph S. Myers
  2013-09-06 16:59       ` Steve Ellcey
  1 sibling, 1 reply; 32+ messages in thread
From: Steve Ellcey @ 2013-09-06 16:50 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: libc-ports

On Fri, 2013-09-06 at 16:09 +0000, Joseph S. Myers wrote:

> > No.  I did most of my testing outside of the glibc testsuite because I
> > find the glibc testsuite difficult to run, see
> > https://sourceware.org/ml/libc-help/2013-08/msg00040.html for some of my
> > problems/questions.  I don't believe I have ever managed to do a clean
> 
> You'll need to debug the problems as they indicate something wrong with 
> your build environment.  It's always advised to configure glibc with 
> --prefix=/usr rather than some other prefix (but there is no requirement 
> that the dynamic linker actually be installed during testing, you can 
> ignore the -dynamic-linker= path), and your other error indicates some 
> inconsistency regarding NO_CTORS_DTORS_SECTIONS.

I have found that --prefix=/usr is more of a problem than a help when
building general cross compiler toolchains.  Using a prefix of /usr
triggers various special case code in
ports/sysdeps/unix/sysv/linux/mips/configure to put things in lib32 and
lib64 and I don't actually want any of that so I use a prefix
of /usr/fake instead of /usr.

The "undefined reference to `__libc_global_ctors'" has just shown up
again in a parallel build, but it seems to go away when I rebuild.  I am
still trying to understand what is going on with this.

> The expectation is that the glibc testsuite is the normal way to test 
> patches before submission, and string function patches like this need it 
> to be run for all six relevant ABI variants.

I think some flexibility here would be better.  There is no floating
point code in this routine so running all three ABI's in both hard and
soft float modes seems like overkill to me.  Testing in big and little
endian modes would seem more likely to turn up problems than testing in
hard and soft float.

Steve Ellcey
sellcey@imgtec.com


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 16:50       ` Steve Ellcey
@ 2013-09-06 16:59         ` Joseph S. Myers
  2013-09-06 17:43           ` Steve Ellcey
  2013-09-18 17:41           ` Steve Ellcey
  0 siblings, 2 replies; 32+ messages in thread
From: Joseph S. Myers @ 2013-09-06 16:59 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On Fri, 6 Sep 2013, Steve Ellcey wrote:

> I have found that --prefix=/usr is more of a problem then a help when
> building general cross compiler toolchains.  Using a prefix of /usr
> triggers various special case code in
> ports/sysdeps/unix/sysv/linux/mips/configure to put things in lib32 and
> lib64 and I don't actually want any of that so I use a prefix
> of /usr/fake instead of /usr.

Not using --prefix=/usr runs into ABI testsuite problems with bug 14664.

> The "undefined reference to `__libc_global_ctors'" has just shown up
> again in a parallel build, but it seems to go away when I rebuild.  I am
> still trying to understand what is going on with this.

Since it seems to be about parallel builds and linkobj/libc.so, try with 
Brooks's patch 
<https://sourceware.org/ml/libc-alpha/2013-08/msg00597.html>?  (Which, if 
it works OK on master for a while, should probably be backported to 2.18 
branch.)

> > The expectation is that the glibc testsuite is the normal way to test 
> > patches before submission, and string function patches like this need it 
> > to be run for all six relevant ABI variants.
> 
> I think some flexibility here would be better.  There is no floating
> point code in this routine so running all three ABI's in both hard and
> soft float modes seems like overkill to me.  Testing in big and little
> endian modes would seem more likely to turn up problems then testing in
> hard and soft float.

That's why I said six rather than twelve ABI variants; floating-point 
variants aren't relevant here, but the other variants are.  You could 
argue for full testing for three variants that cover all of o32, n32 and 
n64 and both BE and LE, plus just the string/ directory tests for the 
other three variants, but I think that would be the minimum.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 16:09     ` Joseph S. Myers
  2013-09-06 16:50       ` Steve Ellcey
@ 2013-09-06 16:59       ` Steve Ellcey
  1 sibling, 0 replies; 32+ messages in thread
From: Steve Ellcey @ 2013-09-06 16:59 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: libc-ports

On Fri, 2013-09-06 at 16:09 +0000, Joseph S. Myers wrote:

> The expectation is that the glibc testsuite is the normal way to test 
> patches before submission, and string function patches like this need it 
> to be run for all six relevant ABI variants.

My last email mentioned hard and soft float, but you were obviously
referring to big and little endian here as you said earlier in the email.
Sorry about that, I got confused in my reply.

Steve Ellcey
sellcey@mips.com



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 16:03   ` Steve Ellcey
@ 2013-09-06 17:12     ` Carlos O'Donell
  2013-09-06 23:33       ` Steve Ellcey
  0 siblings, 1 reply; 32+ messages in thread
From: Carlos O'Donell @ 2013-09-06 17:12 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On 09/06/2013 12:03 PM, Steve Ellcey wrote:
> On Fri, 2013-09-06 at 00:18 -0400, Carlos O'Donell wrote:
> 
>> Two things really: 
>>
>> (a) Testing details?
>>
>> Could you please elaborate more on "some standalone performance 
>> measurements?"
>>
>> What specific benchmarks did you run?
> 
> Basically, I just wrote and used a test program that does a bunch of
> memset's.  Nothing fancy or very intricate.

Are you able to post this test program for posterity along with
your patches?

>> What does the glibc microbenchmark show about your changes? Do they
>> show a benefit?
> 
> I didn't try this, but I can.  Is there anything on the glibc web page
> about how to run this benchmark?  Does it happen as part of the standard
> 'make check'?

Just run `make bench', wait a while, and compare results before and after.

Look at bench/README for more details.

>>
>> Steve, I trust your experience with MIPS, but I'd like to see all 
>> of us drive a little more detail into these performance related
>> patches. I'm also curious if the microbenchmark shows a performance
>> progression. The glibc community is trying hard to add some objectivity
>> to our performance measurements, prevent performance regressions, and
>> use the tests to experiment with new implementations.
> 
> That sounds reasonable.  I just need a bit of help on where this is and
> how to run it.

My pleasure. Ask if you get stuck.

>> (b) the code formatting isn't in line with the project requirements.
> 
> I'll fix these up and resubmit when I have the changes (and some more
> performance data).

Thanks for being accommodating.

Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 16:59         ` Joseph S. Myers
@ 2013-09-06 17:43           ` Steve Ellcey
  2013-09-06 18:57             ` Brooks Moses
  2013-09-18 17:41           ` Steve Ellcey
  1 sibling, 1 reply; 32+ messages in thread
From: Steve Ellcey @ 2013-09-06 17:43 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: libc-ports, bmoses

On Fri, 2013-09-06 at 16:59 +0000, Joseph S. Myers wrote:

> Since it seems to be about parallel builds and linkobj/libc.so, try with 
> Brooks's patch 
> <https://sourceware.org/ml/libc-alpha/2013-08/msg00597.html>?  (Which, if 
> it works OK on master for a while, should probably be backported to 2.18 
> branch.)

I'll try this patch out.  Your comment about working OK on master made
me think it was already on ToT but that does not seem to be the case.
The patch no longer applies cleanly to ToT due to other
Makefile/Makerule changes, I tried to apply it by hand and I hope it
works.  Unfortunately, the problem I was seeing was sporadic so I may
not immediately know.

Steve Ellcey


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 17:43           ` Steve Ellcey
@ 2013-09-06 18:57             ` Brooks Moses
  0 siblings, 0 replies; 32+ messages in thread
From: Brooks Moses @ 2013-09-06 18:57 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Joseph S. Myers, libc-ports

On Fri, Sep 6, 2013 at 10:43 AM, Steve Ellcey <sellcey@mips.com> wrote:
> On Fri, 2013-09-06 at 16:59 +0000, Joseph S. Myers wrote:
>>> The "undefined reference to `__libc_global_ctors'" has just shown up
>>> again in a parallel build, but it seems to go away when I rebuild.  I am
>>> still trying to understand what is going on with this.
>
>> Since it seems to be about parallel builds and linkobj/libc.so, try with
>> Brooks's patch
>> <https://sourceware.org/ml/libc-alpha/2013-08/msg00597.html>?  (Which, if
>> it works OK on master for a while, should probably be backported to 2.18
>> branch.)

Joseph is correct that this is related -- that error message is
exactly the one we were getting that sent me on the path of debugging
the circular reference and writing that patch.  The compile
instruction that causes that error message is consistent with the
implicit rule that my patch fixes.

- Brooks

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 17:12     ` Carlos O'Donell
@ 2013-09-06 23:33       ` Steve Ellcey
  2013-09-07  2:38         ` Carlos O'Donell
  2013-09-07  5:46         ` Andreas Schwab
  0 siblings, 2 replies; 32+ messages in thread
From: Steve Ellcey @ 2013-09-06 23:33 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: libc-ports

[-- Attachment #1: Type: text/plain, Size: 1633 bytes --]

On Fri, 2013-09-06 at 13:12 -0400, Carlos O'Donell wrote:

> Are you able to post this test program for posterity along with
> your patches?

I have attached it to this email.  I compile with -UVERIFY when doing
benchmarks and with -DVERIFY when I am doing correctness testing.  On
one of my 74k boards the old memset took 63.409 seconds and the new one
took 45.577 seconds.  I played with different prefetch hints too while
benchmarking but the prepare-to-store one is the fastest.


> Just run `make bench', wait a while, and compare results before and after.
> 
> Look at bench/README for more details.

I tried running it but all the tests failed with messages like this:


Running /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/benchtests/bench-bcopy
/home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: 1: /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: ^?ELF^A^A^A^C^H^A�^O4~\�: not found
/home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: 2: /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: Syntax error: "(" unexpected


I am not quite sure what to make of this, it seems to be using the right
ld.so.1 but I am not sure what it is that is 'not found'.  Could this be
related to the issue of installing the latest libgcc and libstdc++ in
default locations? (glibc 2.18 wiki section 5.1.1)  I built glibc with a
GCC from a non-standard location so the libgcc and libstdc++ for that
compiler are not in the standard locations.

Steve Ellcey
sellcey@mips.com


[-- Attachment #2: test_memset.c --]
[-- Type: text/x-csrc, Size: 1455 bytes --]

#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/* Offsets into dst at which main starts each memset: STARTOFFSET
   (inclusive) up to MAXOFFSET (exclusive), stepping by OFFSETINC.  */
#define STARTOFFSET   64
#define MAXOFFSET 128

/* SIZE is the length of dst.  main tries every memset length from 1 up
   to MAXSETSIZE (exclusive), stepping by SIZEINC.  The VERIFY build uses
   a small buffer so the byte-by-byte checks stay cheap; the non-VERIFY
   build uses a much larger one.  */
#ifdef VERIFY
#define SIZE 1024
#define MAXSETSIZE 520
#define SIZEINC 1
#define OFFSETINC 1
#else
#define SIZE 102400
#define MAXSETSIZE 51200
#define SIZEINC 1
#define OFFSETINC 1
#endif

/* VAL(N) is the background pattern written to dst[N] before each memset;
   it yields 13..18 for non-negative N.  The argument is parenthesized so
   that an expression such as VAL(a + b) is reduced modulo 6 as a whole.
   MEMSETVAL must be a value that VAL will never return.  */
#define VAL(N) (((N) % 6) + 13)
#define MEMSETVAL 0
signed char dst[SIZE];

/* The memset implementation under test.  MEMSET_NAME is not defined in
   this file; presumably it is supplied on the compiler command line
   (e.g. -DMEMSET_NAME=memset) -- confirm against the build recipe.  */
extern void *MEMSET_NAME(void *, int, size_t);

/* Run MEMSET_NAME on the SIZE bytes starting at dst[offset] with the fill
   value MEMSETVAL.

   When VERIFY is defined, dst is first filled with the VAL pattern and,
   after the call, every byte is checked: bytes before OFFSET and from
   OFFSET + SIZE onwards must still hold VAL(i), and bytes inside the
   range must equal MEMSETVAL.  Each mismatch is reported on stdout as
   "(actual instead of expected)".

   Returns the number of mismatches found; always 0 when VERIFY is not
   defined, since no checking is done in that build.  */
int
test(int offset, int size)
{
#ifdef VERIFY
  int failures = 0;

  /* Lay down a known background so stray writes are detectable.  */
  for (int i = 0; i < SIZE; i++)
    {
      dst[i] = VAL(i);
    }
#endif

  MEMSET_NAME(&dst[offset], MEMSETVAL, size);

#ifdef VERIFY
  /* printf("Test memset of dst[%d] (%p), size = %d\n", offset, (void *) &dst[offset], size); */

  /* Bytes in front of the target range must be untouched.  */
  for (int i = 0; i < offset; i++)
    {
      if (dst[i] != VAL(i))
        {
          printf("FAIL, dst[%d] got changed before it should be (%d instead of %d)\n", i, dst[i], VAL(i));
          failures++;
        }
    }

  /* Every byte inside the target range must hold the fill value.  */
  for (int i = offset; i < offset + size; i++)
    {
      if (dst[i] != MEMSETVAL)
        {
          printf("FAIL, dst[%d] was not changed when it should be (%d instead of %d)\n", i, dst[i], MEMSETVAL);
          failures++;
        }
    }

  /* Bytes past the target range must be untouched.  */
  for (int i = offset + size; i < SIZE; i++)
    {
      if (dst[i] != VAL(i))
        {
          printf("FAIL, dst[%d] got changed after it should be (%d instead of %d)\n", i, dst[i], VAL(i));
          failures++;
        }
    }

  return failures;
#else
  return 0;
#endif
}

/* Driver: call test for every start offset in [STARTOFFSET, MAXOFFSET)
   (step OFFSETINC) combined with every length in [1, MAXSETSIZE)
   (step SIZEINC).  Always exits with status 0; any failures are only
   visible as messages printed by test.  */
int
main(void)
{
  for (int offset = STARTOFFSET; offset < MAXOFFSET; offset += OFFSETINC)
    {
      for (int size = 1; size < MAXSETSIZE; size += SIZEINC)
        {
          test(offset, size);
        }
    }
  return 0;
}

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 23:33       ` Steve Ellcey
@ 2013-09-07  2:38         ` Carlos O'Donell
  2013-09-10 20:31           ` Steve Ellcey
  2013-09-07  5:46         ` Andreas Schwab
  1 sibling, 1 reply; 32+ messages in thread
From: Carlos O'Donell @ 2013-09-07  2:38 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On 09/06/2013 07:33 PM, Steve Ellcey wrote:
> On Fri, 2013-09-06 at 13:12 -0400, Carlos O'Donell wrote:
> 
>> Are you able to post this test program for posterity along with
>> your patches?
> 
> I have attached it to this email.  I compile with -UVERIFY when doing
> benchmarks and with -DVERIFY when I am doing correctness testing.  On
> one of my 74k boards the old memset took 63.409 seconds and the new one
> took 45.577 seconds.  I played with different prefetch hints too while
> benchmarking but the prepare-to-store one is the fastest.

Thanks for sharing that.
 
>> Just run `make bench', wait a while, and compare results before and after.
>>
>> Look at bench/README for more details.
> 
> I tried running it but all the tests failed with messages like this:
> 
> Running /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/benchtests/bench-bcopy
> /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: 1: /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: ^?ELF^A^A^A^C^H^A�^O4~\�: not found
> /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: 2: /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: Syntax error: "(" unexpected

That's really quite odd.

Have the Makefile print what it's going to run and then re-run it by hand?

e.g.

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 4d4b909..b0f0716 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -146,7 +146,7 @@ bench: bench-set bench-func
 
 bench-set: $(binaries-benchset)
        for run in $^; do \
-         echo "Running $${run}"; \
+         echo "Running $(run-bench)"; \
          $(run-bench) > $${run}.out; \
        done
---
 
> I am not quite sure what to make of this, it seems to be using the right
> ld.so.1 but I am not sure what it is that is 'not found'  Could this be
> related to the issue of installing the latest libgcc and libstdc++ in
> default locations? (glibc 2.18 wiki section 5.1.1)  I built glibc with a
> GCC from a non-standard location so the libgcc and libstdc++ for that
> compiler are not in the standard locations.

No, libgcc won't matter unless you do cancellation, and libstdc++ doesn't
matter because it's not a C++ application. You'll just get glibc using
the versions of those from the related prefix directories. I don't think
it should be making any difference here.

You've certainly got your share of weird environment issues :-)

Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 23:33       ` Steve Ellcey
  2013-09-07  2:38         ` Carlos O'Donell
@ 2013-09-07  5:46         ` Andreas Schwab
  1 sibling, 0 replies; 32+ messages in thread
From: Andreas Schwab @ 2013-09-07  5:46 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Carlos O'Donell, libc-ports

Steve Ellcey <sellcey@mips.com> writes:

> Running /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/benchtests/bench-bcopy
> /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: 1: /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: ^?ELF^A^A^A^C^H^A�^O4~\�: not found
> /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: 2: /home/sellcey/gcc/memset/obj-mipsisa32r2el-linux-gnu/glibc/obj_default/elf/ld.so.1: Syntax error: "(" unexpected
>
>
> I am not quite sure what to make of this,

This means that the binary format is not executable, and the shell tries
to interpret it as a script.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-07  2:38         ` Carlos O'Donell
@ 2013-09-10 20:31           ` Steve Ellcey
  2013-09-10 21:01             ` Carlos O'Donell
  0 siblings, 1 reply; 32+ messages in thread
From: Steve Ellcey @ 2013-09-10 20:31 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: libc-ports

On Fri, 2013-09-06 at 22:38 -0400, Carlos O'Donell wrote:

> No, libgcc won't matter unless you do cancellation, and libstdc++ doesn't
> matter because it's not a C++ application. You'll just get glibc using
> the versions of those from the related prefix directories. I don't think
> it should be making any difference here.
> 
> You've certainly got your share of weird environment issues :-)
> 
> Cheers,
> Carlos.

As an FYI, I think I have figured out my testing problems.  I normally
build cross toolchains by building binutils; gcc (using
--without-headers); glibc; then a final GCC.  When building on a MIPS
machine I was doing the same thing and then going back to the glibc
object directory and running 'make check' or 'make bench'.  I think the
problem with this was that the GCC now in my path (the final GCC) is not
the same GCC that I used to build glibc (the initial --without-headers
GCC).  This seemed to trigger a partial rebuild of glibc along with
building the tests and that in turn caused all sorts of weird problems.

If I build the toolchain, then build a new glibc using the final GCC and
run 'make check' or 'make bench' in that glibc object directory I get
just the expected MIPS failures.

Now that I can see the results of 'make bench' I do have a question,
what is the difference between the results in bench-memset.out and
bench-memset-ifunc.out?  MIPS doesn't yet support IFUNC.  It looks like
the results in the two files are pretty close, so maybe they are
identical runs on machines with no IFUNC?

Steve Ellcey
sellcey@mips.com



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-10 20:31           ` Steve Ellcey
@ 2013-09-10 21:01             ` Carlos O'Donell
  2013-09-10 21:14               ` Steve Ellcey
  0 siblings, 1 reply; 32+ messages in thread
From: Carlos O'Donell @ 2013-09-10 21:01 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On 09/10/2013 04:29 PM, Steve Ellcey wrote:
> On Fri, 2013-09-06 at 22:38 -0400, Carlos O'Donell wrote:
> 
>> No, libgcc won't matter unless you do cancellation, and libstdc++ doesn't
>> matter because it's not a C++ application. You'll just get glibc using
>> the versions of those from the related prefix directories. I don't think
>> it should be making any difference here.
>>
>> You've certainly got your share of weird environment issues :-)
>>
>> Cheers,
>> Carlos.
> 
> As an FYI, I think I have figured out my testing problems.  I normally
> build cross toolchains by building binutils; gcc (using
> --without-headers); glibc; then a final GCC.  When building on a MIPS
> machine I was doing the same thing and then going back to the glibc
> object directory and running 'make check' or 'make bench'.  I think the
> problem with this was that the GCC now in my path (the final GCC) is not
> the same GCC that I used to build glibc (the initial --without-headers
> GCC).  This seemed to trigger a partial rebuild of glibc along with
> building the tests and that in turn caused all sorts of weird problems.
> 
> If I build the toolchain, then build a new glibc using the final GCC and
> run 'make check' or 'make bench' in that glibc object directory I get
> just the expected MIPS failures.
> 
> Now that I can see the results of 'make bench' I do have a question,
> what is the difference between the results in bench-memset.out and
> bench-memset-ifunc.out?  MIPS doesn't yet support IFUNC.  It looks like
> the results in the two files are pretty close, so maybe they are
> identical runs on machines with no IFUNC?

You get the default implementation of __libc_ifunc_impl_list (the function
used by the testing infrastructure to iterate the functions implemented
as ifuncs) which adds no additional functions to the test list. You still
test the usual defaults e.g. simple, builtin, and original function entry.
Therefore it's the same as the non-IFUNC version with the results being
the same modulo testing variance.

Does that answer your question?

Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-10 21:01             ` Carlos O'Donell
@ 2013-09-10 21:14               ` Steve Ellcey
  2013-09-10 22:35                 ` Carlos O'Donell
  0 siblings, 1 reply; 32+ messages in thread
From: Steve Ellcey @ 2013-09-10 21:14 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: libc-ports

On Tue, 2013-09-10 at 17:01 -0400, Carlos O'Donell wrote:

> > Now that I can see the results of 'make bench' I do have a question,
> > what is the difference between the results in bench-memset.out and
> > bench-memset-ifunc.out?  MIPS doesn't yet support IFUNC.  It looks like
> > the results in the two files are pretty close, so maybe they are
> > identical runs on machines with no IFUNC?
> 
> You get the default implementation of __libc_ifunc_impl_list (the function
> used by the testing infrastructure to iterate the functions implemented
> as ifuncs) which adds no additional functions to the test list. You still
> test the usual defaults e.g. simple, builtin, and original function entry.
> Therefore it's the same as the non-IFUNC version with the results being
> the same modulo testing variance.
> 
> Does that answer your question?

I think so, but just to be clear: If I did have IFUNC and 4 different
implementations of memset (for example), would the testing
infrastructure run and benchmark all 4 versions of memset?

Steve Ellcey



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-10 21:14               ` Steve Ellcey
@ 2013-09-10 22:35                 ` Carlos O'Donell
  2013-09-10 22:38                   ` Carlos O'Donell
  0 siblings, 1 reply; 32+ messages in thread
From: Carlos O'Donell @ 2013-09-10 22:35 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On 09/10/2013 05:12 PM, Steve Ellcey wrote:
> On Tue, 2013-09-10 at 17:01 -0400, Carlos O'Donell wrote:
> 
>>> Now that I can see the results of 'make bench' I do have a question,
>>> what is the difference between the results in bench-memset.out and
>>> bench-memset-ifunc.out?  MIPS doesn't yet support IFUNC.  It looks like
>>> the results in the two files are pretty close, so maybe they are
>>> identical runs on machines with no IFUNC?
>>
>> You get the default implementation of __libc_ifunc_impl_list (the function
>> used by the testing infrastructure to iterate the functions implemented
>> as ifuncs) which adds no additional functions to the test list. You still
>> test the usual defaults e.g. simple, builtin, and original function entry.
>> Therefore it's the same as the non-IFUNC version with the results being
>> the same modulo testing variance.
>>
>> Does that answer your question?
> 
> I think so, but just to be clear: If I did have IFUNC and 4 different
> implementations of memset (for example), would the testing
> infrastructure run and benchmark all 4 versions of memset?

Yes.

Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-10 22:35                 ` Carlos O'Donell
@ 2013-09-10 22:38                   ` Carlos O'Donell
  0 siblings, 0 replies; 32+ messages in thread
From: Carlos O'Donell @ 2013-09-10 22:38 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On 09/10/2013 06:35 PM, Carlos O'Donell wrote:
> On 09/10/2013 05:12 PM, Steve Ellcey wrote:
>> On Tue, 2013-09-10 at 17:01 -0400, Carlos O'Donell wrote:
>>
>>>> Now that I can see the results of 'make bench' I do have a question,
>>>> what is the difference between the results in bench-memset.out and
>>>> bench-memset-ifunc.out?  MIPS doesn't yet support IFUNC.  It looks like
>>>> the results in the two files are pretty close, so maybe they are
>>>> identical runs on machines with no IFUNC?
>>>
>>> You get the default implementation of __libc_ifunc_impl_list (the function
>>> used by the testing infrastructure to iterate the functions implemented
>>> as ifuncs) which adds no additional functions to the test list. You still
>>> test the usual defaults e.g. simple, builtin, and original function entry.
>>> Therefore it's the same as the non-IFUNC version with the results being
>>> the same modulo testing variance.
>>>
>>> Does that answer your question?
>>
>> I think so, but just to be clear: If I did have IFUNC and 4 different
>> implementations of memset (for example), would the testing
>> infrastructure run and benchmark all 4 versions of memset?
> 
> Yes.

Modulo hardware support obviously.

All IFUNC functions are registered at runtime with a "usable"
flag indicating if the present hardware can run that function.

The test infrastructure uses "usable" to decide whether to test that
function or not.

Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-06 16:59         ` Joseph S. Myers
  2013-09-06 17:43           ` Steve Ellcey
@ 2013-09-18 17:41           ` Steve Ellcey
  2013-09-19 15:25             ` Carlos O'Donell
  2013-09-20 16:43             ` Joseph S. Myers
  1 sibling, 2 replies; 32+ messages in thread
From: Steve Ellcey @ 2013-09-18 17:41 UTC (permalink / raw)
  To: Joseph S. Myers, Carlos O'Donell; +Cc: libc-ports

[-- Attachment #1: Type: text/plain, Size: 1109 bytes --]

Here is an updated version of my new MIPS memset.S routine.  I fixed the
format of the comments and the ifdef indenting and I ran 'make check'
and 'make bench' on little endian and big endian systems with the o32,
n32, and n64 ABIs.  The testing did find a bug that my original testing
missed and I have fixed that bug (it involved a negative value as the
constant being set).  Other than that, the only failures I saw were the
expected check-localplt and check-execstack errors.

I don't know if you want to see all the performance results from
bench-memset.out since it has a lot of output, but looking at the
average time for 131072 byte memsets, the original libc in o32 little
endian mode averaged 43732 (seconds I guess) and the new one was
27365.  n32 went from 21886 to 21881 and n64 went from 21882 to 21877.
So the 64 bit numbers only improved a little, but the 32 bit version
shows a very nice improvement.

Steve Ellcey
sellcey@mips.com


2013-09-18  Steve Ellcey  <sellcey@mips.com>

	* sysdeps/mips/memset.S: Change prefetching and add loop unrolling. 
	* sysdeps/mips/mips64/memset.S: Remove.


[-- Attachment #2: mips-memset.patch --]
[-- Type: text/x-patch, Size: 13678 bytes --]

diff --git a/ports/sysdeps/mips/memset.S b/ports/sysdeps/mips/memset.S
index 85062fe..3e9c88f 100644
--- a/ports/sysdeps/mips/memset.S
+++ b/ports/sysdeps/mips/memset.S
@@ -1,6 +1,5 @@
-/* Copyright (C) 2002-2013 Free Software Foundation, Inc.
+/* Copyright (C) 2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -16,70 +15,353 @@
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
+#ifdef ANDROID_CHANGES
+# include "machine/asm.h"
+# include "machine/regdef.h"
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _LIBC
+# include <sysdep.h>
+# include <regdef.h>
+# include <sys/asm.h>
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _COMPILING_NEWLIB
+# include "machine/asm.h"
+# include "machine/regdef.h"
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#else
+# include <regdef.h>
+# include <sys/asm.h>
+#endif
 
-	.set	nomips16
+/* Check to see if the MIPS architecture we are compiling for supports
+   prefetching.  */
+
+#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
+# ifndef DISABLE_PREFETCH
+#  define USE_PREFETCH
+# endif
+#endif
+
+#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
+# ifndef DISABLE_DOUBLE
+#  define USE_DOUBLE
+# endif
+#endif
+
+#ifndef USE_DOUBLE
+# ifndef DISABLE_DOUBLE_ALIGN
+#  define DOUBLE_ALIGN
+# endif
+#endif
+
+/* Some asm.h files do not have the L macro definition.  */
+#ifndef L
+# if _MIPS_SIM == _ABIO32
+#  define L(label) $L ## label
+# else
+#  define L(label) .L ## label
+# endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
+#ifndef PTR_ADDIU
+# ifdef USE_DOUBLE
+#  define PTR_ADDIU	daddiu
+# else
+#  define PTR_ADDIU	addiu
+# endif
+#endif
 
-/* void *memset(void *s, int c, size_t n).  */
+/* Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
+   or PREFETCH_STORE_STREAMED offers a large performance advantage
+   but PREPAREFORSTORE has some special restrictions to consider.
 
-#if __MIPSEB
-# define SWHI	swl		/* high part is left in big-endian	*/
+   Prefetch with the 'prepare for store' hint does not copy a memory
+   location into the cache, it just allocates a cache line and zeros
+   it out.  This means that if you do not write to the entire cache
+   line before writing it out to memory some data will get zero'ed out
+   when the cache line is written back to memory and data will be lost.
+
+   There are ifdef'ed sections of this memset to make sure that it does not
+   do prefetches on cache lines that are not going to be completely written.
+   This code is only needed and only used when PREFETCH_STORE_HINT is set to
+   PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
+   less than MAX_PREFETCH_SIZE bytes and if the cache line is larger it will
+   not work correctly.  */
+
+#ifdef USE_PREFETCH
+# define PREFETCH_HINT_STORE		1
+# define PREFETCH_HINT_STORE_STREAMED	5
+# define PREFETCH_HINT_STORE_RETAINED	7
+# define PREFETCH_HINT_PREPAREFORSTORE	30
+
+/* If we have not picked out what hints to use at this point use the
+   standard load and store prefetch hints.  */
+# ifndef PREFETCH_STORE_HINT
+#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
+# endif
+
+/* We double everything when USE_DOUBLE is true so we do 2 prefetches to
+   get 64 bytes in that case.  The assumption is that each individual
+   prefetch brings in 32 bytes.  */
+# ifdef USE_DOUBLE
+#  define PREFETCH_CHUNK 64
+#  define PREFETCH_FOR_STORE(chunk, reg) \
+    pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
+    pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
+# else
+#  define PREFETCH_CHUNK 32
+#  define PREFETCH_FOR_STORE(chunk, reg) \
+    pref PREFETCH_STORE_HINT, (chunk)*32(reg)
+# endif
+
+/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
+   than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
+   of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
+   hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
+   used then MAX_PREFETCH_SIZE does not matter.  */
+# define MAX_PREFETCH_SIZE 128
+/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
+   than 5 on a STORE prefetch and that a single prefetch can never be larger
+   than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
+   we actually do two prefetches in that case, one 32 bytes after the other.  */
+# ifdef USE_DOUBLE
+#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
+# else
+#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
+# endif
+
+# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
+    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
+/* We cannot handle this because the initial prefetches may fetch bytes that
+   are before the buffer being copied.  We start copies with an offset
+   of 4 so avoid this situation when using PREPAREFORSTORE.  */
+#  error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
+# endif
+#else /* USE_PREFETCH not defined */
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef MEMSET_NAME
+# define MEMSET_NAME memset
+#endif
+
+/* We load/store 64 bits at a time when USE_DOUBLE is true.
+   The C_ prefix stands for CHUNK and is used to avoid macro name
+   conflicts with system header files.  */
+
+#ifdef USE_DOUBLE
+# define C_ST	sd
+# if __MIPSEB
+#  define C_STHI	sdl	/* high part is left in big-endian	*/
+# else
+#  define C_STHI	sdr	/* high part is right in little-endian	*/
+# endif
 #else
-# define SWHI	swr		/* high part is right in little-endian	*/
+# define C_ST	sw
+# if __MIPSEB
+#  define C_STHI	swl	/* high part is left in big-endian	*/
+# else
+#  define C_STHI	swr	/* high part is right in little-endian	*/
+# endif
 #endif
 
-ENTRY (memset)
+/* Bookkeeping values for 32 vs. 64 bit mode.  */
+#ifdef USE_DOUBLE
+# define NSIZE 8
+# define NSIZEMASK 0x3f
+# define NSIZEDMASK 0x7f
+#else
+# define NSIZE 4
+# define NSIZEMASK 0x1f
+# define NSIZEDMASK 0x3f
+#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
+
+#ifdef ANDROID_CHANGES
+LEAF(MEMSET_NAME,0)
+#else
+LEAF(MEMSET_NAME)
+#endif
+
+	.set	nomips16
 	.set	noreorder
+/* If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
+   size, copy dst pointer to v0 for the return value.  */
+	slti	t2,a2,(2 * NSIZE)
+	bne	t2,zero,L(lastb)
+	move	v0,a0
+
+/* If memset value is not zero, we copy it to all the bytes in a 32 or 64
+   bit word.  */
+	beq	a1,zero,L(set0)		/* If memset value is zero no smear  */
+	PTR_SUBU a3,zero,a0
+	nop
+
+	/* smear byte into 32 or 64 bit word */
+#if ((__mips == 64) || (__mips == 32)) && (__mips_isa_rev >= 2)
+# ifdef USE_DOUBLE
+	dins	a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
+	dins	a1, a1, 16, 16      /* Replicate fill byte into word.       */
+	dins	a1, a1, 32, 32      /* Replicate fill byte into dbl word.   */
+# else
+	ins	a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
+	ins	a1, a1, 16, 16      /* Replicate fill byte into word.       */
+# endif
+#else
+# ifdef USE_DOUBLE
+        and     a1,0xff
+	dsll	t2,a1,8
+	or	a1,t2
+	dsll	t2,a1,16
+	or	a1,t2
+	dsll	t2,a1,32
+	or	a1,t2
+# else
+        and     a1,0xff
+	sll	t2,a1,8
+	or	a1,t2
+	sll	t2,a1,16
+	or	a1,t2
+# endif
+#endif
+
+/* If the destination address is not aligned do a partial store to get it
+   aligned.  If it is already aligned just jump to L(aligned).  */
+L(set0):
+	andi	t2,a3,(NSIZE-1)		/* word-unaligned address?          */
+	beq	t2,zero,L(aligned)	/* t2 is the unalignment count      */
+	PTR_SUBU a2,a2,t2
+	C_STHI	a1,0(a0)
+	PTR_ADDU a0,a0,t2
+
+L(aligned):
+/* If USE_DOUBLE is not set we may still want to align the data on a 16
+   byte boundary instead of an 8 byte boundary to maximize the opportunity
+   of proAptive chips to do memory bonding (combining two sequential 4
+   byte stores into one 8 byte store).  We know there are at least 4 bytes
+   left to store or we would have jumped to L(lastb) earlier in the code.  */
+#ifdef DOUBLE_ALIGN
+	andi	t2,a3,4
+	beq	t2,zero,L(double_aligned)
+	PTR_SUBU a2,a2,t2
+	sw	a1,0(a0)
+	PTR_ADDU a0,a0,t2
+L(double_aligned):
+#endif
 
-	slti	t1, a2, 8		# Less than 8?
-	bne	t1, zero, L(last8)
-	move	v0, a0			# Setup exit value before too late
-
-	beq	a1, zero, L(ueven)	# If zero pattern, no need to extend
-	andi	a1, 0xff		# Avoid problems with bogus arguments
-	sll	t0, a1, 8
-	or	a1, t0
-	sll	t0, a1, 16
-	or	a1, t0			# a1 is now pattern in full word
-
-L(ueven):
-	subu	t0, zero, a0		# Unaligned address?
-	andi	t0, 0x3
-	beq	t0, zero, L(chkw)
-	subu	a2, t0
-	SWHI	a1, 0(a0)		# Yes, handle first unaligned part
-	addu	a0, t0			# Now both a0 and a2 are updated
+/* Now the destination is aligned to (word or double word) aligned address
+   Set a2 to count how many bytes we have to copy after all the 64/128 byte
+   chunks are copied and a3 to the dest pointer after all the 64/128 byte
+   chunks have been copied.  We will loop, incrementing a0 until it equals
+   a3.  */
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
 
+/* When in the loop we may prefetch with the 'prepare to store' hint,
+   in this case the a0+x should not be past the "t0-32" address.  This
+   means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
+   for x=64 the last "safe" a0 address is "t0-96" In the current version we
+   will use "prefetch hint,128(a0)", so "t0-160" is the limit.  */
+#if defined(USE_PREFETCH) \
+    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
+#endif
+#if defined(USE_PREFETCH) \
+    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
+	PREFETCH_FOR_STORE (1, a0)
+	PREFETCH_FOR_STORE (2, a0)
+	PREFETCH_FOR_STORE (3, a0)
+#endif
+
+L(loop16w):
+#if defined(USE_PREFETCH) \
+    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
+	bgtz	v1,L(skip_pref)
+	nop
+#endif
+	PREFETCH_FOR_STORE (4, a0)
+	PREFETCH_FOR_STORE (5, a0)
+L(skip_pref):
+	C_ST	a1,UNIT(0)(a0)
+	C_ST	a1,UNIT(1)(a0)
+	C_ST	a1,UNIT(2)(a0)
+	C_ST	a1,UNIT(3)(a0)
+	C_ST	a1,UNIT(4)(a0)
+	C_ST	a1,UNIT(5)(a0)
+	C_ST	a1,UNIT(6)(a0)
+	C_ST	a1,UNIT(7)(a0)
+	C_ST	a1,UNIT(8)(a0)
+	C_ST	a1,UNIT(9)(a0)
+	C_ST	a1,UNIT(10)(a0)
+	C_ST	a1,UNIT(11)(a0)
+	C_ST	a1,UNIT(12)(a0)
+	C_ST	a1,UNIT(13)(a0)
+	C_ST	a1,UNIT(14)(a0)
+	C_ST	a1,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+	bne	a0,a3,L(loop16w)
+	nop
+	move	a2,t8
+
+/* Here we have dest word-aligned but less than 64-bytes or 128 bytes to go.
+   Check for a 32(64) byte chunk and copy it if there is one.  Otherwise
+   jump down to L(chk1w) to handle the tail end of the copy.  */
 L(chkw):
-	andi	t0, a2, 0x7		# Enough left for one loop iteration?
-	beq	t0, a2, L(chkl)
-	subu	a3, a2, t0
-	addu	a3, a0			# a3 is last loop address +1
-	move	a2, t0			# a2 is now # of bytes left after loop
-L(loopw):
-	addiu	a0, 8			# Handle 2 words pr. iteration
-	sw	a1, -8(a0)
-	bne	a0, a3, L(loopw)
-	sw	a1, -4(a0)
-
-L(chkl):
-	andi	t0, a2, 0x4		# Check if there is at least a full
-	beq	t0, zero, L(last8)	#  word remaining after the loop
-	subu	a2, t0
-	sw	a1, 0(a0)		# Yes...
-	addiu	a0, 4
-
-L(last8):
-	blez	a2, L(exit)		# Handle last 8 bytes (if cnt>0)
-	addu	a3, a2, a0		# a3 is last address +1
-L(lst8l):
-	addiu	a0, 1
-	bne	a0, a3, L(lst8l)
-	sb	a1, -1(a0)
-L(exit):
-	j	ra			# Bye, bye
+	andi	t8,a2,NSIZEMASK	/* is there a 32-byte/64-byte chunk.  */
+				/* the t8 is the reminder count past 32-bytes */
+	beq	a2,t8,L(chk1w)/* when a2==t8, no 32-byte chunk */
+	nop
+	C_ST	a1,UNIT(0)(a0)
+	C_ST	a1,UNIT(1)(a0)
+	C_ST	a1,UNIT(2)(a0)
+	C_ST	a1,UNIT(3)(a0)
+	C_ST	a1,UNIT(4)(a0)
+	C_ST	a1,UNIT(5)(a0)
+	C_ST	a1,UNIT(6)(a0)
+	C_ST	a1,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+
+/* Here we have less than 32(64) bytes to set.  Set up for a loop to
+   copy one word (or double word) at a time.  Set a2 to count how many
+   bytes we have to copy after all the word (or double word) chunks are
+   copied and a3 to the dest pointer after all the (d)word chunks have
+   been copied.  We will loop, incrementing a0 until a0 equals a3.  */
+L(chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
+	beq	a2,t8,L(lastb)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8 byte chunks) */
+L(wordCopy_loop):
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(wordCopy_loop)
+	C_ST	a1,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+	blez	a2,L(leave)
+	PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
+L(lastbloop):
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(lastbloop)
+	sb	a1,-1(a0)
+L(leave):
+	j	ra
 	nop
 
+	.set	at
 	.set	reorder
-END (memset)
-libc_hidden_builtin_def (memset)
+END(MEMSET_NAME)
+#ifndef ANDROID_CHANGES
+# ifdef _LIBC
+libc_hidden_builtin_def (MEMSET_NAME)
+# endif
+#endif

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-18 17:41           ` Steve Ellcey
@ 2013-09-19 15:25             ` Carlos O'Donell
  2013-09-19 17:02               ` Steve Ellcey
  2013-09-20 16:43             ` Joseph S. Myers
  1 sibling, 1 reply; 32+ messages in thread
From: Carlos O'Donell @ 2013-09-19 15:25 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Joseph S. Myers, libc-ports

On 09/18/2013 01:40 PM, Steve Ellcey wrote:
> Here is an updated version of my new MIPS memset.S routine.  I fixed the
> format of the comments and the ifdef indenting and I ran 'make check'
> and 'make bench' on little endian and big endian systems with the o32,
> n32, and n64 ABIs.  The testing did find a bug that my original testing
> missed and I have fixed that bug (it involved a negative value as the
> constant being set).  Other than that, the only failures I saw were the
> expected check-localplt and check-execstack errors.
> 
> I don't know if you want to see all the performance results from
> bench-memset.out since it has a lot of output, but looking at the
> average time for 131072 byte memsets, the original libc in o32 little
> endian mode averaged 43732 (seconds I guess) and the new one was
> 27365.  n32 went from 21886 to 21881 and n64 went from 21882 to 21877.
> So the 64 bit numbers only improved a little, but the 32 bit version
> shows a very nice improvement.
> 
> Steve Ellcey
> sellcey@mips.com
> 
> 
> 2013-09-18  Steve Ellcey  <sellcey@mips.com>
> 
> 	* sysdeps/mips/memset.S: Change prefetching and add loop unrolling. 
> 	* sysdeps/mips/mips64/memset.S: Remove.
> 

This looks good to me.

I think Joseph Myers should give the final ACK.

Could you please post your *.out files so others 
can have the raw data for the test run?

Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-19 15:25             ` Carlos O'Donell
@ 2013-09-19 17:02               ` Steve Ellcey
  0 siblings, 0 replies; 32+ messages in thread
From: Steve Ellcey @ 2013-09-19 17:02 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: Joseph S. Myers, libc-ports

[-- Attachment #1: Type: text/plain, Size: 632 bytes --]

On Thu, 2013-09-19 at 11:25 -0400, Carlos O'Donell wrote:

> This looks good to me.
> 
> I think Joseph Myers should give the final ACK.
> 
> Could you please post your *.out files so others 
> can have the raw data for the test run?
> 
> Cheers,
> Carlos.

Sure, here are the bench-memset.out files for the o32, n32, and n64
ABIs.  All were done with hard float and in little endian mode and the
ones with the .new suffix have my change while the ones with the .orig
suffix do not.  I have big endian data too but it is not significantly
different than the little endian data so I didn't include it.

Steve Ellcey
sellcey@mips.com

[-- Attachment #2: bench-memset.out.n32.hard.new --]
[-- Type: text/plain, Size: 17985 bytes --]

                        	memset	builtin_memset	simple_memset
Length    1, alignment  0, c -65:	31.7188	28.6562	21.1562
Length    2, alignment  0, c -65:	31.0312	32.3125	27.7344
Length    4, alignment  0, c -65:	41.1875	43.5625	43.3594
Length    8, alignment  0, c -65:	44.0781	45.1719	42.2031
Length   16, alignment  0, c -65:	43.5156	43.7344	59.3438
Length   32, alignment  0, c -65:	53.9062	55.8125	91
Length   64, alignment  0, c -65:	38.4062	39.5938	155.344
Length  128, alignment  0, c -65:	49.4219	50.5469	282.641
Length  256, alignment  0, c -65:	66.3125	68.2812	538.297
Length  512, alignment  0, c -65:	98.4375	100.453	1050.61
Length 1024, alignment  0, c -65:	176.5	182.281	2075.66
Length 2048, alignment  0, c -65:	374.078	376.562	4123.64
Length 4096, alignment  0, c -65:	715.391	717.078	8219.58
Length 8192, alignment  0, c -65:	1398.44	1399.45	16412
Length 16384, alignment  0, c -65:	2765.53	2766.89	32925.6
Length 32768, alignment  0, c -65:	5497.09	5497.16	65565
Length 65536, alignment  0, c -65:	10962	10964.5	131214
Length 131072, alignment  0, c -65:	21884	22000.8	262497
Length    1, alignment  1, c -65:	27.125	27.0469	20.9844
Length    2, alignment  2, c -65:	30.5	32.5938	27.6875
Length    3, alignment  3, c -65:	35.8438	36.7969	40.625
Length    3, alignment  0, c -65:	35.875	37.6562	40.5625
Length    4, alignment  4, c -65:	41.0781	42.75	43.3594
Length    5, alignment  5, c -65:	44.0938	46.625	46.3438
Length    5, alignment  0, c -65:	44.7656	46.9688	44.9219
Length    6, alignment  6, c -65:	46	48.3125	47.625
Length    6, alignment  0, c -65:	46.7031	47.4062	47.2812
Length    7, alignment  7, c -65:	41.1875	44.1875	40.25
Length    7, alignment  0, c -65:	41.8594	43.9219	40.625
Length    8, alignment  0, c -65:	44.1719	46.6094	42.9531
Length    9, alignment  1, c -65:	46.1719	47.8906	44.5625
Length    9, alignment  0, c -65:	45.5938	46.9375	44.2812
Length   10, alignment  2, c -65:	47.8906	50.25	46.6875
Length   10, alignment  0, c -65:	47.4844	49.75	47.3125
Length   11, alignment  3, c -65:	50.2656	51.25	48
Length   11, alignment  0, c -65:	50.2812	51.875	49.7344
Length   12, alignment  4, c -65:	51.8438	52.8906	50.2188
Length   12, alignment  0, c -65:	52.6406	54.5938	50.6094
Length   13, alignment  5, c -65:	53.8594	55.4219	53
Length   13, alignment  0, c -65:	54.2344	55.4531	52.9219
Length   14, alignment  6, c -65:	55.875	57.1875	55.3594
Length   14, alignment  0, c -65:	56.2188	57.9062	54.9688
Length   15, alignment  7, c -65:	57.9062	59.8281	56.6406
Length   15, alignment  0, c -65:	58.1719	60.5938	57.7188
Length   16, alignment  0, c -65:	42.9688	44.2188	59.0312
Length   17, alignment  1, c -65:	41.4375	42.875	61.6875
Length   17, alignment  0, c -65:	41.3906	43.1719	60.3125
Length   18, alignment  2, c -65:	51.3281	52.7344	62.6094
Length   18, alignment  0, c -65:	50.0469	51.5	62.2969
Length   19, alignment  3, c -65:	50.7188	52.1562	64.9844
Length   19, alignment  0, c -65:	55.3594	56.9219	64.6406
Length   20, alignment  4, c -65:	41.6562	43.5781	67.375
Length   20, alignment  0, c -65:	57.4844	58.2344	66.9844
Length   21, alignment  5, c -65:	49.6562	49.8438	68.6094
Length   21, alignment  0, c -65:	53.5938	56.2344	68.9531
Length   22, alignment  6, c -65:	56.6094	56.8906	71.3281
Length   22, alignment  0, c -65:	61.3281	62.3125	70.7188
Length   23, alignment  7, c -65:	61.7188	62.5469	72.5625
Length   23, alignment  0, c -65:	58.2812	59.0625	72.5938
Length   24, alignment  0, c -65:	46.9375	48.7812	74.2188
Length   24, alignment  0, c -65:	47.2656	48.7812	75.6562
Length   25, alignment  1, c -65:	49.1875	50.4531	76.2812
Length   25, alignment  0, c -65:	47.1719	48.0938	76.9375
Length   26, alignment  2, c -65:	55.25	56.5	78.9688
Length   26, alignment  0, c -65:	55	56.8594	79.3438
Length   27, alignment  3, c -65:	55.4844	57.2812	81.0156
Length   27, alignment  0, c -65:	58.75	60.2344	81.3594
Length   28, alignment  4, c -65:	48.1094	48.2344	82.5625
Length   28, alignment  0, c -65:	60.9688	60.7656	82.5938
Length   29, alignment  5, c -65:	55.0469	56.2812	84.9375
Length   29, alignment  0, c -65:	59.3594	61.3125	85.4062
Length   30, alignment  6, c -65:	59.9531	60.0938	87.3281
Length   30, alignment  0, c -65:	65.4062	66.9219	86.9375
Length   31, alignment  7, c -65:	64.7031	66.9375	89.7656
Length   31, alignment  0, c -65:	63.75	65.9375	89.0312
Length   14, alignment  1, c -65:	56	58.6562	54.9375
Length 1024, alignment  3, c -65:	196.141	199.625	2074.53
Length   64, alignment  4, c -65:	73.4688	74.9062	154.641
Length   25, alignment  2, c -65:	54.7031	56.125	77.4375
Length    1, alignment  0, c  0:	26.0469	26.7812	21.7031
Length    2, alignment  0, c  0:	30.9062	32.4844	27.7031
Length    4, alignment  0, c  0:	41.5	43.3125	43.6406
Length    8, alignment  0, c  0:	44.75	45.9375	43
Length   16, alignment  0, c  0:	42.4375	44.5156	59.7031
Length   32, alignment  0, c  0:	54.0312	54.4531	90.2969
Length   64, alignment  0, c  0:	38.4062	39.5781	154.219
Length  128, alignment  0, c  0:	48.9844	50.1875	282.625
Length  256, alignment  0, c  0:	66.7188	68.5469	539.719
Length  512, alignment  0, c  0:	98.0469	101.781	1051.67
Length 1024, alignment  0, c  0:	174.672	176.875	2074.2
Length 2048, alignment  0, c  0:	374.375	374.484	4122.92
Length 4096, alignment  0, c  0:	714.422	716.344	8219.25
Length 8192, alignment  0, c  0:	1398.14	1400.09	16411.8
Length 16384, alignment  0, c  0:	2765.48	2767.17	32797
Length 32768, alignment  0, c  0:	5496.28	5607	65565
Length 65536, alignment  0, c  0:	10961.7	10962.8	131270
Length 131072, alignment  0, c  0:	21883.7	21885.5	262237
Length    1, alignment  1, c  0:	25.7188	26.9531	21.3906
Length    2, alignment  2, c  0:	30.1562	32.1406	27.4219
Length    3, alignment  3, c  0:	35.8594	38.2031	41.375
Length    3, alignment  0, c  0:	36.1562	37.4531	41.5938
Length    4, alignment  4, c  0:	41.3906	42.875	43.5781
Length    5, alignment  5, c  0:	45.0625	46.4375	46.3594
Length    5, alignment  0, c  0:	44.7188	47.1406	46.3594
Length    6, alignment  6, c  0:	48.0938	48.4688	47.6562
Length    6, alignment  0, c  0:	47.3906	48.125	48
Length    7, alignment  7, c  0:	42.3125	43.6094	40.9219
Length    7, alignment  0, c  0:	42.5625	44.1562	40.6094
Length    8, alignment  0, c  0:	43.8438	45.0938	42.6406
Length    9, alignment  1, c  0:	46.2031	47.4375	44.9844
Length    9, alignment  0, c  0:	47.2656	48.1719	44.5938
Length   10, alignment  2, c  0:	49.2656	51.2031	47.3125
Length   10, alignment  0, c  0:	48.1719	49.4219	46.5781
Length   11, alignment  3, c  0:	49.8281	51.5469	48.6406
Length   11, alignment  0, c  0:	50.5312	52.1406	49.6562
Length   12, alignment  4, c  0:	52.5625	53.8125	50.6719
Length   12, alignment  0, c  0:	51.8438	53.0625	51.3125
Length   13, alignment  5, c  0:	53.8906	56.1562	53.3125
Length   13, alignment  0, c  0:	54.5938	55.8125	52.9844
Length   14, alignment  6, c  0:	56.25	58.0156	55.375
Length   14, alignment  0, c  0:	55.9375	57.5	54.6562
Length   15, alignment  7, c  0:	58.1562	59.5156	57.3594
Length   15, alignment  0, c  0:	58.2188	59.0938	56.6406
Length   16, alignment  0, c  0:	41.7969	42.5156	58.6094
Length   17, alignment  1, c  0:	40.6875	41.1875	59.9062
Length   17, alignment  0, c  0:	42.0781	41.875	60.5938
Length   18, alignment  2, c  0:	50.9375	52.5469	62.625
Length   18, alignment  0, c  0:	50.0938	51.8438	63.7031
Length   19, alignment  3, c  0:	50.25	52.1875	64.625
Length   19, alignment  0, c  0:	55.4062	56.8125	64.2969
Length   20, alignment  4, c  0:	40.9375	43.2656	67.7656
Length   20, alignment  0, c  0:	57.5156	57.9219	66.2656
Length   21, alignment  5, c  0:	48.9844	51.2031	67.5938
Length   21, alignment  0, c  0:	54.625	55.875	69
Length   22, alignment  6, c  0:	56.3906	56.4844	70.6875
Length   22, alignment  0, c  0:	61.0156	62.5312	70.6875
Length   23, alignment  7, c  0:	61.0312	62.1875	72.6094
Length   23, alignment  0, c  0:	57.625	58.9844	72.2656
Length   24, alignment  0, c  0:	46.5938	48.125	74.6562
Length   24, alignment  0, c  0:	47.2656	48.5469	74.3125
Length   25, alignment  1, c  0:	48.5938	51.1875	76.9688
Length   25, alignment  0, c  0:	47.8438	48.8125	76.2812
Length   26, alignment  2, c  0:	56	57.9375	78.9531
Length   26, alignment  0, c  0:	55.4531	57.125	79.25
Length   27, alignment  3, c  0:	55.9844	57.5156	80.6094
Length   27, alignment  0, c  0:	58.7031	61.1719	80.6562
Length   28, alignment  4, c  0:	47.7031	48.5156	82.2656
Length   28, alignment  0, c  0:	60.7188	61.8906	82.625
Length   29, alignment  5, c  0:	55.0469	55.8281	85
Length   29, alignment  0, c  0:	59.6562	60.4844	83.9375
Length   30, alignment  6, c  0:	60.0312	59.875	86.9375
Length   30, alignment  0, c  0:	65.3438	67.125	88.0312
Length   31, alignment  7, c  0:	64.6406	65.8281	88.3281
Length   31, alignment  0, c  0:	63.7344	65.2188	88.9688
Length   14, alignment  1, c  0:	55.9375	58.8438	54.6094
Length 1024, alignment  3, c  0:	197.062	196.828	2075.36
Length   64, alignment  4, c  0:	73.0625	74.2188	154.672
Length   25, alignment  2, c  0:	55.0312	56.5312	77
Length    1, alignment  0, c 65:	25.375	27.6562	21.4844
Length    2, alignment  0, c 65:	30.5469	32.8281	27.7812
Length    4, alignment  0, c 65:	41.5469	43.1719	44
Length    8, alignment  0, c 65:	44.8281	46.1562	43.3281
Length   16, alignment  0, c 65:	42.3125	44.5	58.9688
Length   32, alignment  0, c 65:	52.875	55.1875	90.5781
Length   64, alignment  0, c 65:	38.4688	40.8438	155.031
Length  128, alignment  0, c 65:	49.0781	50.5	283.031
Length  256, alignment  0, c 65:	66.6562	67.4688	538.625
Length  512, alignment  0, c 65:	99.5	100.938	1050.58
Length 1024, alignment  0, c 65:	177.953	179.766	2074.61
Length 2048, alignment  0, c 65:	372.688	375.625	4123
Length 4096, alignment  0, c 65:	718.938	719.766	8218.52
Length 8192, alignment  0, c 65:	1401.42	1403.64	16411.6
Length 16384, alignment  0, c 65:	2768.5	2770.7	32797.5
Length 32768, alignment  0, c 65:	5500.03	5497.83	65673.7
Length 65536, alignment  0, c 65:	10957.5	10959.2	131298
Length 131072, alignment  0, c 65:	21879.3	21880.3	262336
Length    1, alignment  1, c 65:	26.4375	27.2656	21.4219
Length    2, alignment  2, c 65:	31.7812	32.1875	27.4062
Length    3, alignment  3, c 65:	36.5625	37.5	42.3281
Length    3, alignment  0, c 65:	36.4219	38.2188	41.5469
Length    4, alignment  4, c 65:	42.2812	42.75	43.25
Length    5, alignment  5, c 65:	44.8281	46.0938	45.25
Length    5, alignment  0, c 65:	45.1719	46.4531	45.2031
Length    6, alignment  6, c 65:	47.4219	47.9062	47.5781
Length    6, alignment  0, c 65:	46.6875	48.4844	47.6875
Length    7, alignment  7, c 65:	41.5625	43.5312	40.6719
Length    7, alignment  0, c 65:	41.5625	43.7812	41.2812
Length    8, alignment  0, c 65:	44.2656	46.1875	43.3281
Length    9, alignment  1, c 65:	45.1875	47.8438	44.9688
Length    9, alignment  0, c 65:	46.2031	47.8438	44.2188
Length   10, alignment  2, c 65:	48.2188	50.1562	47.3281
Length   10, alignment  0, c 65:	48.2188	50.1094	46.9531
Length   11, alignment  3, c 65:	50.1875	51.7812	48.6094
Length   11, alignment  0, c 65:	49.5312	51.8125	49.2812
Length   12, alignment  4, c 65:	52.2656	54.5156	50.9688
Length   12, alignment  0, c 65:	51.8594	54.5781	50.9688
Length   13, alignment  5, c 65:	54.9688	56.2656	52.6406
Length   13, alignment  0, c 65:	53.8125	54.8594	51.9531
Length   14, alignment  6, c 65:	56.9375	57.8594	55.3438
Length   14, alignment  0, c 65:	56.5625	57.8594	53.8906
Length   15, alignment  7, c 65:	57.1875	59.8438	56.6094
Length   15, alignment  0, c 65:	58.2656	59.5781	56.2812
Length   16, alignment  0, c 65:	42.5312	43.5312	58.5938
Length   17, alignment  1, c 65:	39.7812	41.5156	61.0156
Length   17, alignment  0, c 65:	41.75	43.5469	61.0156
Length   18, alignment  2, c 65:	52.0312	53.5	63.0156
Length   18, alignment  0, c 65:	50.4844	50.5156	62.2031
Length   19, alignment  3, c 65:	49.8438	51.0469	64.5938
Length   19, alignment  0, c 65:	56.4844	56.8125	65.3594
Length   20, alignment  4, c 65:	42.0156	42.7812	66.2344
Length   20, alignment  0, c 65:	56.7969	57.0938	66.9219
Length   21, alignment  5, c 65:	49.2969	51.5469	69.3438
Length   21, alignment  0, c 65:	54.1875	55.5312	68.9844
Length   22, alignment  6, c 65:	57.3906	56.4219	70.5938
Length   22, alignment  0, c 65:	61.3438	63.1875	71.0312
Length   23, alignment  7, c 65:	61.4375	61.7812	72.6406
Length   23, alignment  0, c 65:	58.7344	59.875	72.3125
Length   24, alignment  0, c 65:	48.375	49.2188	75.4062
Length   24, alignment  0, c 65:	47.25	49.5781	74.6406
Length   25, alignment  1, c 65:	50.375	51.2031	77.7188
Length   25, alignment  0, c 65:	47.0469	48.8281	76.2812
Length   26, alignment  2, c 65:	55.75	56.875	79.3594
Length   26, alignment  0, c 65:	55.8125	56.875	79.3281
Length   27, alignment  3, c 65:	55.625	56.9375	81.375
Length   27, alignment  0, c 65:	58.4375	60.4219	80.9375
Length   28, alignment  4, c 65:	47.5938	48.0938	82.2031
Length   28, alignment  0, c 65:	61.9062	61.2188	83
Length   29, alignment  5, c 65:	54.8125	56.1719	84.2656
Length   29, alignment  0, c 65:	59.2812	61.4844	84.625
Length   30, alignment  6, c 65:	61.0625	59.5469	87.3438
Length   30, alignment  0, c 65:	65.0312	66.8125	86.9219
Length   31, alignment  7, c 65:	63.9844	65.8438	89.3438
Length   31, alignment  0, c 65:	64.1094	65.5469	89.3594
Length   14, alignment  1, c 65:	56.8906	57.5312	54.2188
Length 1024, alignment  3, c 65:	195.484	195.984	2073.91
Length   64, alignment  4, c 65:	72.7344	74.125	155.016
Length   25, alignment  2, c 65:	54.3125	55.2188	75.5469
Length    1, alignment  0, c 130:	24.5469	27.2188	22.5312
Length    2, alignment  0, c 130:	31.3438	32.1094	27.375
Length    4, alignment  0, c 130:	42.125	43.25	44.4375
Length    8, alignment  0, c 130:	44.4375	46.2656	43.7031
Length   16, alignment  0, c 130:	42.7812	44.1719	59
Length   32, alignment  0, c 130:	52.9219	54.9062	91.6875
Length   64, alignment  0, c 130:	39.2031	40.5156	154.328
Length  128, alignment  0, c 130:	49.9062	51.5	283.422
Length  256, alignment  0, c 130:	66.25	68.2656	538.656
Length  512, alignment  0, c 130:	103.719	105.875	1051.3
Length 1024, alignment  0, c 130:	177.047	180.516	2075.34
Length 2048, alignment  0, c 130:	377.562	379.547	4123.28
Length 4096, alignment  0, c 130:	718.797	721.406	8219.97
Length 8192, alignment  0, c 130:	1397.75	1400.19	16411.6
Length 16384, alignment  0, c 130:	2765.34	2766.45	32796.7
Length 32768, alignment  0, c 130:	5495.81	5497.3	65673.2
Length 65536, alignment  0, c 130:	10957.1	10958.6	131248
Length 131072, alignment  0, c 130:	21879.9	21881.3	262341
Length    1, alignment  1, c 130:	26.125	26.9062	21.7188
Length    2, alignment  2, c 130:	30.875	33.2188	27.7969
Length    3, alignment  3, c 130:	36.6719	37.2344	41.9688
Length    3, alignment  0, c 130:	37.2188	37.9688	41.6562
Length    4, alignment  4, c 130:	41.75	43.9375	43.2656
Length    5, alignment  5, c 130:	45.0625	46.2188	46
Length    5, alignment  0, c 130:	45.0312	46.5469	45.9375
Length    6, alignment  6, c 130:	46.6875	49.25	48.0625
Length    6, alignment  0, c 130:	46.7656	48.9688	47.9844
Length    7, alignment  7, c 130:	42.1875	43.8594	41.3125
Length    7, alignment  0, c 130:	41.8594	43.8594	40.9688
Length    8, alignment  0, c 130:	44.2656	45.4531	42.5781
Length    9, alignment  1, c 130:	46.2188	47.125	44.2969
Length    9, alignment  0, c 130:	44.8125	47.4375	44.2812
Length   10, alignment  2, c 130:	47.5	50.5938	45.875
Length   10, alignment  0, c 130:	47.1562	49.25	47.0156
Length   11, alignment  3, c 130:	50.3125	51.0469	49.0156
Length   11, alignment  0, c 130:	49.8906	51.5625	48.6406
Length   12, alignment  4, c 130:	51.8906	53.7656	50.6094
Length   12, alignment  0, c 130:	52.7188	53.4219	50.2812
Length   13, alignment  5, c 130:	53.8281	55.2656	52.9375
Length   13, alignment  0, c 130:	53.5625	55.5312	52.2031
Length   14, alignment  6, c 130:	56.1719	57.5625	55.3594
Length   14, alignment  0, c 130:	54.7969	57.4219	55.7188
Length   15, alignment  7, c 130:	58.2031	59.5625	56.5938
Length   15, alignment  0, c 130:	57.8125	59.5312	56.5625
Length   16, alignment  0, c 130:	41.7812	43.2344	58.2031
Length   17, alignment  1, c 130:	40.6562	42.4062	60.9531
Length   17, alignment  0, c 130:	41	42.9844	60.2031
Length   18, alignment  2, c 130:	51.6406	52.7812	63
Length   18, alignment  0, c 130:	49.6406	51.5469	62.9688
Length   19, alignment  3, c 130:	51.0156	51.5938	65.3281
Length   19, alignment  0, c 130:	55.7969	56.2344	64.5625
Length   20, alignment  4, c 130:	42.0781	43.1875	67.3438
Length   20, alignment  0, c 130:	57.4844	57.9062	67.3125
Length   21, alignment  5, c 130:	49.2656	50.3906	68.5625
Length   21, alignment  0, c 130:	53.9062	56.2344	69.3125
Length   22, alignment  6, c 130:	56.6406	57.25	71.6719
Length   22, alignment  0, c 130:	61.0156	63.25	70.4062
Length   23, alignment  7, c 130:	59.9219	62.9219	72.9688
Length   23, alignment  0, c 130:	57.9062	60.25	73.1094
Length   24, alignment  0, c 130:	47.2656	49.5625	74.2969
Length   24, alignment  0, c 130:	48.1562	49.1562	74.9375
Length   25, alignment  1, c 130:	50.0781	51.2344	77.3281
Length   25, alignment  0, c 130:	47.3281	48.9219	77.3281
Length   26, alignment  2, c 130:	56.4844	56.0625	79.0938
Length   26, alignment  0, c 130:	54.4844	56.0625	78.2031
Length   27, alignment  3, c 130:	55.5625	56.9844	80.0312
Length   27, alignment  0, c 130:	57.7344	60.2344	80.6406
Length   28, alignment  4, c 130:	47.75	48.6562	82.6562
Length   28, alignment  0, c 130:	61.8594	61.25	83.0156
Length   29, alignment  5, c 130:	54.9688	56.2812	85.3281
Length   29, alignment  0, c 130:	60.0781	61.25	85.0312
Length   30, alignment  6, c 130:	60.7188	60.2344	87.3438
Length   30, alignment  0, c 130:	65.0312	66.5938	86.9688
Length   31, alignment  7, c 130:	64.6875	65.5469	88.2344
Length   31, alignment  0, c 130:	64.1406	65.5625	88.2656
Length   14, alignment  1, c 130:	55.9531	57.2031	54.2656
Length 1024, alignment  3, c 130:	196.094	197.281	2075
Length   64, alignment  4, c 130:	72.5156	74.25	155.031
Length   25, alignment  2, c 130:	54.6406	56.625	77.1094

[-- Attachment #3: bench-memset.out.n32.hard.orig --]
[-- Type: text/plain, Size: 17940 bytes --]

                        	memset	builtin_memset	simple_memset
Length    1, alignment  0, c -65:	30.9531	28.6719	21.75
Length    2, alignment  0, c -65:	31.9844	33.9531	27.7812
Length    4, alignment  0, c -65:	42.25	44.3438	44
Length    8, alignment  0, c -65:	51.1094	52.4844	41.9375
Length   16, alignment  0, c -65:	36.1719	37.2344	57.9219
Length   32, alignment  0, c -65:	44.5469	44.5156	89.8594
Length   64, alignment  0, c -65:	56	56.8906	155.312
Length  128, alignment  0, c -65:	59.7031	59.8906	281.938
Length  256, alignment  0, c -65:	80.9688	82.4375	538.594
Length  512, alignment  0, c -65:	123.312	125.922	1051.36
Length 1024, alignment  0, c -65:	208.969	210.234	2074.55
Length 2048, alignment  0, c -65:	380.594	382.188	4123.69
Length 4096, alignment  0, c -65:	721.719	722	8219.14
Length 8192, alignment  0, c -65:	1404.56	1406.27	16412
Length 16384, alignment  0, c -65:	2770.7	2774.7	32797.9
Length 32768, alignment  0, c -65:	5503.94	5504.16	65682.4
Length 65536, alignment  0, c -65:	10963.8	10965.9	131221
Length 131072, alignment  0, c -65:	21886.1	21887.8	262347
Length    1, alignment  1, c -65:	26.9219	27.2188	21.4062
Length    2, alignment  2, c -65:	31.2031	33.5312	27.75
Length    3, alignment  3, c -65:	37.2031	38.9062	42.6719
Length    3, alignment  0, c -65:	36.5625	38.2344	41.9688
Length    4, alignment  4, c -65:	41.3125	43.2656	43.2812
Length    5, alignment  5, c -65:	48.1094	50.2969	45.5781
Length    5, alignment  0, c -65:	48.1094	50.2188	45.625
Length    6, alignment  6, c -65:	46.0938	47.0938	47.6406
Length    6, alignment  0, c -65:	46.0625	46.75	47.375
Length    7, alignment  7, c -65:	48.8125	50.125	41.0781
Length    7, alignment  0, c -65:	48.3906	49.6562	40.2812
Length    8, alignment  0, c -65:	50.7969	53.4531	43
Length    9, alignment  1, c -65:	53.75	56.1406	45.0312
Length    9, alignment  0, c -65:	54.5469	55.3125	45
Length   10, alignment  2, c -65:	56.8125	57.6719	47.375
Length   10, alignment  0, c -65:	56.4531	58.4375	47.4062
Length   11, alignment  3, c -65:	60.125	60.7344	49
Length   11, alignment  0, c -65:	59.9062	61.0781	49
Length   12, alignment  4, c -65:	61.7812	64.1719	50.6406
Length   12, alignment  0, c -65:	62.125	64.5156	50.6094
Length   13, alignment  5, c -65:	64.4219	66.1094	52.625
Length   13, alignment  0, c -65:	64.7969	66.4531	53.0156
Length   14, alignment  6, c -65:	68.6406	69.125	55.3906
Length   14, alignment  0, c -65:	67.0781	68.4219	54.625
Length   15, alignment  7, c -65:	69.8594	71.4531	56.3281
Length   15, alignment  0, c -65:	69.8281	71.5156	56.625
Length   16, alignment  0, c -65:	35.7969	37.6406	59.0625
Length   17, alignment  1, c -65:	41.8125	43.625	60.6875
Length   17, alignment  0, c -65:	35.8125	37.3125	60.3125
Length   18, alignment  2, c -65:	52.1719	53.9219	62.9531
Length   18, alignment  0, c -65:	41.8281	44.5625	62.9375
Length   19, alignment  3, c -65:	55.4375	57.5	65.6562
Length   19, alignment  0, c -65:	56.2344	57.7656	65.2812
Length   20, alignment  4, c -65:	37.0625	38.5781	66.9688
Length   20, alignment  0, c -65:	54.5	54.6406	66.5781
Length   21, alignment  5, c -65:	43.0312	44.2344	67.9375
Length   21, alignment  0, c -65:	60.5312	62.3906	68.25
Length   22, alignment  6, c -65:	58.9062	60.875	71.0156
Length   22, alignment  0, c -65:	56.3125	56.6562	70.625
Length   23, alignment  7, c -65:	55.8906	58.875	72.6875
Length   23, alignment  0, c -65:	59.0625	60.8594	73
Length   24, alignment  0, c -65:	35.3438	37.1875	74.25
Length   24, alignment  0, c -65:	35.2656	37.25	75.125
Length   25, alignment  1, c -65:	43.1094	44.5781	76.2344
Length   25, alignment  0, c -65:	36.0625	36.9375	76.2344
Length   26, alignment  2, c -65:	53.3906	55.25	78.6875
Length   26, alignment  0, c -65:	42.7031	44.5781	78.2656
Length   27, alignment  3, c -65:	57.125	58.4688	80.6562
Length   27, alignment  0, c -65:	55.8594	58.0781	80.6875
Length   28, alignment  4, c -65:	36.3906	37.6562	82.6719
Length   28, alignment  0, c -65:	59.625	61.1406	82.6562
Length   29, alignment  5, c -65:	44.0625	45.2656	84.6562
Length   29, alignment  0, c -65:	62.6406	64.1875	85.3281
Length   30, alignment  6, c -65:	59.9844	61.8125	87.3281
Length   30, alignment  0, c -65:	56.3438	56.9688	87.2656
Length   31, alignment  7, c -65:	56.5469	57.8438	89.6406
Length   31, alignment  0, c -65:	58.9219	60.1406	89.3594
Length   14, alignment  1, c -65:	67.8594	69	55.3906
Length 1024, alignment  3, c -65:	226.281	227.094	2074.62
Length   64, alignment  4, c -65:	71.7812	73.4375	154.969
Length   25, alignment  2, c -65:	57.25	59.2344	77.6875
Length    1, alignment  0, c  0:	26.0312	27.25	21.4062
Length    2, alignment  0, c  0:	31.875	32.8281	27.7812
Length    4, alignment  0, c  0:	42.5781	43.7344	43.9688
Length    8, alignment  0, c  0:	51.7656	52.2656	42.5938
Length   16, alignment  0, c  0:	35.1875	36.4688	59.3281
Length   32, alignment  0, c  0:	41	41.875	90.3281
Length   64, alignment  0, c  0:	53.0156	55.2188	154.609
Length  128, alignment  0, c  0:	59.0312	59.5	283.656
Length  256, alignment  0, c  0:	79.7969	80.8281	539.703
Length  512, alignment  0, c  0:	123.031	123.797	1050.97
Length 1024, alignment  0, c  0:	208	210.25	2075.33
Length 2048, alignment  0, c  0:	378.875	378.906	4122.61
Length 4096, alignment  0, c  0:	1055.81	721.703	8218.92
Length 8192, alignment  0, c  0:	1402.94	1404.17	16411.5
Length 16384, alignment  0, c  0:	2770.08	2771.44	32797
Length 32768, alignment  0, c  0:	5500.91	5502.78	65740.5
Length 65536, alignment  0, c  0:	10961.9	10963.1	131211
Length 131072, alignment  0, c  0:	21885	21886.1	262371
Length    1, alignment  1, c  0:	27.4844	27.3281	21.3594
Length    2, alignment  2, c  0:	32.2344	32.4688	27.0312
Length    3, alignment  3, c  0:	36.2188	38.0781	40.8594
Length    3, alignment  0, c  0:	37.3281	38.8281	42.2812
Length    4, alignment  4, c  0:	42.2969	43.7812	44.3594
Length    5, alignment  5, c  0:	48.7812	49.7344	45.6719
Length    5, alignment  0, c  0:	49.4375	50.6875	46.3125
Length    6, alignment  6, c  0:	46.4688	47.6875	47.2969
Length    6, alignment  0, c  0:	46.0781	47.3906	47.3125
Length    7, alignment  7, c  0:	48.7812	49.2812	41.0156
Length    7, alignment  0, c  0:	49.1406	50.7031	41
Length    8, alignment  0, c  0:	52	52.7188	41.9375
Length    9, alignment  1, c  0:	53.75	56.0938	45.7188
Length    9, alignment  0, c  0:	54.8281	55.7656	44.9844
Length   10, alignment  2, c  0:	57.5156	58.375	47.0156
Length   10, alignment  0, c  0:	56.8125	57.4062	46.6562
Length   11, alignment  3, c  0:	59.0625	60.7188	47.8906
Length   11, alignment  0, c  0:	59.4375	60.375	48.625
Length   12, alignment  4, c  0:	61.8281	63.8125	50.6094
Length   12, alignment  0, c  0:	62.4688	63.7031	51.3594
Length   13, alignment  5, c  0:	65.1094	66.7812	53.3438
Length   13, alignment  0, c  0:	64.2188	65.7344	51.9219
Length   14, alignment  6, c  0:	67.4375	69.0469	54.9531
Length   14, alignment  0, c  0:	68.5156	68.7031	55.3125
Length   15, alignment  7, c  0:	71.2031	71.0469	55.6094
Length   15, alignment  0, c  0:	70.5	71.4062	57.3906
Length   16, alignment  0, c  0:	35.7812	37.8438	57.9375
Length   17, alignment  1, c  0:	38.8594	40.4688	60.6562
Length   17, alignment  0, c  0:	35.1562	36.5156	61.3281
Length   18, alignment  2, c  0:	50.8438	50.875	63.6719
Length   18, alignment  0, c  0:	40.3594	41.8438	63.3438
Length   19, alignment  3, c  0:	53.75	54.0938	64.5625
Length   19, alignment  0, c  0:	52.9375	54.0469	64.9219
Length   20, alignment  4, c  0:	34.4844	36.5156	67.0156
Length   20, alignment  0, c  0:	56.0938	56.7969	66.1875
Length   21, alignment  5, c  0:	39.75	41.2188	68.2344
Length   21, alignment  0, c  0:	57.5469	58.6406	68.6875
Length   22, alignment  6, c  0:	55.3438	55.4219	70.25
Length   22, alignment  0, c  0:	54.9688	57.1094	71.4062
Length   23, alignment  7, c  0:	54.5938	56.4062	73
Length   23, alignment  0, c  0:	57.5625	59.4375	73.0312
Length   24, alignment  0, c  0:	34.2656	35.8438	75.3594
Length   24, alignment  0, c  0:	34.2656	35.5156	74.5938
Length   25, alignment  1, c  0:	41.2656	41.8125	76.5469
Length   25, alignment  0, c  0:	34.7188	36.5469	76.9688
Length   26, alignment  2, c  0:	50.5156	51.8438	79.2969
Length   26, alignment  0, c  0:	41.5	41.3125	79.2812
Length   27, alignment  3, c  0:	53.9062	56.0312	81.0156
Length   27, alignment  0, c  0:	53.5312	55.4688	81.3125
Length   28, alignment  4, c  0:	34.125	35.5625	83.7188
Length   28, alignment  0, c  0:	57.9844	58.6719	83.3281
Length   29, alignment  5, c  0:	40.3281	42.1406	84.5781
Length   29, alignment  0, c  0:	59.6875	59.7188	84.6875
Length   30, alignment  6, c  0:	56.5781	58.4062	87.3594
Length   30, alignment  0, c  0:	54.5625	56.125	87.0156
Length   31, alignment  7, c  0:	53.9062	54.3906	88.9688
Length   31, alignment  0, c  0:	56.5156	58.5	88.9688
Length   14, alignment  1, c  0:	67.5469	68.7188	54.2812
Length 1024, alignment  3, c  0:	223.141	224.703	2074.23
Length   64, alignment  4, c  0:	69.5938	69.4375	154.234
Length   25, alignment  2, c  0:	52.2031	52.4219	76.2344
Length    1, alignment  0, c 65:	26.3281	27.1406	21.75
Length    2, alignment  0, c 65:	31.25	33.2812	26.9688
Length    4, alignment  0, c 65:	42.1719	43.5312	43.5938
Length    8, alignment  0, c 65:	52.6562	52.9844	42.5625
Length   16, alignment  0, c 65:	37.625	38.2656	58.2969
Length   32, alignment  0, c 65:	43.6719	45.1875	90.2812
Length   64, alignment  0, c 65:	55.2812	56.1094	154.328
Length  128, alignment  0, c 65:	59.9844	60.5469	282.641
Length  256, alignment  0, c 65:	81.3906	82.5	538.641
Length  512, alignment  0, c 65:	124.031	125.875	1050.66
Length 1024, alignment  0, c 65:	209.344	210.828	2074.98
Length 2048, alignment  0, c 65:	379.5	380.688	4124.12
Length 4096, alignment  0, c 65:	721.703	722.453	8219.36
Length 8192, alignment  0, c 65:	1405.09	1405.03	16583.1
Length 16384, alignment  0, c 65:	2770.77	2772.28	32796.8
Length 32768, alignment  0, c 65:	5502.23	5503.12	65564.4
Length 65536, alignment  0, c 65:	10963	11083.9	131101
Length 131072, alignment  0, c 65:	21993.8	21887.5	262343
Length    1, alignment  1, c 65:	26.7188	28.9375	21.7031
Length    2, alignment  2, c 65:	31.875	33.1875	26.6719
Length    3, alignment  3, c 65:	36.9219	38.0938	41.6562
Length    3, alignment  0, c 65:	36.6562	37.2031	40.5938
Length    4, alignment  4, c 65:	42	43.3281	43.3125
Length    5, alignment  5, c 65:	48.0938	49.7188	45.2812
Length    5, alignment  0, c 65:	48.7812	50.7656	45.9375
Length    6, alignment  6, c 65:	46.4375	48.0312	47.9688
Length    6, alignment  0, c 65:	46.0781	47.8281	46.9531
Length    7, alignment  7, c 65:	48.75	50.4062	41
Length    7, alignment  0, c 65:	48.1875	50.3906	41.3438
Length    8, alignment  0, c 65:	50.75	52.4375	41.875
Length    9, alignment  1, c 65:	53.4688	55.3906	44.6562
Length    9, alignment  0, c 65:	54.4688	55.7031	45.7812
Length   10, alignment  2, c 65:	56.1562	59.4844	47.7344
Length   10, alignment  0, c 65:	56.7812	58.4375	46.6719
Length   11, alignment  3, c 65:	59.4688	60.75	49.3906
Length   11, alignment  0, c 65:	59.5	61.1094	48.7031
Length   12, alignment  4, c 65:	61.8438	63.4688	50.625
Length   12, alignment  0, c 65:	61.1094	62.7969	49.875
Length   13, alignment  5, c 65:	64.5	66.125	53.3438
Length   13, alignment  0, c 65:	64.0938	65.7656	51.9219
Length   14, alignment  6, c 65:	67.1562	68.3281	53.9062
Length   14, alignment  0, c 65:	66.5781	68.3438	54.3125
Length   15, alignment  7, c 65:	70.125	72.1406	57.0469
Length   15, alignment  0, c 65:	69.7656	71.3594	56.625
Length   16, alignment  0, c 65:	36.1406	38.125	59.6875
Length   17, alignment  1, c 65:	42.125	43.8438	60.2188
Length   17, alignment  0, c 65:	36.3906	37.5156	61.0156
Length   18, alignment  2, c 65:	52.4688	52.8281	62.625
Length   18, alignment  0, c 65:	42.3906	44.1875	63
Length   19, alignment  3, c 65:	55.8438	56.1094	63.8281
Length   19, alignment  0, c 65:	56.2344	57.4062	65.3594
Length   20, alignment  4, c 65:	37.4375	38.125	67
Length   20, alignment  0, c 65:	52.4531	54.8438	66.2188
Length   21, alignment  5, c 65:	42.7812	43.8281	69
Length   21, alignment  0, c 65:	61.6719	61.7344	68.25
Length   22, alignment  6, c 65:	59.0469	60.1406	70.625
Length   22, alignment  0, c 65:	56.625	58.5	71.3594
Length   23, alignment  7, c 65:	57.625	58.1875	73.2969
Length   23, alignment  0, c 65:	59.9688	60.7656	73.8281
Length   24, alignment  0, c 65:	36	37.625	75.5938
Length   24, alignment  0, c 65:	35.6562	36.8438	74.6094
Length   25, alignment  1, c 65:	43.1875	43.7656	76.9219
Length   25, alignment  0, c 65:	35.6562	37.1875	77.3281
Length   26, alignment  2, c 65:	53.7188	55.4844	78.3594
Length   26, alignment  0, c 65:	43.0312	43.8281	78.2812
Length   27, alignment  3, c 65:	56.6719	57.6406	81.2969
Length   27, alignment  0, c 65:	57.9688	58.3594	81.2656
Length   28, alignment  4, c 65:	36.75	37.3594	82.625
Length   28, alignment  0, c 65:	59.2031	60.7344	83.6562
Length   29, alignment  5, c 65:	43.6719	44.4219	84.2812
Length   29, alignment  0, c 65:	61.4688	62.6406	84.5781
Length   30, alignment  6, c 65:	60.5625	62.0781	87
Length   30, alignment  0, c 65:	55.2188	57.7812	87.0312
Length   31, alignment  7, c 65:	55.8438	57.375	88.9688
Length   31, alignment  0, c 65:	58.1875	59.7188	88.625
Length   14, alignment  1, c 65:	67.5781	69.0625	53.8906
Length 1024, alignment  3, c 65:	226.141	227.75	2074.98
Length   64, alignment  4, c 65:	72.1562	73.3438	154.984
Length   25, alignment  2, c 65:	56.8438	58.0625	76.2344
Length    1, alignment  0, c 130:	27.8594	28.125	20.6719
Length    2, alignment  0, c 130:	31.1719	32.875	27.7188
Length    4, alignment  0, c 130:	43.0781	43.4219	42.8906
Length    8, alignment  0, c 130:	51.1562	53.375	42.9531
Length   16, alignment  0, c 130:	35.8438	38.2344	58.9844
Length   32, alignment  0, c 130:	43.5938	45.5781	90.5938
Length   64, alignment  0, c 130:	55	57.5156	154.609
Length  128, alignment  0, c 130:	59.0312	60.5469	282.922
Length  256, alignment  0, c 130:	81.375	83.2344	539.312
Length  512, alignment  0, c 130:	123.641	124.844	1051.66
Length 1024, alignment  0, c 130:	209	209.547	2074.64
Length 2048, alignment  0, c 130:	380.203	382.109	4123.02
Length 4096, alignment  0, c 130:	720.25	722.953	8218.62
Length 8192, alignment  0, c 130:	1403.92	1406.56	16411.3
Length 16384, alignment  0, c 130:	2771.41	2773.3	32968.3
Length 32768, alignment  0, c 130:	5501.72	5502.95	65564.4
Length 65536, alignment  0, c 130:	10962.7	10964.9	131222
Length 131072, alignment  0, c 130:	21886	21889.1	262277
Length    1, alignment  1, c 130:	26.6406	26.5312	20.9844
Length    2, alignment  2, c 130:	31.2031	32.75	27.7656
Length    3, alignment  3, c 130:	36.5	38.5156	41.25
Length    3, alignment  0, c 130:	36.75	38.4375	41.2344
Length    4, alignment  4, c 130:	42.6406	43.75	43.9375
Length    5, alignment  5, c 130:	48.7656	51.4219	45.9688
Length    5, alignment  0, c 130:	49.1875	50.8125	46.0156
Length    6, alignment  6, c 130:	46.4688	47.6406	47.6094
Length    6, alignment  0, c 130:	46.9531	47.4688	49.0312
Length    7, alignment  7, c 130:	48.7656	50.5469	40.625
Length    7, alignment  0, c 130:	49.4688	51.1562	40.5781
Length    8, alignment  0, c 130:	51.1719	52.0625	41.875
Length    9, alignment  1, c 130:	53.4531	55.4844	43.8906
Length    9, alignment  0, c 130:	53.4844	54.7969	43.9219
Length   10, alignment  2, c 130:	55.7969	57.4375	46.2812
Length   10, alignment  0, c 130:	56.125	57.7656	46.9844
Length   11, alignment  3, c 130:	58.8125	60.4375	48.5938
Length   11, alignment  0, c 130:	58.7969	60.4688	48.3281
Length   12, alignment  4, c 130:	61.9219	63.125	50.6719
Length   12, alignment  0, c 130:	61.5	63.4219	50.5781
Length   13, alignment  5, c 130:	64.125	66.8594	52.2031
Length   13, alignment  0, c 130:	64.4844	66.5469	52.5469
Length   14, alignment  6, c 130:	67.1562	68.8438	54.625
Length   14, alignment  0, c 130:	67.5	69.5156	54.2969
Length   15, alignment  7, c 130:	69.7812	71.5312	56.6406
Length   15, alignment  0, c 130:	69.8125	71.7969	56.5781
Length   16, alignment  0, c 130:	36.125	38.6562	58.9688
Length   17, alignment  1, c 130:	41.9844	42.9531	60.9531
Length   17, alignment  0, c 130:	36.0469	37.625	61.2969
Length   18, alignment  2, c 130:	52.4688	53.625	62.9688
Length   18, alignment  0, c 130:	42.2656	43.25	62.5938
Length   19, alignment  3, c 130:	54.4062	56.5156	64.625
Length   19, alignment  0, c 130:	55.25	57.2188	63.5625
Length   20, alignment  4, c 130:	35.8281	37.9062	67.1562
Length   20, alignment  0, c 130:	53.9531	54.9062	66.9062
Length   21, alignment  5, c 130:	42.4688	44.2031	68.5781
Length   21, alignment  0, c 130:	60.9219	63.5625	69.7031
Length   22, alignment  6, c 130:	59.0625	61.2344	70.9688
Length   22, alignment  0, c 130:	56.6875	57.5312	70.2812
Length   23, alignment  7, c 130:	56.2344	57.8125	73.2812
Length   23, alignment  0, c 130:	58.9688	59.9062	72.9219
Length   24, alignment  0, c 130:	35.8125	37.5312	74.5781
Length   24, alignment  0, c 130:	35.8906	36.5938	74.9531
Length   25, alignment  1, c 130:	43.1562	43.7656	76.2656
Length   25, alignment  0, c 130:	36	37.625	76.6094
Length   26, alignment  2, c 130:	53.3438	54.5625	78.625
Length   26, alignment  0, c 130:	43.4062	45.5781	79.3281
Length   27, alignment  3, c 130:	56.9688	58.4531	82
Length   27, alignment  0, c 130:	56.8906	57.4688	80.9375
Length   28, alignment  4, c 130:	36.3906	38.6562	83.0312
Length   28, alignment  0, c 130:	59.1719	61.1406	82.625
Length   29, alignment  5, c 130:	43.3125	44.4375	84.9375
Length   29, alignment  0, c 130:	61.7969	63.7812	85.3281
Length   30, alignment  6, c 130:	59.4531	61.4844	86.25
Length   30, alignment  0, c 130:	55.4375	57.1562	87.6875
Length   31, alignment  7, c 130:	56.5	57.75	89.2969
Length   31, alignment  0, c 130:	57.7812	58.9844	88.9375
Length   14, alignment  1, c 130:	67.1094	68.0781	54.6406
Length 1024, alignment  3, c 130:	225.953	227.125	2074.19
Length   64, alignment  4, c 130:	71.4062	72.9531	153.828
Length   25, alignment  2, c 130:	56.9062	57.7969	76.5938

[-- Attachment #4: bench-memset.out.n64.hard.new --]
[-- Type: text/plain, Size: 17996 bytes --]

                        	memset	builtin_memset	simple_memset
Length    1, alignment  0, c -65:	29.2969	29.2031	21.4688
Length    2, alignment  0, c -65:	27.5	34.5312	27.0156
Length    4, alignment  0, c -65:	37.8594	44.4062	39.2969
Length    8, alignment  0, c -65:	41.1719	46.9688	39.375
Length   16, alignment  0, c -65:	38.6562	45.1875	55.2188
Length   32, alignment  0, c -65:	49.2969	55.4531	86.5469
Length   64, alignment  0, c -65:	34.7188	40.8594	151.625
Length  128, alignment  0, c -65:	45.3594	57.4844	279.719
Length  256, alignment  0, c -65:	63.1406	76.0469	535.297
Length  512, alignment  0, c -65:	95.8125	98.8594	1047.66
Length 1024, alignment  0, c -65:	173.734	178.984	2071.69
Length 2048, alignment  0, c -65:	370.094	376.781	4119.69
Length 4096, alignment  0, c -65:	711.484	718.438	8216.38
Length 8192, alignment  0, c -65:	1394.89	1402.41	16409.3
Length 16384, alignment  0, c -65:	2761.36	2769.2	33069
Length 32768, alignment  0, c -65:	5492.77	5498.34	65564.8
Length 65536, alignment  0, c -65:	10955.2	10961	131372
Length 131072, alignment  0, c -65:	21876.4	22003.1	262278
Length    1, alignment  1, c -65:	23.7344	28.0938	20.75
Length    2, alignment  2, c -65:	26.8438	35.3438	26.625
Length    3, alignment  3, c -65:	32.5938	40.125	41.3906
Length    3, alignment  0, c -65:	33.5156	39.8281	40.9844
Length    4, alignment  4, c -65:	39.1875	45.8594	42.3125
Length    5, alignment  5, c -65:	41.1406	47.5312	35.4688
Length    5, alignment  0, c -65:	41.1406	47.7656	35.4688
Length    6, alignment  6, c -65:	43.5625	49.9375	38.2188
Length    6, alignment  0, c -65:	43.8125	50.5938	38.25
Length    7, alignment  7, c -65:	38.625	45.6562	40.5938
Length    7, alignment  0, c -65:	38.8906	45.5625	40.25
Length    8, alignment  0, c -65:	40.875	46.9062	41.875
Length    9, alignment  1, c -65:	42.875	48.6094	44.25
Length    9, alignment  0, c -65:	42.875	49.1875	44.625
Length   10, alignment  2, c -65:	44.5312	50.9375	45.8594
Length   10, alignment  0, c -65:	44.75	50.5469	45.875
Length   11, alignment  3, c -65:	47.5938	52.5781	48.6094
Length   11, alignment  0, c -65:	47.1875	52.6406	47.8594
Length   12, alignment  4, c -65:	48.4688	54.8906	49.0938
Length   12, alignment  0, c -65:	48.5156	54.1406	50.2188
Length   13, alignment  5, c -65:	50.875	56.8906	52.25
Length   13, alignment  0, c -65:	50.9062	57.5625	52.9219
Length   14, alignment  6, c -65:	52.8438	59.4375	53.8594
Length   14, alignment  0, c -65:	52.9062	58.7656	53.1562
Length   15, alignment  7, c -65:	54.1719	60.4375	55.8594
Length   15, alignment  0, c -65:	54.5312	60.2812	55.875
Length   16, alignment  0, c -65:	38.4531	44.8125	57.875
Length   17, alignment  1, c -65:	37.6562	44.9219	59.8125
Length   17, alignment  0, c -65:	39.0156	50.875	59.8281
Length   18, alignment  2, c -65:	49.2188	55.8906	61.8438
Length   18, alignment  0, c -65:	43.9844	49.1562	61.8281
Length   19, alignment  3, c -65:	53.25	53.9219	63.8594
Length   19, alignment  0, c -65:	49	54.8594	63.8594
Length   20, alignment  4, c -65:	38.3125	44.7812	65.8594
Length   20, alignment  0, c -65:	59.0312	59.9375	66.9844
Length   21, alignment  5, c -65:	42.9688	49.5156	67.125
Length   21, alignment  0, c -65:	61.0156	68.2969	68.2656
Length   22, alignment  6, c -65:	59.2656	61.3594	70.2344
Length   22, alignment  0, c -65:	63	71.2344	69.4531
Length   23, alignment  7, c -65:	62.3281	72.3438	72.2188
Length   23, alignment  0, c -65:	65.2656	72.0938	72.2344
Length   24, alignment  0, c -65:	44.3906	50.2656	74.2344
Length   24, alignment  0, c -65:	44.2031	50.8594	73.9062
Length   25, alignment  1, c -65:	43.0312	49.1875	75.8438
Length   25, alignment  0, c -65:	43.625	57.0469	75.1562
Length   26, alignment  2, c -65:	57.75	60.6094	78.75
Length   26, alignment  0, c -65:	49.3438	55.5938	78.2344
Length   27, alignment  3, c -65:	62.4844	70.4375	79.875
Length   27, alignment  0, c -65:	53.9688	60.5312	79.9375
Length   28, alignment  4, c -65:	42.9844	50.5625	81.875
Length   28, alignment  0, c -65:	64.1875	65.8906	82.2344
Length   29, alignment  5, c -65:	48.5938	54.5	84.5312
Length   29, alignment  0, c -65:	66.375	76.1875	84.2656
Length   30, alignment  6, c -65:	63.5312	66.0469	86.25
Length   30, alignment  0, c -65:	68.2656	77.625	86.5938
Length   31, alignment  7, c -65:	67.2812	77.625	88.2188
Length   31, alignment  0, c -65:	70.6406	79.5781	88.2344
Length   14, alignment  1, c -65:	52.9062	60.0156	54.25
Length 1024, alignment  3, c -65:	192.188	227.156	2074.98
Length   64, alignment  4, c -65:	70.6562	76.5625	154.594
Length   25, alignment  2, c -65:	49.0938	55.2031	75.8125
Length    1, alignment  0, c  0:	22.2969	28.5625	21.125
Length    2, alignment  0, c  0:	28.5938	34.4688	25.9531
Length    4, alignment  0, c  0:	38.8281	44.3906	43.3438
Length    8, alignment  0, c  0:	41.3125	46.8125	42.2344
Length   16, alignment  0, c  0:	39.1719	45.1562	58.2188
Length   32, alignment  0, c  0:	50.2344	55.5312	88.75
Length   64, alignment  0, c  0:	35.375	40.8281	153.828
Length  128, alignment  0, c  0:	46.3594	56.5625	281.875
Length  256, alignment  0, c  0:	63.5312	77.4844	538.25
Length  512, alignment  0, c  0:	94.625	98.8281	1050.23
Length 1024, alignment  0, c  0:	171.094	177.562	2073.53
Length 2048, alignment  0, c  0:	370	377.062	4122.17
Length 4096, alignment  0, c  0:	711.875	717.625	8218.44
Length 8192, alignment  0, c  0:	1394.92	1399.95	16410.8
Length 16384, alignment  0, c  0:	2760.77	2768.11	32796
Length 32768, alignment  0, c  0:	5492.73	5500.73	65564.3
Length 65536, alignment  0, c  0:	10955.4	10960.7	131222
Length 131072, alignment  0, c  0:	21876.1	21883.2	262323
Length    1, alignment  1, c  0:	23.7344	28.0781	21.4375
Length    2, alignment  2, c  0:	27.4688	33.5781	27.3594
Length    3, alignment  3, c  0:	37.4062	44.1719	40.7031
Length    3, alignment  0, c  0:	38.5	44.5938	40.3125
Length    4, alignment  4, c  0:	40.1406	45.9219	42.75
Length    5, alignment  5, c  0:	41.9062	47.7812	35.4844
Length    5, alignment  0, c  0:	42.5469	48.2031	36.5938
Length    6, alignment  6, c  0:	37.9062	42.5312	37.8125
Length    6, alignment  0, c  0:	37.1875	43.5	38.2188
Length    7, alignment  7, c  0:	39.7031	45.2031	40.625
Length    7, alignment  0, c  0:	39.2969	45.2188	39.8594
Length    8, alignment  0, c  0:	41.2031	47.1562	42.9219
Length    9, alignment  1, c  0:	43.875	48.6406	44.5781
Length    9, alignment  0, c  0:	43.8906	48.625	43.8594
Length   10, alignment  2, c  0:	44.4844	51.1875	45.5156
Length   10, alignment  0, c  0:	45.2031	51.875	45.5312
Length   11, alignment  3, c  0:	47.1094	52.5156	47.8438
Length   11, alignment  0, c  0:	46.875	52.375	48.2344
Length   12, alignment  4, c  0:	49.2812	54.1719	49.875
Length   12, alignment  0, c  0:	48.25	54.9375	50.5469
Length   13, alignment  5, c  0:	51.9531	56.7344	51.5
Length   13, alignment  0, c  0:	50.2031	56.5312	52.625
Length   14, alignment  6, c  0:	52.8906	58.875	55.2969
Length   14, alignment  0, c  0:	53.25	59.5312	53.1562
Length   15, alignment  7, c  0:	54.2188	60.8281	55.8125
Length   15, alignment  0, c  0:	54.5156	60.4062	55.8594
Length   16, alignment  0, c  0:	38.375	45.5781	57.5312
Length   17, alignment  1, c  0:	36.4062	43.1875	59.7969
Length   17, alignment  0, c  0:	39.0781	51.0625	59.5469
Length   18, alignment  2, c  0:	48.625	54.6406	61.8594
Length   18, alignment  0, c  0:	44.3594	50.1406	61.9219
Length   19, alignment  3, c  0:	53.625	60.2188	64.1875
Length   19, alignment  0, c  0:	50	55.2344	64.5781
Length   20, alignment  4, c  0:	39.625	43.7031	66.2031
Length   20, alignment  0, c  0:	60.3125	60.5	66.2656
Length   21, alignment  5, c  0:	42.6094	48.8125	68.2188
Length   21, alignment  0, c  0:	61.7031	68.5156	68.2656
Length   22, alignment  6, c  0:	57.8125	59.8594	70.2188
Length   22, alignment  0, c  0:	63.2344	70.4062	70.2031
Length   23, alignment  7, c  0:	62.25	71.2969	72.5625
Length   23, alignment  0, c  0:	65.6094	72.5156	71.8281
Length   24, alignment  0, c  0:	44.5938	49.8125	74.25
Length   24, alignment  0, c  0:	45.6875	49.8125	73.1094
Length   25, alignment  1, c  0:	43.0938	48.4062	76.2344
Length   25, alignment  0, c  0:	46.2656	57.9062	75.9219
Length   26, alignment  2, c  0:	58.1406	59.25	78.2344
Length   26, alignment  0, c  0:	49.2969	55.4375	78.8906
Length   27, alignment  3, c  0:	62.6562	70.125	80.6094
Length   27, alignment  0, c  0:	55.4062	60.1719	79.9062
Length   28, alignment  4, c  0:	45.0156	50.1562	82.1875
Length   28, alignment  0, c  0:	63.8594	65.5469	82.2656
Length   29, alignment  5, c  0:	49.4062	53.4531	83.5
Length   29, alignment  0, c  0:	66.3125	76.1875	84.5625
Length   30, alignment  6, c  0:	63.8906	64.7969	86.2188
Length   30, alignment  0, c  0:	68.625	77.5938	86.2656
Length   31, alignment  7, c  0:	67.625	77.5938	88.5781
Length   31, alignment  0, c  0:	70.625	79.8594	88.2656
Length   14, alignment  1, c  0:	52.4531	57.8906	53.5469
Length 1024, alignment  3, c  0:	193.328	225.672	2074.97
Length   64, alignment  4, c  0:	69.3125	75.4375	153.531
Length   25, alignment  2, c  0:	48.9062	54.7188	75.8594
Length    1, alignment  0, c 65:	23.4375	28.4688	21.0781
Length    2, alignment  0, c 65:	27.4375	34.3906	26.625
Length    4, alignment  0, c 65:	37.7188	44.8438	42.6875
Length    8, alignment  0, c 65:	40.875	47.1562	41.8906
Length   16, alignment  0, c 65:	38.75	45.5	57.8594
Length   32, alignment  0, c 65:	49.9531	56.5312	89.9062
Length   64, alignment  0, c 65:	34.6562	40.7188	153.844
Length  128, alignment  0, c 65:	46.375	56.2188	282.562
Length  256, alignment  0, c 65:	62.9688	76.3906	538.219
Length  512, alignment  0, c 65:	96.2188	98.7031	1049.84
Length 1024, alignment  0, c 65:	170.812	177.094	2074.23
Length 2048, alignment  0, c 65:	369.875	377.062	4122.59
Length 4096, alignment  0, c 65:	710.844	718.406	8218.91
Length 8192, alignment  0, c 65:	1394.36	1401.84	16410.4
Length 16384, alignment  0, c 65:	2760.7	2768.23	32795.4
Length 32768, alignment  0, c 65:	5492.77	5498.69	65686
Length 65536, alignment  0, c 65:	10954.2	10959.9	131209
Length 131072, alignment  0, c 65:	21876.7	21884	262348
Length    1, alignment  1, c 65:	22.7656	27.7812	21.3906
Length    2, alignment  2, c 65:	28.2031	33.6875	27.0469
Length    3, alignment  3, c 65:	37.7812	43.5312	40.6875
Length    3, alignment  0, c 65:	37.125	43.9062	41.3906
Length    4, alignment  4, c 65:	39.5156	45.7344	42.6562
Length    5, alignment  5, c 65:	41.8281	48.4844	36.5625
Length    5, alignment  0, c 65:	41.8906	48.4844	35.8438
Length    6, alignment  6, c 65:	36.5625	42.8906	38.25
Length    6, alignment  0, c 65:	37.6562	43.4844	38.25
Length    7, alignment  7, c 65:	38.8438	45.1562	39.875
Length    7, alignment  0, c 65:	39.2656	45.4844	40.2188
Length    8, alignment  0, c 65:	40.8281	47.1562	42.25
Length    9, alignment  1, c 65:	43.2188	49.2031	43.8594
Length    9, alignment  0, c 65:	43.25	49.2031	43.8906
Length   10, alignment  2, c 65:	45.6406	50.4062	45.8281
Length   10, alignment  0, c 65:	44.5312	51.2188	45.875
Length   11, alignment  3, c 65:	46.6094	52.7969	47.5312
Length   11, alignment  0, c 65:	47.6562	53.2188	48.25
Length   12, alignment  4, c 65:	49.2656	55.2031	50.6094
Length   12, alignment  0, c 65:	50.7656	54.2344	49.9062
Length   13, alignment  5, c 65:	50.7656	56.4219	51.4531
Length   13, alignment  0, c 65:	49.7656	56.4375	51.5
Length   14, alignment  6, c 65:	52.9062	58.5625	53.4844
Length   14, alignment  0, c 65:	52.6094	58.8281	53.8594
Length   15, alignment  7, c 65:	54.9844	60.8594	56.25
Length   15, alignment  0, c 65:	54.5625	60.5156	55.8125
Length   16, alignment  0, c 65:	39.4844	44.7188	58.2188
Length   17, alignment  1, c 65:	38.7656	44.1719	59.5625
Length   17, alignment  0, c 65:	37.2812	51.5156	59.9375
Length   18, alignment  2, c 65:	49.0781	55.1406	62.2656
Length   18, alignment  0, c 65:	43.2656	49.7812	61.875
Length   19, alignment  3, c 65:	53.3125	53.5625	64.25
Length   19, alignment  0, c 65:	49.0938	55.1562	64.5781
Length   20, alignment  4, c 65:	38.6406	46.1406	65.875
Length   20, alignment  0, c 65:	59.3594	60.8594	65.9375
Length   21, alignment  5, c 65:	43.2812	50.125	68.5781
Length   21, alignment  0, c 65:	61.4844	68.9062	68.2344
Length   22, alignment  6, c 65:	58.0938	60.4688	70.2031
Length   22, alignment  0, c 65:	64.0625	71.1562	69.9062
Length   23, alignment  7, c 65:	62.6719	70.8906	72
Length   23, alignment  0, c 65:	65.3438	73.125	72.8906
Length   24, alignment  0, c 65:	43.9062	50.8281	73.4844
Length   24, alignment  0, c 65:	44.2031	50.4844	74.2656
Length   25, alignment  1, c 65:	44.0781	49.5	77.2812
Length   25, alignment  0, c 65:	43.9062	56.8125	76.2656
Length   26, alignment  2, c 65:	57.5156	59.875	78.9688
Length   26, alignment  0, c 65:	49.6719	54.8438	78.5938
Length   27, alignment  3, c 65:	62.6562	70.5469	80.625
Length   27, alignment  0, c 65:	54.3125	60.1719	79.8906
Length   28, alignment  4, c 65:	43.6719	50.8125	82.6094
Length   28, alignment  0, c 65:	65	66.5625	81.9062
Length   29, alignment  5, c 65:	48.625	54.5469	83.4844
Length   29, alignment  0, c 65:	66.7812	75.8281	83.8594
Length   30, alignment  6, c 65:	64.7812	65.875	86.2188
Length   30, alignment  0, c 65:	68.9219	77.7969	86.625
Length   31, alignment  7, c 65:	69.1719	77.8594	87.8438
Length   31, alignment  0, c 65:	70.2969	80.1562	87.5
Length   14, alignment  1, c 65:	52.6094	58.9062	53.8594
Length 1024, alignment  3, c 65:	191.344	227.359	2074.61
Length   64, alignment  4, c 65:	70.8125	75.8594	153.875
Length   25, alignment  2, c 65:	49.7812	55.0781	75.875
Length    1, alignment  0, c 130:	22.25	29.5781	21.4844
Length    2, alignment  0, c 130:	27.8438	35.3125	27.0156
Length    4, alignment  0, c 130:	39.1094	45.6094	43.0625
Length    8, alignment  0, c 130:	41.2656	47.5781	42.2188
Length   16, alignment  0, c 130:	39.375	45.9375	57.9219
Length   32, alignment  0, c 130:	49.8125	55.4688	89.8906
Length   64, alignment  0, c 130:	35.0469	41.1875	154.219
Length  128, alignment  0, c 130:	46.0469	56.4844	281.453
Length  256, alignment  0, c 130:	62.8594	76.5312	537.875
Length  512, alignment  0, c 130:	95.2656	97.5625	1050.59
Length 1024, alignment  0, c 130:	173.094	180.297	2074.27
Length 2048, alignment  0, c 130:	370.859	376.422	4122.61
Length 4096, alignment  0, c 130:	710.797	718.5	8218.75
Length 8192, alignment  0, c 130:	1394.5	1403.56	16410.2
Length 16384, alignment  0, c 130:	2761.34	2767.7	32796
Length 32768, alignment  0, c 130:	5491.73	5499.86	65686.5
Length 65536, alignment  0, c 130:	10953.3	10960.3	131208
Length 131072, alignment  0, c 130:	21876	21883	262353
Length    1, alignment  1, c 130:	23.0156	28.5938	20.7656
Length    2, alignment  2, c 130:	27.1719	34.2031	26.6562
Length    3, alignment  3, c 130:	33.1875	39.5	41.3281
Length    3, alignment  0, c 130:	33.2188	40.2031	41.0781
Length    4, alignment  4, c 130:	38.4844	44.875	42.7031
Length    5, alignment  5, c 130:	42.7344	48.3281	36.25
Length    5, alignment  0, c 130:	40.8906	48.2812	35.875
Length    6, alignment  6, c 130:	43.5938	50.0938	37.4844
Length    6, alignment  0, c 130:	43.8438	50.5312	37.875
Length    7, alignment  7, c 130:	38.7812	44.1875	39.4688
Length    7, alignment  0, c 130:	39.1719	44.375	40.5625
Length    8, alignment  0, c 130:	41.125	46.9062	41.4844
Length    9, alignment  1, c 130:	42.8594	49.875	43.4844
Length    9, alignment  0, c 130:	43.4375	48.5312	43.4375
Length   10, alignment  2, c 130:	44.0312	51.7969	46.1719
Length   10, alignment  0, c 130:	45.1875	51.0625	45.4531
Length   11, alignment  3, c 130:	48.5625	52.6094	48.5625
Length   11, alignment  0, c 130:	46.8906	51.875	48.2188
Length   12, alignment  4, c 130:	49.25	55.2188	50.25
Length   12, alignment  0, c 130:	49.5469	55.5938	50.2188
Length   13, alignment  5, c 130:	51.1875	57.6406	51.8438
Length   13, alignment  0, c 130:	51.2031	56.9062	51.875
Length   14, alignment  6, c 130:	52.9688	59.9062	54.25
Length   14, alignment  0, c 130:	52.8281	58.5312	53.4688
Length   15, alignment  7, c 130:	54.5781	60.9375	56.2344
Length   15, alignment  0, c 130:	54.8281	61.4375	55.5
Length   16, alignment  0, c 130:	39.0312	45.1562	57.8438
Length   17, alignment  1, c 130:	38.6562	44.1719	59.4844
Length   17, alignment  0, c 130:	38.6562	50.8281	60.25
Length   18, alignment  2, c 130:	50.3906	56.5938	62.9375
Length   18, alignment  0, c 130:	42.9531	49.8594	61.9219
Length   19, alignment  3, c 130:	53.9844	54.3281	64.9844
Length   19, alignment  0, c 130:	49.6719	55.875	65.0938
Length   20, alignment  4, c 130:	39.0625	44.7344	65.9062
Length   20, alignment  0, c 130:	58.5156	60.25	66.5938
Length   21, alignment  5, c 130:	42.625	48.875	68.3125
Length   21, alignment  0, c 130:	61.5938	67.9375	67.1562
Length   22, alignment  6, c 130:	57.8125	60.625	70.9531
Length   22, alignment  0, c 130:	62.9531	72.3125	70.6094
Length   23, alignment  7, c 130:	62.3125	70.5938	71.8906
Length   23, alignment  0, c 130:	64.9375	72.9062	71.8125
Length   24, alignment  0, c 130:	44.2344	50.1094	74.2656
Length   24, alignment  0, c 130:	44.6875	50.2031	74.9688
Length   25, alignment  1, c 130:	42.3906	49.1719	76.25
Length   25, alignment  0, c 130:	43.5781	56.9375	76.2188
Length   26, alignment  2, c 130:	58.0625	60.2812	78.2188
Length   26, alignment  0, c 130:	49.25	54.875	78.25
Length   27, alignment  3, c 130:	62.2969	70.2656	80.625
Length   27, alignment  0, c 130:	54.2812	60.1719	79.8281
Length   28, alignment  4, c 130:	43.625	50.1875	82.9375
Length   28, alignment  0, c 130:	64.2031	66.6094	82.2656
Length   29, alignment  5, c 130:	48.7969	55.6094	83.4688
Length   29, alignment  0, c 130:	65.6562	75.5781	84.25
Length   30, alignment  6, c 130:	63.9062	65.6406	86.25
Length   30, alignment  0, c 130:	69.3438	78.2656	85.8906
Length   31, alignment  7, c 130:	68.0938	78.5781	88.25
Length   31, alignment  0, c 130:	71.0312	79.9062	88.25
Length   14, alignment  1, c 130:	53.2969	59.4375	54.25
Length 1024, alignment  3, c 130:	192.891	227.25	2075.08
Length   64, alignment  4, c 130:	69.3594	76.4531	153.859
Length   25, alignment  2, c 130:	48.6562	55.5312	75.5156

[-- Attachment #5: bench-memset.out.n64.hard.orig --]
[-- Type: text/plain, Size: 18044 bytes --]

                        	memset	builtin_memset	simple_memset
Length    1, alignment  0, c -65:	29.5469	31.125	20.8594
Length    2, alignment  0, c -65:	28.1719	35.1875	26.5938
Length    4, alignment  0, c -65:	39.9375	45.1406	39.2812
Length    8, alignment  0, c -65:	48.375	51.6406	39.6719
Length   16, alignment  0, c -65:	32.3438	39.3594	55.2812
Length   32, alignment  0, c -65:	41.7656	47.9844	88.3281
Length   64, alignment  0, c -65:	54.4219	58.8906	152.062
Length  128, alignment  0, c -65:	55.8438	62.4844	280.031
Length  256, alignment  0, c -65:	77.9844	85.375	535.719
Length  512, alignment  0, c -65:	120.172	126.859	1046.97
Length 1024, alignment  0, c -65:	205.203	212.422	2071.61
Length 2048, alignment  0, c -65:	375.906	383.203	4119.33
Length 4096, alignment  0, c -65:	717.641	725.297	8216.7
Length 8192, alignment  0, c -65:	1400.31	1407.83	16407.9
Length 16384, alignment  0, c -65:	2767.69	2774.42	32793.6
Length 32768, alignment  0, c -65:	5499.03	5825.88	65565.6
Length 65536, alignment  0, c -65:	10960.4	10967.4	131228
Length 131072, alignment  0, c -65:	21882.6	21890.9	262326
Length    1, alignment  1, c -65:	23.6562	28.7969	20.75
Length    2, alignment  2, c -65:	27.5	35.4062	26.2812
Length    3, alignment  3, c -65:	33.1406	40.5156	41.0625
Length    3, alignment  0, c -65:	33.8438	40.5	40.0312
Length    4, alignment  4, c -65:	39.1875	45.1719	43
Length    5, alignment  5, c -65:	45.375	49.2344	35.8125
Length    5, alignment  0, c -65:	46.125	49.7031	35.125
Length    6, alignment  6, c -65:	42.8125	46.5312	38.2188
Length    6, alignment  0, c -65:	42.7344	46.0625	38.1719
Length    7, alignment  7, c -65:	45.4062	49.1875	40.1875
Length    7, alignment  0, c -65:	45.1562	49.8281	39.8281
Length    8, alignment  0, c -65:	48.0781	51.8281	41.8906
Length    9, alignment  1, c -65:	50.7656	54.5156	43.5
Length    9, alignment  0, c -65:	50.75	54.9062	44.1875
Length   10, alignment  2, c -65:	52.8125	57.2344	45.4531
Length   10, alignment  0, c -65:	52.8125	56.5938	46.2188
Length   11, alignment  3, c -65:	55.75	59.9375	48.2188
Length   11, alignment  0, c -65:	55.7969	61.1875	48.2188
Length   12, alignment  4, c -65:	58.4062	63.0312	49.875
Length   12, alignment  0, c -65:	58.4375	62.8281	50.1875
Length   13, alignment  5, c -65:	61.4062	64.875	51.2656
Length   13, alignment  0, c -65:	62.7969	65.6094	52.1406
Length   14, alignment  6, c -65:	64.0781	67.5781	53.8281
Length   14, alignment  0, c -65:	64.0938	67.7188	54.8906
Length   15, alignment  7, c -65:	67.4375	71.2031	55.5469
Length   15, alignment  0, c -65:	66.4375	70.5312	55.125
Length   16, alignment  0, c -65:	32.7969	39.8438	57.8281
Length   17, alignment  1, c -65:	38.9219	44.125	59.2344
Length   17, alignment  0, c -65:	32.7344	39.6875	60.2188
Length   18, alignment  2, c -65:	50.1875	55.8438	62.5469
Length   18, alignment  0, c -65:	39.4375	45.0312	62.2188
Length   19, alignment  3, c -65:	51.8594	55.8906	63.8906
Length   19, alignment  0, c -65:	51.9531	54.1562	64.125
Length   20, alignment  4, c -65:	32.8438	40.4688	66.1562
Length   20, alignment  0, c -65:	54.1875	57.0938	65.4844
Length   21, alignment  5, c -65:	38.4375	46.0625	67.4688
Length   21, alignment  0, c -65:	56.4062	59.4844	68.2344
Length   22, alignment  6, c -65:	55.1562	57.8438	69.4531
Length   22, alignment  0, c -65:	52.5781	57.2344	69.8125
Length   23, alignment  7, c -65:	52.8438	57.5469	71.7969
Length   23, alignment  0, c -65:	55.5469	59.5469	71.5
Length   24, alignment  0, c -65:	31.8281	38.5469	74.2188
Length   24, alignment  0, c -65:	31.5469	39.0312	74.125
Length   25, alignment  1, c -65:	38.4688	45.9062	75.4219
Length   25, alignment  0, c -65:	31.4062	39.0469	76.1875
Length   26, alignment  2, c -65:	49.0312	57.5	79.4375
Length   26, alignment  0, c -65:	39.0156	46.4688	78.2031
Length   27, alignment  3, c -65:	52.9531	57.5625	81.1875
Length   27, alignment  0, c -65:	52.8594	55.9062	79.7969
Length   28, alignment  4, c -65:	32.3906	39.8281	81.5312
Length   28, alignment  0, c -65:	55.4531	58.6094	81.4688
Length   29, alignment  5, c -65:	39.2812	46.3125	83.8125
Length   29, alignment  0, c -65:	58.7031	60.7031	83.4375
Length   30, alignment  6, c -65:	55.4688	60.1875	86.5469
Length   30, alignment  0, c -65:	51.5	56	86.2031
Length   31, alignment  7, c -65:	52.0156	56.875	88.9062
Length   31, alignment  0, c -65:	54.5938	59.5312	88.2344
Length   14, alignment  1, c -65:	64.0938	67.8125	54.4844
Length 1024, alignment  3, c -65:	222.422	225.797	2074.2
Length   64, alignment  4, c -65:	73.7188	75.8438	153.844
Length   25, alignment  2, c -65:	52.125	56.5938	76.5781
Length    1, alignment  0, c  0:	23.0156	29.125	21.2188
Length    2, alignment  0, c  0:	28.2656	35.0938	27.0312
Length    4, alignment  0, c  0:	39.2969	45.8594	42.2812
Length    8, alignment  0, c  0:	47.8125	51.7969	42.5312
Length   16, alignment  0, c  0:	32.2656	37.4219	58.5312
Length   32, alignment  0, c  0:	41.1719	43.6875	90.9062
Length   64, alignment  0, c  0:	52.5	55.7031	153.453
Length  128, alignment  0, c  0:	54.7031	61.4062	281.875
Length  256, alignment  0, c  0:	76.9375	83.0625	538.203
Length  512, alignment  0, c  0:	119.641	125.109	1049.81
Length 1024, alignment  0, c  0:	205.047	211.203	2074.23
Length 2048, alignment  0, c  0:	375.766	382.141	4122.06
Length 4096, alignment  0, c  0:	716.719	722.969	8219.2
Length 8192, alignment  0, c  0:	1399.73	1407.48	16410.4
Length 16384, alignment  0, c  0:	2811.95	2772.38	32796
Length 32768, alignment  0, c  0:	5497.08	5503.86	65686.5
Length 65536, alignment  0, c  0:	10959.4	10965.3	131214
Length 131072, alignment  0, c  0:	21881.3	21888.1	262460
Length    1, alignment  1, c  0:	23.6719	28.875	21.1406
Length    2, alignment  2, c  0:	28.5938	35.4219	27.0312
Length    3, alignment  3, c  0:	33.9844	40.4688	41.25
Length    3, alignment  0, c  0:	33.6094	40.3906	40.6562
Length    4, alignment  4, c  0:	39.9688	45.75	42.7656
Length    5, alignment  5, c  0:	45.4062	50.0938	36.1875
Length    5, alignment  0, c  0:	46.9219	49.5	36.6094
Length    6, alignment  6, c  0:	43.1719	45.8438	37.8281
Length    6, alignment  0, c  0:	42.4844	45.4688	37.8125
Length    7, alignment  7, c  0:	45.5156	49.4688	39.4531
Length    7, alignment  0, c  0:	45.875	48.8594	40.1875
Length    8, alignment  0, c  0:	47.7969	51.4375	42.2188
Length    9, alignment  1, c  0:	51.2031	54.0781	43.8125
Length    9, alignment  0, c  0:	50.8438	53.8125	44.2188
Length   10, alignment  2, c  0:	53.5625	57.7656	46.5156
Length   10, alignment  0, c  0:	53.5156	56.375	45.7969
Length   11, alignment  3, c  0:	56.5156	59.1562	47.8125
Length   11, alignment  0, c  0:	56.5	60.2188	48.2188
Length   12, alignment  4, c  0:	59.2344	62.7812	49.8594
Length   12, alignment  0, c  0:	59.25	62.625	49.8906
Length   13, alignment  5, c  0:	61.8594	64.6875	51.8438
Length   13, alignment  0, c  0:	60.8594	65.0938	52.1875
Length   14, alignment  6, c  0:	64.25	67.5	54.25
Length   14, alignment  0, c  0:	64.1406	67.4375	54.5312
Length   15, alignment  7, c  0:	66.875	70.5781	55.8594
Length   15, alignment  0, c  0:	66.4688	71.1562	55.8594
Length   16, alignment  0, c  0:	31.8125	37.3125	57.7812
Length   17, alignment  1, c  0:	36.3125	42.0938	60.2188
Length   17, alignment  0, c  0:	32.1562	37.4062	60.2031
Length   18, alignment  2, c  0:	46.4844	52.7812	62.6094
Length   18, alignment  0, c  0:	37.75	44.2812	62.1719
Length   19, alignment  3, c  0:	49.4844	53.1406	64.2344
Length   19, alignment  0, c  0:	48.9688	52.5	64.9688
Length   20, alignment  4, c  0:	31.0625	37.1719	66.1875
Length   20, alignment  0, c  0:	51.3438	54.8281	65.8125
Length   21, alignment  5, c  0:	36.7344	43.5	68.25
Length   21, alignment  0, c  0:	54.6875	57.1094	68.1875
Length   22, alignment  6, c  0:	51.9531	55.5938	70.5469
Length   22, alignment  0, c  0:	52.3438	55.8438	70.2344
Length   23, alignment  7, c  0:	50.9688	54.0469	71.7969
Length   23, alignment  0, c  0:	54.5625	57.7656	72.5469
Length   24, alignment  0, c  0:	31.7656	36.4844	73.4531
Length   24, alignment  0, c  0:	31.3438	37.0938	74.2188
Length   25, alignment  1, c  0:	36.2188	43.7344	76.2344
Length   25, alignment  0, c  0:	31.8125	37.0938	76.1875
Length   26, alignment  2, c  0:	48.1406	53.3594	79.3438
Length   26, alignment  0, c  0:	37.8125	43.8125	78.9844
Length   27, alignment  3, c  0:	51.375	54	80.2344
Length   27, alignment  0, c  0:	50.8594	52.7031	80.5938
Length   28, alignment  4, c  0:	30.4531	36.125	82.25
Length   28, alignment  0, c  0:	53.9375	54.4062	81.8281
Length   29, alignment  5, c  0:	37.375	43.9375	84.4688
Length   29, alignment  0, c  0:	56.1875	58.0938	83.4688
Length   30, alignment  6, c  0:	53.625	56.25	86.9375
Length   30, alignment  0, c  0:	50.5938	54.8438	86.2188
Length   31, alignment  7, c  0:	50.9531	53.7969	88.2188
Length   31, alignment  0, c  0:	54.8906	56.8594	88.5781
Length   14, alignment  1, c  0:	65.3125	67.8281	54.5625
Length 1024, alignment  3, c  0:	221.156	221.797	2074.12
Length   64, alignment  4, c  0:	69.8281	72.4531	154.172
Length   25, alignment  2, c  0:	48.9844	52.4375	76.5938
Length    1, alignment  0, c 65:	23.6719	29.4062	20.75
Length    2, alignment  0, c 65:	28.2656	34.8281	25.8594
Length    4, alignment  0, c 65:	39.0156	45.8125	43.0312
Length    8, alignment  0, c 65:	48.1406	51.8438	42.2188
Length   16, alignment  0, c 65:	32.5625	39.4531	58.2031
Length   32, alignment  0, c 65:	41.8438	47.2656	90.9062
Length   64, alignment  0, c 65:	54.1562	59.1406	276.141
Length  128, alignment  0, c 65:	56.2188	63.4062	281.875
Length  256, alignment  0, c 65:	78.0312	83.75	538.234
Length  512, alignment  0, c 65:	120.281	127.469	1049.48
Length 1024, alignment  0, c 65:	204.969	212.672	2074.55
Length 2048, alignment  0, c 65:	376.75	383.906	4121.95
Length 4096, alignment  0, c 65:	717.672	724.016	8217.72
Length 8192, alignment  0, c 65:	1400.31	1408.05	16411
Length 16384, alignment  0, c 65:	2767.69	2774.56	32796.5
Length 32768, alignment  0, c 65:	5498	5506.06	65675.1
Length 65536, alignment  0, c 65:	10959	10967	131208
Length 131072, alignment  0, c 65:	21882.3	21890.2	262386
Length    1, alignment  1, c 65:	22.7969	28.3438	21.3594
Length    2, alignment  2, c 65:	28.5625	34.6875	26.625
Length    3, alignment  3, c 65:	33.0156	40.0156	40.6562
Length    3, alignment  0, c 65:	33.9062	40.0625	41.375
Length    4, alignment  4, c 65:	38.875	46.5156	43.0312
Length    5, alignment  5, c 65:	45.7969	49.7969	36.9375
Length    5, alignment  0, c 65:	45.5312	49.4062	36.1875
Length    6, alignment  6, c 65:	43.1562	46.1406	38.5469
Length    6, alignment  0, c 65:	43.1562	45.4844	38.1719
Length    7, alignment  7, c 65:	46.3281	48.8281	39.8438
Length    7, alignment  0, c 65:	45.8281	50.1406	39.4844
Length    8, alignment  0, c 65:	48.1719	51.4062	41.4375
Length    9, alignment  1, c 65:	51.2812	53.8438	44.2188
Length    9, alignment  0, c 65:	50.5156	54.1406	44.1875
Length   10, alignment  2, c 65:	53.8906	56.1719	45.7969
Length   10, alignment  0, c 65:	54.2188	57.8281	46.2188
Length   11, alignment  3, c 65:	56.1875	59.5	48.2656
Length   11, alignment  0, c 65:	56.8906	59.7969	47.8125
Length   12, alignment  4, c 65:	59.9688	61.5	49.8906
Length   12, alignment  0, c 65:	58.7969	62.5312	50.1875
Length   13, alignment  5, c 65:	61.5	65.2656	52.4844
Length   13, alignment  0, c 65:	61.5625	65.7812	52.5938
Length   14, alignment  6, c 65:	64.8438	67.4688	54.1875
Length   14, alignment  0, c 65:	65.0156	66.8438	53.875
Length   15, alignment  7, c 65:	67.5625	70.1562	55.0938
Length   15, alignment  0, c 65:	67.2031	70.0938	56.2188
Length   16, alignment  0, c 65:	32.4375	40.0938	58.5625
Length   17, alignment  1, c 65:	38.5625	45.4531	59.875
Length   17, alignment  0, c 65:	32.7969	39.4375	60.5781
Length   18, alignment  2, c 65:	49.1875	55.7188	62.5469
Length   18, alignment  0, c 65:	39.5469	45.1562	62.5469
Length   19, alignment  3, c 65:	51.4375	55.4688	63.4531
Length   19, alignment  0, c 65:	52.7188	54.6875	64.8906
Length   20, alignment  4, c 65:	33.0938	39.6875	65.4375
Length   20, alignment  0, c 65:	55.2656	57.4219	66.2188
Length   21, alignment  5, c 65:	40.1406	47.0938	68.5781
Length   21, alignment  0, c 65:	57.4688	60.75	68.625
Length   22, alignment  6, c 65:	54.875	58.625	70.6406
Length   22, alignment  0, c 65:	52.2656	56.7656	70.1875
Length   23, alignment  7, c 65:	53.2812	57.8125	72.5938
Length   23, alignment  0, c 65:	54.5938	59.7812	72.2031
Length   24, alignment  0, c 65:	32.1094	38.7656	74.1875
Length   24, alignment  0, c 65:	31.6562	38.0781	74.1875
Length   25, alignment  1, c 65:	39.1719	46.1094	76.2031
Length   25, alignment  0, c 65:	32.5	39.5625	75.8281
Length   26, alignment  2, c 65:	50.4219	56.7812	77.7969
Length   26, alignment  0, c 65:	39.125	46.875	77.8594
Length   27, alignment  3, c 65:	53.7031	57.8438	79.9219
Length   27, alignment  0, c 65:	52.5312	54.7812	79.4219
Length   28, alignment  4, c 65:	32.4219	39.3906	82.2031
Length   28, alignment  0, c 65:	55.5938	58.4219	81.8281
Length   29, alignment  5, c 65:	40.7031	46.0938	84.1562
Length   29, alignment  0, c 65:	57.8438	61.0469	83.8438
Length   30, alignment  6, c 65:	55.8125	58.4531	85.4219
Length   30, alignment  0, c 65:	50.9062	56.0781	86.2031
Length   31, alignment  7, c 65:	51.9688	57.4531	88.1875
Length   31, alignment  0, c 65:	54.9375	59.125	88.1875
Length   14, alignment  1, c 65:	64.5781	67.4062	53.75
Length 1024, alignment  3, c 65:	222.125	224.734	2074.52
Length   64, alignment  4, c 65:	72.0469	75.7812	154.188
Length   25, alignment  2, c 65:	52.8125	56.0781	76.5
Length    1, alignment  0, c 130:	23.5938	29.0781	21.0781
Length    2, alignment  0, c 130:	28.1719	34.7969	26.9844
Length    4, alignment  0, c 130:	38.1875	45.1719	43.0156
Length    8, alignment  0, c 130:	47.7031	51.8438	42.1875
Length   16, alignment  0, c 130:	32.4531	39.7812	58.1875
Length   32, alignment  0, c 130:	42.7344	47.6719	90.2031
Length   64, alignment  0, c 130:	54.0625	59.875	154.578
Length  128, alignment  0, c 130:	55.8594	63.8281	283.203
Length  256, alignment  0, c 130:	76.8438	85.1406	538.203
Length  512, alignment  0, c 130:	119.516	126.812	1049.81
Length 1024, alignment  0, c 130:	204.812	212.141	2073.45
Length 2048, alignment  0, c 130:	375.984	383.438	4121.8
Length 4096, alignment  0, c 130:	718.141	724.703	8344.39
Length 8192, alignment  0, c 130:	1401.27	1407.66	16410.7
Length 16384, alignment  0, c 130:	2767.33	2774.7	32797
Length 32768, alignment  0, c 130:	5498.38	5504.84	65564.2
Length 65536, alignment  0, c 130:	11068.5	10967	131208
Length 131072, alignment  0, c 130:	21881.5	21890.6	262379
Length    1, alignment  1, c 130:	23.8906	28.75	20.8438
Length    2, alignment  2, c 130:	27.875	34.6562	26.2812
Length    3, alignment  3, c 130:	33.5625	40.5312	40.6719
Length    3, alignment  0, c 130:	34.2031	40.1719	41.4219
Length    4, alignment  4, c 130:	39.5625	44.7969	42.25
Length    5, alignment  5, c 130:	44.75	49.875	36.5781
Length    5, alignment  0, c 130:	45.0781	49.5312	36.2188
Length    6, alignment  6, c 130:	42.1562	45.3594	37.4375
Length    6, alignment  0, c 130:	42.4062	46.1719	38.1875
Length    7, alignment  7, c 130:	45.1875	48.7344	39.7969
Length    7, alignment  0, c 130:	44.8125	48.8438	40.2188
Length    8, alignment  0, c 130:	47.75	52.2344	42.1875
Length    9, alignment  1, c 130:	50.7656	53.8906	43.4531
Length    9, alignment  0, c 130:	50.125	54.3281	43.7812
Length   10, alignment  2, c 130:	52.7969	56.9375	45.8438
Length   10, alignment  0, c 130:	53.0781	57.0938	45.8281
Length   11, alignment  3, c 130:	56.0781	60.2188	47.875
Length   11, alignment  0, c 130:	56.4062	59.4844	48.1875
Length   12, alignment  4, c 130:	59.0781	63.2344	49.8438
Length   12, alignment  0, c 130:	58.3906	62.9219	50.9219
Length   13, alignment  5, c 130:	60.5625	64.5	51.5312
Length   13, alignment  0, c 130:	61.1094	63.8438	51.1406
Length   14, alignment  6, c 130:	63.75	67.9531	53.5312
Length   14, alignment  0, c 130:	64.0469	67.0312	53.7969
Length   15, alignment  7, c 130:	66.4219	69.8594	55.8281
Length   15, alignment  0, c 130:	66.4219	70.2031	55.7812
Length   16, alignment  0, c 130:	32.3281	39.8906	58.5625
Length   17, alignment  1, c 130:	37.4219	44.5312	60.5781
Length   17, alignment  0, c 130:	33.125	39.8594	60.2031
Length   18, alignment  2, c 130:	49.1094	56.4688	62.5469
Length   18, alignment  0, c 130:	38.4531	44.8125	61.8281
Length   19, alignment  3, c 130:	52.125	56.2188	63.5156
Length   19, alignment  0, c 130:	51.6406	55.7344	63.9062
Length   20, alignment  4, c 130:	33.0156	40.4844	65.8906
Length   20, alignment  0, c 130:	54.2031	57.6406	65.5312
Length   21, alignment  5, c 130:	38.7969	46.1562	68.25
Length   21, alignment  0, c 130:	56.8125	60.125	67.8438
Length   22, alignment  6, c 130:	54.8438	58.4844	69.9062
Length   22, alignment  0, c 130:	51.9375	57.5625	70.2188
Length   23, alignment  7, c 130:	53.5781	57.875	72.9531
Length   23, alignment  0, c 130:	54.5	59.9219	71.875
Length   24, alignment  0, c 130:	31.3906	38.625	74.2188
Length   24, alignment  0, c 130:	31.1719	40.1406	74.2031
Length   25, alignment  1, c 130:	39.4531	46.8438	76.2031
Length   25, alignment  0, c 130:	32.5156	38.5625	75.8594
Length   26, alignment  2, c 130:	50.0156	57.5156	77.4688
Length   26, alignment  0, c 130:	39.3594	46.5156	77.4688
Length   27, alignment  3, c 130:	52.6719	57.1875	79.0938
Length   27, alignment  0, c 130:	52.1562	56.3594	80.1719
Length   28, alignment  4, c 130:	33.7188	39.4688	83.2188
Length   28, alignment  0, c 130:	55.4844	58.5	82.2656
Length   29, alignment  5, c 130:	39.9688	47.5156	84.125
Length   29, alignment  0, c 130:	58.6094	61.5625	84.1875
Length   30, alignment  6, c 130:	56.8125	59.125	86.6562
Length   30, alignment  0, c 130:	51.8594	55.8125	86.2031
Length   31, alignment  7, c 130:	51.8125	57.4375	88.5312
Length   31, alignment  0, c 130:	54.1875	58.7969	87.8281
Length   14, alignment  1, c 130:	63.7812	67.5469	53.8125
Length 1024, alignment  3, c 130:	222.734	225.875	2073.8
Length   64, alignment  4, c 130:	72.375	75.875	154.5
Length   25, alignment  2, c 130:	52.0938	56	76.625

[-- Attachment #6: bench-memset.out.o32.hard.new --]
[-- Type: text/plain, Size: 18000 bytes --]

                        	memset	builtin_memset	simple_memset
Length    1, alignment  0, c -65:	28.3594	27.0625	21.2188
Length    2, alignment  0, c -65:	29.9688	31.6875	27.9062
Length    4, alignment  0, c -65:	41	42.6875	43.8281
Length    8, alignment  0, c -65:	43.5781	45.7031	42.9062
Length   16, alignment  0, c -65:	54.7656	55.6094	58.1094
Length   32, alignment  0, c -65:	40.5312	40.9844	90.8438
Length   64, alignment  0, c -65:	50.875	51.6875	154.125
Length  128, alignment  0, c -65:	67.7031	69.1875	282.266
Length  256, alignment  0, c -65:	92.0469	93.5781	538.266
Length  512, alignment  0, c -65:	163.641	164.672	1050.77
Length 1024, alignment  0, c -65:	269.547	270.953	2074.41
Length 2048, alignment  0, c -65:	482.906	485.625	4123.52
Length 4096, alignment  0, c -65:	909.688	912.266	8218.48
Length 8192, alignment  0, c -65:	1764	1766.28	16565.6
Length 16384, alignment  0, c -65:	3472.3	3474.88	32796.9
Length 32768, alignment  0, c -65:	6885.55	6887.11	65564.9
Length 65536, alignment  0, c -65:	13714	13714.3	131100
Length 131072, alignment  0, c -65:	27368	27367.8	262418
Length    1, alignment  1, c -65:	25.9375	26.0156	21.6094
Length    2, alignment  2, c -65:	30	31.6406	27.25
Length    3, alignment  3, c -65:	35.625	37.375	32.9844
Length    3, alignment  0, c -65:	35.6406	37.3281	40.1094
Length    4, alignment  4, c -65:	41.1875	42.6562	43.4531
Length    5, alignment  5, c -65:	48.625	49.9531	43.8906
Length    5, alignment  0, c -65:	48.625	50.625	44.7188
Length    6, alignment  6, c -65:	50.8594	51.9688	47.5156
Length    6, alignment  0, c -65:	50.9062	51.9531	45.9531
Length    7, alignment  7, c -65:	44.2188	46.25	40.7031
Length    7, alignment  0, c -65:	45.0938	46.2969	41.5469
Length    8, alignment  0, c -65:	44.3281	45.3281	42.7969
Length    9, alignment  1, c -65:	42.1094	42.9844	44.5156
Length    9, alignment  0, c -65:	43.2344	45	44.4062
Length   10, alignment  2, c -65:	37.9531	38.7031	46.4375
Length   10, alignment  0, c -65:	49.2812	49.2969	46.8125
Length   11, alignment  3, c -65:	42.25	43.7188	49.4688
Length   11, alignment  0, c -65:	61.7188	62.5781	48.7969
Length   12, alignment  4, c -65:	43.5156	46.0312	51.2344
Length   12, alignment  0, c -65:	49.4062	51.0312	50.8438
Length   13, alignment  5, c -65:	48.5938	49.0469	52.7812
Length   13, alignment  0, c -65:	48.4531	50.75	53.0938
Length   14, alignment  6, c -65:	48.875	50.0312	54.8125
Length   14, alignment  0, c -65:	59.6875	60.125	55.8594
Length   15, alignment  7, c -65:	57.5312	59.4688	56.75
Length   15, alignment  0, c -65:	67.4844	67.8281	57
Length   16, alignment  0, c -65:	55.125	57.0938	59.5156
Length   17, alignment  1, c -65:	49.1562	48.7031	60.5
Length   17, alignment  0, c -65:	54.0156	55.3281	60.8125
Length   18, alignment  2, c -65:	47.8906	49.2812	62.4531
Length   18, alignment  0, c -65:	64.125	65.1875	61.9219
Length   19, alignment  3, c -65:	57.2031	58.4219	64.4688
Length   19, alignment  0, c -65:	71.6406	73.125	63.9688
Length   20, alignment  4, c -65:	53.9375	55.3125	67.1094
Length   20, alignment  0, c -65:	58.0312	59.9844	66.1562
Length   21, alignment  5, c -65:	60.0625	60.8281	68.5312
Length   21, alignment  0, c -65:	58.4219	60.0312	68.8438
Length   22, alignment  6, c -65:	58.0469	59.6094	70.875
Length   22, alignment  0, c -65:	63.8125	64.6719	71.0781
Length   23, alignment  7, c -65:	62.3594	63.9688	71.7344
Length   23, alignment  0, c -65:	68.3438	69.1094	72.2812
Length   24, alignment  0, c -65:	55.625	58.0781	75.2031
Length   24, alignment  0, c -65:	55.9531	57.3125	74.4375
Length   25, alignment  1, c -65:	54.1875	54.9219	76.0938
Length   25, alignment  0, c -65:	54.7188	57	76.0156
Length   26, alignment  2, c -65:	51.8125	54.1562	78.75
Length   26, alignment  0, c -65:	61.1094	63.0625	78.8125
Length   27, alignment  3, c -65:	56.4375	58	80.7188
Length   27, alignment  0, c -65:	66.5312	67.2188	80.625
Length   28, alignment  4, c -65:	55.3906	57.0469	83.125
Length   28, alignment  0, c -65:	59.2812	59.3125	82.4219
Length   29, alignment  5, c -65:	60.9219	62.0781	84.4688
Length   29, alignment  0, c -65:	58.0781	61.5312	85.2812
Length   30, alignment  6, c -65:	58.7812	59.0312	86.1094
Length   30, alignment  0, c -65:	63.8125	65.75	87.4688
Length   31, alignment  7, c -65:	62.4375	63.9531	89.0938
Length   31, alignment  0, c -65:	69.1562	71.3125	88.3281
Length   14, alignment  1, c -65:	47.4062	49.3906	55.5625
Length 1024, alignment  3, c -65:	288.344	290.062	2073.66
Length   64, alignment  4, c -65:	60.0938	61.7344	154.406
Length   25, alignment  2, c -65:	60.4062	60.6875	76.1406
Length    1, alignment  0, c  0:	25.0469	26.3125	21.6719
Length    2, alignment  0, c  0:	30.6562	32.0469	27.2969
Length    4, alignment  0, c  0:	42.4219	43	38.2031
Length    8, alignment  0, c  0:	44.7031	44.5781	42.1406
Length   16, alignment  0, c  0:	55.8438	55.9531	58.4688
Length   32, alignment  0, c  0:	39.9219	41.6875	91.2031
Length   64, alignment  0, c  0:	50.7969	52.6562	155.156
Length  128, alignment  0, c  0:	68.3906	69.4531	283.359
Length  256, alignment  0, c  0:	93.4375	93.1875	539
Length  512, alignment  0, c  0:	163.234	165.016	1050.5
Length 1024, alignment  0, c  0:	269.984	271.672	2074.86
Length 2048, alignment  0, c  0:	483.234	484.234	4122.84
Length 4096, alignment  0, c  0:	910.672	911.469	8219.08
Length 8192, alignment  0, c  0:	1764.42	1765.34	16523.3
Length 16384, alignment  0, c  0:	3472.25	3473.5	32796.6
Length 32768, alignment  0, c  0:	6885.19	6886.62	65564
Length 65536, alignment  0, c  0:	13822.4	13713.8	131232
Length 131072, alignment  0, c  0:	27365.3	27366.4	262462
Length    1, alignment  1, c  0:	25.9219	27.0156	22.2812
Length    2, alignment  2, c  0:	30.6719	33.0625	27.5625
Length    3, alignment  3, c  0:	36.7188	36.9844	41.4531
Length    3, alignment  0, c  0:	36.9375	37.3281	40.1406
Length    4, alignment  4, c  0:	41.5625	42.3125	44.1406
Length    5, alignment  5, c  0:	48.8906	50.5938	45.1562
Length    5, alignment  0, c  0:	48.625	49.9375	45.5469
Length    6, alignment  6, c  0:	50.6406	52.1875	47.7812
Length    6, alignment  0, c  0:	51	52.5938	47.5156
Length    7, alignment  7, c  0:	44.2812	45.1875	40.5469
Length    7, alignment  0, c  0:	44.5938	45.8906	40.8125
Length    8, alignment  0, c  0:	44.9688	45.3125	43.1562
Length    9, alignment  1, c  0:	43.4062	43.9531	44.7812
Length    9, alignment  0, c  0:	43.5938	45.2656	45.5312
Length   10, alignment  2, c  0:	38.6406	39.2969	46.4688
Length   10, alignment  0, c  0:	49.125	51.4219	47.1562
Length   11, alignment  3, c  0:	42.9688	42.9062	48.75
Length   11, alignment  0, c  0:	61.3438	62.9219	49.0938
Length   12, alignment  4, c  0:	43.2031	43.9844	50.7969
Length   12, alignment  0, c  0:	49.0625	51.3125	51.1875
Length   13, alignment  5, c  0:	48.5938	49.6562	52.8281
Length   13, alignment  0, c  0:	49.5312	49.25	52.4531
Length   14, alignment  6, c  0:	48.8438	49.6562	54.4688
Length   14, alignment  0, c  0:	58.4219	60.0938	54.8281
Length   15, alignment  7, c  0:	58.2031	59.1094	56.1562
Length   15, alignment  0, c  0:	66.625	67.4375	56.2812
Length   16, alignment  0, c  0:	54.6562	55.9062	58.0781
Length   17, alignment  1, c  0:	47.9375	48.9844	60.5
Length   17, alignment  0, c  0:	53.8594	55.375	60.4844
Length   18, alignment  2, c  0:	48.6562	49.6719	62.7969
Length   18, alignment  0, c  0:	64.4844	65.6406	62.6562
Length   19, alignment  3, c  0:	56.8594	58.3906	65.1562
Length   19, alignment  0, c  0:	72.5469	73.3125	64.6719
Length   20, alignment  4, c  0:	54.125	56.4375	66.5
Length   20, alignment  0, c  0:	57.7344	60	66.7812
Length   21, alignment  5, c  0:	60.5469	61.7344	69.2656
Length   21, alignment  0, c  0:	57.7031	58.6562	68.125
Length   22, alignment  6, c  0:	57.75	58.9688	70.5469
Length   22, alignment  0, c  0:	63.7188	65.3125	69.5469
Length   23, alignment  7, c  0:	62.9844	63.625	72.0781
Length   23, alignment  0, c  0:	68.2188	70.1875	71.9062
Length   24, alignment  0, c  0:	56.0156	56.9844	75.2188
Length   24, alignment  0, c  0:	55.2812	57.0312	73.4688
Length   25, alignment  1, c  0:	53.5312	55.0156	76.1719
Length   25, alignment  0, c  0:	55.1875	55.9688	76.5156
Length   26, alignment  2, c  0:	51.7812	53.6562	77.7812
Length   26, alignment  0, c  0:	60.8906	62.875	78.8125
Length   27, alignment  3, c  0:	56.6406	58.7031	80.125
Length   27, alignment  0, c  0:	66.2812	67.125	80.9844
Length   28, alignment  4, c  0:	55.3594	57.3125	82.4531
Length   28, alignment  0, c  0:	59.2969	59.6719	82.2031
Length   29, alignment  5, c  0:	60.0781	61.3281	84.4688
Length   29, alignment  0, c  0:	58.0469	58.9688	84.4844
Length   30, alignment  6, c  0:	58.2188	58.6562	86.5625
Length   30, alignment  0, c  0:	64.1406	64.4844	87
Length   31, alignment  7, c  0:	63.3125	64.6094	88.6094
Length   31, alignment  0, c  0:	69.0938	70.5	88.9688
Length   14, alignment  1, c  0:	47.5	49.4219	54.4375
Length 1024, alignment  3, c  0:	288.062	288.672	2074.73
Length   64, alignment  4, c  0:	60.4375	62.0781	155.219
Length   25, alignment  2, c  0:	60.3906	59.6406	77.2031
Length    1, alignment  0, c 65:	25.3906	28.0156	21.9062
Length    2, alignment  0, c 65:	30.7031	31.2969	27.25
Length    4, alignment  0, c 65:	41.375	43.75	38.9531
Length    8, alignment  0, c 65:	44.0156	45.6406	41.8438
Length   16, alignment  0, c 65:	54.7656	55.625	58.125
Length   32, alignment  0, c 65:	39.2656	41.9844	91.1719
Length   64, alignment  0, c 65:	51.1875	52.7344	154.875
Length  128, alignment  0, c 65:	68.4062	69.1875	282.734
Length  256, alignment  0, c 65:	93.5781	93.7969	538.312
Length  512, alignment  0, c 65:	164.047	164.625	1051.47
Length 1024, alignment  0, c 65:	269.609	271	2073.67
Length 2048, alignment  0, c 65:	482.875	485.297	4122.47
Length 4096, alignment  0, c 65:	910.266	911.484	8218.53
Length 8192, alignment  0, c 65:	1765.3	1765.83	16410.8
Length 16384, alignment  0, c 65:	3471.88	3473.55	32795.3
Length 32768, alignment  0, c 65:	6884.44	6886.53	65563.4
Length 65536, alignment  0, c 65:	13711.5	13713.8	131206
Length 131072, alignment  0, c 65:	27364.9	27366.4	262485
Length    1, alignment  1, c 65:	26.2969	27.7812	21.5625
Length    2, alignment  2, c 65:	30.6875	31.9844	27.875
Length    3, alignment  3, c 65:	36.4531	37.25	32.9375
Length    3, alignment  0, c 65:	36.4062	37.3281	40.1562
Length    4, alignment  4, c 65:	41.5781	42.5938	43.4375
Length    5, alignment  5, c 65:	48.9375	50.9062	46.2188
Length    5, alignment  0, c 65:	49.3125	50.9219	45.1719
Length    6, alignment  6, c 65:	51.3125	52.625	46.2969
Length    6, alignment  0, c 65:	51.3125	52.5625	47.4844
Length    7, alignment  7, c 65:	44.9219	45.875	41
Length    7, alignment  0, c 65:	44.1875	45.8906	40.4844
Length    8, alignment  0, c 65:	44.2969	44.9531	42.4375
Length    9, alignment  1, c 65:	42.1562	43.2812	44.4688
Length    9, alignment  0, c 65:	43.25	45.375	44.4531
Length   10, alignment  2, c 65:	37.3281	38.6406	46.8438
Length   10, alignment  0, c 65:	49.2188	50.3125	46.7969
Length   11, alignment  3, c 65:	42.8125	43.6875	48.4531
Length   11, alignment  0, c 65:	61.0156	62.2031	48.4375
Length   12, alignment  4, c 65:	43.6562	44.3438	50.4688
Length   12, alignment  0, c 65:	50.0938	51.2969	51.5
Length   13, alignment  5, c 65:	48.2969	49.9688	52.7812
Length   13, alignment  0, c 65:	48.5312	50	53.3906
Length   14, alignment  6, c 65:	48.8906	49.5312	54.5
Length   14, alignment  0, c 65:	58.4844	60.3906	54.75
Length   15, alignment  7, c 65:	58.5938	60.0625	56.7969
Length   15, alignment  0, c 65:	66.3438	67.7188	55.9375
Length   16, alignment  0, c 65:	55.2031	55.6562	58.4688
Length   17, alignment  1, c 65:	48.4375	48.9844	60.8438
Length   17, alignment  0, c 65:	54.9062	55.6875	60.5156
Length   18, alignment  2, c 65:	48.9375	49.6406	63.125
Length   18, alignment  0, c 65:	64.2031	65.2969	63.0156
Length   19, alignment  3, c 65:	57.5469	58.4062	64.125
Length   19, alignment  0, c 65:	72.2812	73.0625	65.0312
Length   20, alignment  4, c 65:	54.7969	56.3438	65.7969
Length   20, alignment  0, c 65:	59.3281	59.9688	66.4844
Length   21, alignment  5, c 65:	60.4844	61.2812	68.7812
Length   21, alignment  0, c 65:	58.4688	58.8906	68.8438
Length   22, alignment  6, c 65:	59.1875	59.9531	70.7969
Length   22, alignment  0, c 65:	64.6562	64.4688	70.9844
Length   23, alignment  7, c 65:	62.7812	64.3281	72.9531
Length   23, alignment  0, c 65:	69.9688	70.2344	72
Length   24, alignment  0, c 65:	56.0156	57.9531	74.7969
Length   24, alignment  0, c 65:	56.6719	57.7188	73.7344
Length   25, alignment  1, c 65:	54.5156	55.2656	77.1562
Length   25, alignment  0, c 65:	55.4062	57.2812	76.8594
Length   26, alignment  2, c 65:	52.5	53.9375	78.7812
Length   26, alignment  0, c 65:	60.7656	62.6562	78.5156
Length   27, alignment  3, c 65:	57.5312	57.625	80.5
Length   27, alignment  0, c 65:	66.9844	67.2344	80.7031
Length   28, alignment  4, c 65:	55.4062	56.2969	82.7812
Length   28, alignment  0, c 65:	58.9531	60.5938	82.1406
Length   29, alignment  5, c 65:	61.1562	61.2812	84.75
Length   29, alignment  0, c 65:	58.125	59.2656	84.75
Length   30, alignment  6, c 65:	58.1562	59.6406	86.8281
Length   30, alignment  0, c 65:	63.8125	64.9375	86.6875
Length   31, alignment  7, c 65:	62.0938	63.2656	88.7812
Length   31, alignment  0, c 65:	69.0625	69.8281	88
Length   14, alignment  1, c 65:	47.2188	49.3438	54.4062
Length 1024, alignment  3, c 65:	287.656	288.641	2074.08
Length   64, alignment  4, c 65:	60.8125	61.9688	155.125
Length   25, alignment  2, c 65:	60.75	61	73.8906
Length    1, alignment  0, c 130:	24.9531	25.5781	20.5625
Length    2, alignment  0, c 130:	30.6562	31.7188	27.8906
Length    4, alignment  0, c 130:	41.6719	42.6406	43.8281
Length    8, alignment  0, c 130:	43.9375	45.3281	42.5156
Length   16, alignment  0, c 130:	54.75	55.6406	58.8125
Length   32, alignment  0, c 130:	39.5	41.0312	90.4844
Length   64, alignment  0, c 130:	50.75	53.1562	154.844
Length  128, alignment  0, c 130:	67.6406	69.5625	282.312
Length  256, alignment  0, c 130:	92.3906	93.9062	538.344
Length  512, alignment  0, c 130:	163.266	164.719	1050.8
Length 1024, alignment  0, c 130:	270.031	271.75	2075.22
Length 2048, alignment  0, c 130:	483.641	486.5	4123.17
Length 4096, alignment  0, c 130:	910.406	911.953	8380.06
Length 8192, alignment  0, c 130:	1763.88	1765.86	16411
Length 16384, alignment  0, c 130:	3471.95	3473.42	32895.2
Length 32768, alignment  0, c 130:	6884.62	6887.44	65672.8
Length 65536, alignment  0, c 130:	13712.1	13712.8	131276
Length 131072, alignment  0, c 130:	27364.8	27366.2	262339
Length    1, alignment  1, c 130:	25.3594	26.1562	21.5
Length    2, alignment  2, c 130:	30.7031	31.9844	27.4844
Length    3, alignment  3, c 130:	35.9531	37.5938	41.3594
Length    3, alignment  0, c 130:	36.2031	37.3125	42.0625
Length    4, alignment  4, c 130:	41.2188	42.2969	43.375
Length    5, alignment  5, c 130:	48.5938	50.25	43.9062
Length    5, alignment  0, c 130:	48.625	50.5938	45.0625
Length    6, alignment  6, c 130:	50.625	53.3125	47.0938
Length    6, alignment  0, c 130:	51.3281	52.625	45.9375
Length    7, alignment  7, c 130:	44.9688	45.1719	40.6875
Length    7, alignment  0, c 130:	44.9375	44.7969	40.3594
Length    8, alignment  0, c 130:	44.2969	45.6406	42.3906
Length    9, alignment  1, c 130:	42.7031	44.3281	44.75
Length    9, alignment  0, c 130:	43.6562	44.2812	44.4219
Length   10, alignment  2, c 130:	37.625	39.7812	45.9844
Length   10, alignment  0, c 130:	48.4375	49.9531	45.6406
Length   11, alignment  3, c 130:	41.625	43.8438	47.6562
Length   11, alignment  0, c 130:	61.0312	63.5938	47.75
Length   12, alignment  4, c 130:	42.8281	44.5938	50.7188
Length   12, alignment  0, c 130:	50.1562	50.625	51.1875
Length   13, alignment  5, c 130:	48.375	50.7344	52.4219
Length   13, alignment  0, c 130:	49.2188	49.9688	52.7656
Length   14, alignment  6, c 130:	48.9062	49.625	54.4219
Length   14, alignment  0, c 130:	59.0781	59.7188	54.4062
Length   15, alignment  7, c 130:	58.2031	60.7656	56.3594
Length   15, alignment  0, c 130:	66.6562	67.7656	56.6094
Length   16, alignment  0, c 130:	54.4531	56.3438	58.7969
Length   17, alignment  1, c 130:	48.4062	49.5781	60.4062
Length   17, alignment  0, c 130:	53.6094	55.2031	60.7656
Length   18, alignment  2, c 130:	47.9844	50.6562	63.0938
Length   18, alignment  0, c 130:	65.2656	65.5938	63.2812
Length   19, alignment  3, c 130:	57.9531	58.3906	64.3438
Length   19, alignment  0, c 130:	72.5781	73.3906	63.8438
Length   20, alignment  4, c 130:	54.1719	55.2188	67.0625
Length   20, alignment  0, c 130:	58.6562	60.1875	66.3594
Length   21, alignment  5, c 130:	60.4688	61.9531	68.3594
Length   21, alignment  0, c 130:	58.4375	59.6406	68.4062
Length   22, alignment  6, c 130:	58.1719	59.9375	71.4219
Length   22, alignment  0, c 130:	63.7969	65.3281	70.9531
Length   23, alignment  7, c 130:	62.8281	64.2969	72.25
Length   23, alignment  0, c 130:	69.1562	70.1875	72.6406
Length   24, alignment  0, c 130:	56.7188	56.9844	75.0781
Length   24, alignment  0, c 130:	55.9531	57.6406	74.4219
Length   25, alignment  1, c 130:	54.6875	54.8906	76.3125
Length   25, alignment  0, c 130:	55.75	57.6562	76.7656
Length   26, alignment  2, c 130:	51.7969	52.9688	78.6875
Length   26, alignment  0, c 130:	60.4688	62.3125	79.0938
Length   27, alignment  3, c 130:	58.0625	58.25	81.4688
Length   27, alignment  0, c 130:	67.0469	67.1875	80.1562
Length   28, alignment  4, c 130:	55.7344	57.6875	82.3438
Length   28, alignment  0, c 130:	58.75	60.1719	82.3594
Length   29, alignment  5, c 130:	60.2656	60.9688	84.0781
Length   29, alignment  0, c 130:	57.7188	60.375	85.1562
Length   30, alignment  6, c 130:	58.4531	59.625	86.7812
Length   30, alignment  0, c 130:	63.625	65	83.6562
Length   31, alignment  7, c 130:	63.1406	64.2969	87.8906
Length   31, alignment  0, c 130:	68.375	69.1875	88.6094
Length   14, alignment  1, c 130:	47.1094	48.2656	54.0312
Length 1024, alignment  3, c 130:	288.719	289.641	2075.09
Length   64, alignment  4, c 130:	59.7656	61.6562	154.047
Length   25, alignment  2, c 130:	58.9688	61.7031	77.0625

[-- Attachment #7: bench-memset.out.o32.hard.orig --]
[-- Type: text/plain, Size: 17965 bytes --]

                        	memset	builtin_memset	simple_memset
Length    1, alignment  0, c -65:	31.0312	28.3906	22.4219
Length    2, alignment  0, c -65:	32	33.4531	27.9688
Length    4, alignment  0, c -65:	41.6875	42.9688	43.7812
Length    8, alignment  0, c -65:	34.8906	36.7344	42.4531
Length   16, alignment  0, c -65:	44.2344	47.4062	59.1875
Length   32, alignment  0, c -65:	56.1875	58.2656	90.4375
Length   64, alignment  0, c -65:	58.375	59.7188	154.844
Length  128, alignment  0, c -65:	79.1562	80.0938	282.656
Length  256, alignment  0, c -65:	123.266	123.391	538.844
Length  512, alignment  0, c -65:	207.719	208.719	1051.12
Length 1024, alignment  0, c -65:	378.344	380.875	2074.8
Length 2048, alignment  0, c -65:	719.719	721.703	4123.38
Length 4096, alignment  0, c -65:	1402.92	1405.09	8219.28
Length 8192, alignment  0, c -65:	2768.95	2771.09	16411.2
Length 16384, alignment  0, c -65:	5500.62	5501.72	32796.2
Length 32768, alignment  0, c -65:	10961.7	11222.3	65563.5
Length 65536, alignment  0, c -65:	21884.1	21885.5	131222
Length 131072, alignment  0, c -65:	43840	43732.3	262415
Length    1, alignment  1, c -65:	26.7344	27.7656	22.0312
Length    2, alignment  2, c -65:	31.375	33.0938	27.625
Length    3, alignment  3, c -65:	37.0938	37.6406	42.9375
Length    3, alignment  0, c -65:	37.4219	38.625	32.9219
Length    4, alignment  4, c -65:	42.4531	43.375	38.6875
Length    5, alignment  5, c -65:	48.2344	50.3125	45.4531
Length    5, alignment  0, c -65:	48.9062	49.9531	44.5625
Length    6, alignment  6, c -65:	47.0625	47.5625	47.4219
Length    6, alignment  0, c -65:	45.8594	47.2969	47.8438
Length    7, alignment  7, c -65:	48.2344	50.2344	40.8906
Length    7, alignment  0, c -65:	49.4375	50.3438	41.2656
Length    8, alignment  0, c -65:	36.7969	36.9531	42.8281
Length    9, alignment  1, c -65:	40.7969	42.8125	45.7188
Length    9, alignment  0, c -65:	35.0156	36.125	45.2031
Length   10, alignment  2, c -65:	34.9062	36.3594	46.8281
Length   10, alignment  0, c -65:	41.0312	41.6875	46.8438
Length   11, alignment  3, c -65:	41.0938	43.0625	48.8594
Length   11, alignment  0, c -65:	45.8125	48.0469	49.5312
Length   12, alignment  4, c -65:	34.9531	35	48.5469
Length   12, alignment  0, c -65:	34.1094	35.6875	50.4219
Length   13, alignment  5, c -65:	41.6406	42.6875	52.4688
Length   13, alignment  0, c -65:	35.0312	36.4844	53.2188
Length   14, alignment  6, c -65:	34.5781	36.3125	54.8281
Length   14, alignment  0, c -65:	42.4375	43.4062	54.4375
Length   15, alignment  7, c -65:	42.5	44.1875	57.1719
Length   15, alignment  0, c -65:	47.7188	48.7656	57.1875
Length   16, alignment  0, c -65:	45.0938	46.25	59.2344
Length   17, alignment  1, c -65:	42.7188	44.4531	60.125
Length   17, alignment  0, c -65:	45.5	47.0938	61.2031
Length   18, alignment  2, c -65:	46.1094	46.9375	62.8281
Length   18, alignment  0, c -65:	48.2031	49.75	63.9219
Length   19, alignment  3, c -65:	48.9688	50.1562	64.8281
Length   19, alignment  0, c -65:	52.75	54.0625	64.75
Length   20, alignment  4, c -65:	41.7656	43.4219	67.5156
Length   20, alignment  0, c -65:	40.9688	42.5938	66.4219
Length   21, alignment  5, c -65:	49.2188	48.9844	68.4219
Length   21, alignment  0, c -65:	41.5156	43.0156	68.7969
Length   22, alignment  6, c -65:	43.2188	43.3906	70.8281
Length   22, alignment  0, c -65:	48.4844	49.4531	71.1562
Length   23, alignment  7, c -65:	47.8125	49.7812	72.8594
Length   23, alignment  0, c -65:	52.3906	53.7031	72.8125
Length   24, alignment  0, c -65:	50.625	52.2812	74.125
Length   24, alignment  0, c -65:	50.9531	52.3438	74.875
Length   25, alignment  1, c -65:	48.1875	49.7812	76.8125
Length   25, alignment  0, c -65:	50.875	52.9688	76.8438
Length   26, alignment  2, c -65:	51.2188	52.9219	77.75
Length   26, alignment  0, c -65:	53.3438	55.0156	79.5
Length   27, alignment  3, c -65:	55.0312	55.625	80.8438
Length   27, alignment  0, c -65:	59.4375	60.0312	81.125
Length   28, alignment  4, c -65:	47.4062	48.6406	82.7969
Length   28, alignment  0, c -65:	48.0625	48.375	82.5156
Length   29, alignment  5, c -65:	54.5312	55.3594	84.125
Length   29, alignment  0, c -65:	48.2656	50.0469	85.5625
Length   30, alignment  6, c -65:	47.8438	49.3594	86.7969
Length   30, alignment  0, c -65:	54.125	56.1094	86.8125
Length   31, alignment  7, c -65:	54.5	57.5312	89.8594
Length   31, alignment  0, c -65:	59.0938	61.1562	89.2188
Length   14, alignment  1, c -65:	47.4062	48.3281	54.7969
Length 1024, alignment  3, c -65:	387.25	388.016	2075.61
Length   64, alignment  4, c -65:	58.875	60.1406	154.469
Length   25, alignment  2, c -65:	54.1406	54.3438	76.4375
Length    1, alignment  0, c  0:	27.1094	27.3281	21.2031
Length    2, alignment  0, c  0:	31.4062	32.625	27.9531
Length    4, alignment  0, c  0:	42.4375	43.9531	42.125
Length    8, alignment  0, c  0:	35.1719	36.7031	43.1719
Length   16, alignment  0, c  0:	43.3281	44.8906	59.1719
Length   32, alignment  0, c  0:	56.0938	57.2812	90.9062
Length   64, alignment  0, c  0:	58.0469	59.3281	155.219
Length  128, alignment  0, c  0:	80.2031	80.4375	282.656
Length  256, alignment  0, c  0:	122.156	123.328	538.859
Length  512, alignment  0, c  0:	208.047	209.047	1050.86
Length 1024, alignment  0, c  0:	378.641	379.609	2075.19
Length 2048, alignment  0, c  0:	720.125	721.359	4123.41
Length 4096, alignment  0, c  0:	1402.62	1403.66	8218.95
Length 8192, alignment  0, c  0:	2769.31	2770.28	16525
Length 16384, alignment  0, c  0:	5500.47	5501.7	32796.2
Length 32768, alignment  0, c  0:	10961.9	10964	65564.6
Length 65536, alignment  0, c  0:	21992.5	21886.5	131206
Length 131072, alignment  0, c  0:	43729.1	43730.8	262499
Length    1, alignment  1, c  0:	26.6719	27.75	21.7188
Length    2, alignment  2, c  0:	31.7812	32.6562	27.6562
Length    3, alignment  3, c  0:	36.75	37.9844	41.4219
Length    3, alignment  0, c  0:	36.8281	39	32.0938
Length    4, alignment  4, c  0:	42.0625	43.2656	43.3906
Length    5, alignment  5, c  0:	47.9062	49.4688	46.1875
Length    5, alignment  0, c  0:	48.5625	49.4531	43.4531
Length    6, alignment  6, c  0:	46.3281	46.8281	46.1875
Length    6, alignment  0, c  0:	46	47.625	46.1719
Length    7, alignment  7, c  0:	48.6406	49.5938	41.1719
Length    7, alignment  0, c  0:	47.9062	50.2812	40.875
Length    8, alignment  0, c  0:	34.5156	35.6094	42.7969
Length    9, alignment  1, c  0:	38.4062	40.7188	44.8594
Length    9, alignment  0, c  0:	35.6562	36.2969	45.5
Length   10, alignment  2, c  0:	34.6094	35.3438	47.5312
Length   10, alignment  0, c  0:	40.9219	41.2969	47.125
Length   11, alignment  3, c  0:	40.25	41.7344	48.8438
Length   11, alignment  0, c  0:	45.8594	46.2656	48.4688
Length   12, alignment  4, c  0:	34.2344	35.2812	50.4688
Length   12, alignment  0, c  0:	34.4844	34.5938	51.5469
Length   13, alignment  5, c  0:	40	41.5781	52.4531
Length   13, alignment  0, c  0:	34.9688	36.0469	53.1719
Length   14, alignment  6, c  0:	33.9375	34.2656	54.7969
Length   14, alignment  0, c  0:	40.2188	41.2656	55.1719
Length   15, alignment  7, c  0:	40.1719	41.5938	56.7656
Length   15, alignment  0, c  0:	45.8594	47.3594	56.8281
Length   16, alignment  0, c  0:	44.1562	45.625	59.8594
Length   17, alignment  1, c  0:	40.7812	41.6562	60.5156
Length   17, alignment  0, c  0:	43.7656	44.625	61.5312
Length   18, alignment  2, c  0:	44.1094	45.2812	63.1406
Length   18, alignment  0, c  0:	47.2812	46.9062	63.5156
Length   19, alignment  3, c  0:	46.2031	47.2969	64.0781
Length   19, alignment  0, c  0:	52.0625	52.6406	64.4844
Length   20, alignment  4, c  0:	39.7344	41.5625	66.75
Length   20, alignment  0, c  0:	41.0156	41.5781	67.1719
Length   21, alignment  5, c  0:	45.5156	47.6406	69.9531
Length   21, alignment  0, c  0:	40.5	42.125	68.1719
Length   22, alignment  6, c  0:	40.8906	41.3594	70.7969
Length   22, alignment  0, c  0:	46.8594	47.6875	71.7656
Length   23, alignment  7, c  0:	46.7656	47.6875	72.8438
Length   23, alignment  0, c  0:	51.7656	53.7031	73.5
Length   24, alignment  0, c  0:	50.0156	51.3125	74.9062
Length   24, alignment  0, c  0:	49.2188	50.875	74.7969
Length   25, alignment  1, c  0:	47.2812	47.2656	76.8281
Length   25, alignment  0, c  0:	50.2812	51.2969	77.2344
Length   26, alignment  2, c  0:	50.3594	51.1562	79.125
Length   26, alignment  0, c  0:	52.9688	54.3594	78.7969
Length   27, alignment  3, c  0:	52.4375	53.6719	81.2188
Length   27, alignment  0, c  0:	56.7812	59	81.8125
Length   28, alignment  4, c  0:	46.1875	46.9688	83.2031
Length   28, alignment  0, c  0:	46.6875	49.0781	83.4688
Length   29, alignment  5, c  0:	52.5938	54.0312	84.1562
Length   29, alignment  0, c  0:	47.1875	48.3281	84.8438
Length   30, alignment  6, c  0:	46.5156	46.8906	86.125
Length   30, alignment  0, c  0:	51.8281	52.9531	87.2188
Length   31, alignment  7, c  0:	162.359	53.9688	88.8281
Length   31, alignment  0, c  0:	66.2031	67.7344	88.2656
Length   14, alignment  1, c  0:	51.9531	53.1875	55.5625
Length 1024, alignment  3, c  0:	394.328	395.547	2075.14
Length   64, alignment  4, c  0:	58.875	59.2969	154.094
Length   25, alignment  2, c  0:	60.8438	61.1562	77.5312
Length    1, alignment  0, c 65:	26.0156	27.2656	21.5312
Length    2, alignment  0, c 65:	31.6875	32.5938	28.3594
Length    4, alignment  0, c 65:	42.4219	43.7188	43.8281
Length    8, alignment  0, c 65:	35.4375	36.5	42.5781
Length   16, alignment  0, c 65:	45.4062	46.5625	58.5
Length   32, alignment  0, c 65:	57.125	58.2969	91.1562
Length   64, alignment  0, c 65:	58.3438	60.375	155.5
Length  128, alignment  0, c 65:	79.9219	80.5156	282.344
Length  256, alignment  0, c 65:	122.469	123.25	539.172
Length  512, alignment  0, c 65:	208.469	210.031	1051.55
Length 1024, alignment  0, c 65:	378.75	380.422	2075.86
Length 2048, alignment  0, c 65:	719.75	721.391	4123.03
Length 4096, alignment  0, c 65:	1402.06	1403.61	8218.97
Length 8192, alignment  0, c 65:	2768.73	2769.47	16412
Length 16384, alignment  0, c 65:	5500.17	5501.12	32795.6
Length 32768, alignment  0, c 65:	10960.2	10962.8	65672.6
Length 65536, alignment  0, c 65:	21883.6	21885.1	131206
Length 131072, alignment  0, c 65:	43729.1	43732.8	262367
Length    1, alignment  1, c 65:	26.8438	28.375	21.3594
Length    2, alignment  2, c 65:	32.4375	33.3594	27.2656
Length    3, alignment  3, c 65:	37.1094	38.2812	42.1562
Length    3, alignment  0, c 65:	37.875	38.0312	40.5
Length    4, alignment  4, c 65:	42.2031	44.375	37.9062
Length    5, alignment  5, c 65:	48.9531	49.5312	45.8125
Length    5, alignment  0, c 65:	48.9844	51	44.8438
Length    6, alignment  6, c 65:	46.3438	48.6719	47.8125
Length    6, alignment  0, c 65:	46.2812	47.9531	46.5156
Length    7, alignment  7, c 65:	49.6719	51.1875	41.2344
Length    7, alignment  0, c 65:	48.9531	51.2031	41.125
Length    8, alignment  0, c 65:	34.7188	35.25	43.2656
Length    9, alignment  1, c 65:	40.8281	41.75	45.2031
Length    9, alignment  0, c 65:	36.1094	36.7031	45.1719
Length   10, alignment  2, c 65:	36.3125	36.7031	47.1719
Length   10, alignment  0, c 65:	41.0625	43.0469	47.5
Length   11, alignment  3, c 65:	41.5938	43.0625	49.5625
Length   11, alignment  0, c 65:	46.2188	48.0625	48.7969
Length   12, alignment  4, c 65:	35.3281	35.7188	51.5
Length   12, alignment  0, c 65:	34.5938	36.4062	51.5312
Length   13, alignment  5, c 65:	42.3594	43.0469	52.4844
Length   13, alignment  0, c 65:	34.3125	35.7344	53.1406
Length   14, alignment  6, c 65:	35.9688	36.2969	55.1719
Length   14, alignment  0, c 65:	42.2656	43.0312	54.8281
Length   15, alignment  7, c 65:	42.9531	43.7188	56.8281
Length   15, alignment  0, c 65:	46.9219	48.3125	56.5
Length   16, alignment  0, c 65:	44.7656	46.5781	59.2188
Length   17, alignment  1, c 65:	42.75	44.4375	61.5
Length   17, alignment  0, c 65:	45.0781	46.6406	60.8594
Length   18, alignment  2, c 65:	46.0625	47.3281	63.1562
Length   18, alignment  0, c 65:	48.8438	49.4062	62.8438
Length   19, alignment  3, c 65:	49.2188	50.3594	64.7969
Length   19, alignment  0, c 65:	53.0625	54.5938	65.5
Length   20, alignment  4, c 65:	41.8594	43.6875	66.4688
Length   20, alignment  0, c 65:	41.7188	44.0156	67.2188
Length   21, alignment  5, c 65:	48.5	49.7812	69.4844
Length   21, alignment  0, c 65:	42.1875	43.7344	69.5
Length   22, alignment  6, c 65:	42.5469	44.375	71.8125
Length   22, alignment  0, c 65:	48.625	49.75	71.1719
Length   23, alignment  7, c 65:	48.8594	50.0312	73.1406
Length   23, alignment  0, c 65:	53.5781	53.7188	72.5
Length   24, alignment  0, c 65:	51.0625	52.5938	75.2344
Length   24, alignment  0, c 65:	51.5781	52.3281	75.2031
Length   25, alignment  1, c 65:	48.75	49.2969	79
Length   25, alignment  0, c 65:	51.7188	52.2812	77.2031
Length   26, alignment  2, c 65:	52.0156	52.9219	79.2344
Length   26, alignment  0, c 65:	54.1719	54.9375	79.1562
Length   27, alignment  3, c 65:	54.75	55.6875	81.1562
Length   27, alignment  0, c 65:	59	60.6875	80.8438
Length   28, alignment  4, c 65:	47.8438	49.6406	82.7969
Length   28, alignment  0, c 65:	48.0469	48.9844	83.875
Length   29, alignment  5, c 65:	54.625	56.1719	85.9062
Length   29, alignment  0, c 65:	48.5625	49.3125	85.875
Length   30, alignment  6, c 65:	48.8906	50.75	86.5156
Length   30, alignment  0, c 65:	54.2344	55.0625	87.5
Length   31, alignment  7, c 65:	54.8125	56.0938	88.4844
Length   31, alignment  0, c 65:	59.1094	60.3125	88.8125
Length   14, alignment  1, c 65:	47.1094	48.0312	54.7812
Length 1024, alignment  3, c 65:	387.531	388.375	2074.36
Length   64, alignment  4, c 65:	58.5781	60.4062	155.219
Length   25, alignment  2, c 65:	54.7188	55.375	77.1875
Length    1, alignment  0, c 130:	26.1094	28.0625	20.9375
Length    2, alignment  0, c 130:	30.7188	32.7031	27.25
Length    4, alignment  0, c 130:	42.75	43.375	38.2812
Length    8, alignment  0, c 130:	34.5781	35.6719	43.5312
Length   16, alignment  0, c 130:	44.5312	46.2812	59.1562
Length   32, alignment  0, c 130:	57	58.9219	90.8438
Length   64, alignment  0, c 130:	58.25	60.0781	155.516
Length  128, alignment  0, c 130:	80.2188	81.3281	283.031
Length  256, alignment  0, c 130:	122.094	123.688	539.188
Length  512, alignment  0, c 130:	207.062	209.094	1051.2
Length 1024, alignment  0, c 130:	378.094	379.953	2075.52
Length 2048, alignment  0, c 130:	719.781	720.344	4122.64
Length 4096, alignment  0, c 130:	1402.52	1403.28	8218.59
Length 8192, alignment  0, c 130:	2768.67	2769.53	16411.6
Length 16384, alignment  0, c 130:	5500.61	5624.25	32796.9
Length 32768, alignment  0, c 130:	10960.8	10962.3	65564.1
Length 65536, alignment  0, c 130:	21992	21885.5	131207
Length 131072, alignment  0, c 130:	43729	43731.3	262320
Length    1, alignment  1, c 130:	26.4688	27.7969	21.6719
Length    2, alignment  2, c 130:	30.9688	32.7188	28
Length    3, alignment  3, c 130:	36.5938	38	32.8125
Length    3, alignment  0, c 130:	37.0781	38.0156	40.1719
Length    4, alignment  4, c 130:	41.8125	43.7188	43.0469
Length    5, alignment  5, c 130:	49.0938	51	43.875
Length    5, alignment  0, c 130:	48.6406	50.0312	46.2656
Length    6, alignment  6, c 130:	46.2344	47.375	46.25
Length    6, alignment  0, c 130:	46.7969	47.5625	48.1562
Length    7, alignment  7, c 130:	48.5625	49.5625	40.8281
Length    7, alignment  0, c 130:	48.5781	50.1875	40.8594
Length    8, alignment  0, c 130:	35.2344	36.6875	42.4219
Length    9, alignment  1, c 130:	40.3594	41.7031	44.4844
Length    9, alignment  0, c 130:	35.0469	36.375	44.5156
Length   10, alignment  2, c 130:	35.9844	37.7344	46.0781
Length   10, alignment  0, c 130:	41.6406	42.7188	47.6094
Length   11, alignment  3, c 130:	41.7656	43.75	48.1875
Length   11, alignment  0, c 130:	46.6406	47.7656	49.2031
Length   12, alignment  4, c 130:	35.0625	36.1094	50.8281
Length   12, alignment  0, c 130:	33.7656	35.3594	50.8125
Length   13, alignment  5, c 130:	41.6562	42.7812	52.4219
Length   13, alignment  0, c 130:	34.2812	35.375	52.8594
Length   14, alignment  6, c 130:	35.2812	35.6875	54.0781
Length   14, alignment  0, c 130:	40.8438	42.6875	54.8281
Length   15, alignment  7, c 130:	42.1562	43.7188	56.75
Length   15, alignment  0, c 130:	46.7969	47.9688	56.4219
Length   16, alignment  0, c 130:	45.375	46.3125	59.125
Length   17, alignment  1, c 130:	42.2344	43.3281	60.4062
Length   17, alignment  0, c 130:	45.0781	47.0156	61.9062
Length   18, alignment  2, c 130:	45.4375	46.6094	63.5156
Length   18, alignment  0, c 130:	48	48.6562	62.125
Length   19, alignment  3, c 130:	47.8438	49.4531	64.5156
Length   19, alignment  0, c 130:	53.125	55.1875	64.5312
Length   20, alignment  4, c 130:	41.5156	43.3438	67.1562
Length   20, alignment  0, c 130:	41.6875	42	66.3906
Length   21, alignment  5, c 130:	48.2031	49.7188	68.4688
Length   21, alignment  0, c 130:	42.1406	42.7188	68.1406
Length   22, alignment  6, c 130:	42.4375	44.5156	70.8125
Length   22, alignment  0, c 130:	47.875	48.6562	69.75
Length   23, alignment  7, c 130:	47.8125	50.7812	72.7812
Length   23, alignment  0, c 130:	53.0938	54.7344	72.5
Length   24, alignment  0, c 130:	50.6562	52.9219	75.5312
Length   24, alignment  0, c 130:	50.5156	51.5312	74.4688
Length   25, alignment  1, c 130:	47.8594	49.4688	76.4688
Length   25, alignment  0, c 130:	51.0312	51.1875	76.1875
Length   26, alignment  2, c 130:	52.5	54.3594	79.5
Length   26, alignment  0, c 130:	54.1719	55.3906	78.8281
Length   27, alignment  3, c 130:	54.5156	55.6875	80.8125
Length   27, alignment  0, c 130:	58.7188	59.75	81.5
Length   28, alignment  4, c 130:	47.8438	48.6875	83.8594
Length   28, alignment  0, c 130:	47.6875	48.3438	83.1562
Length   29, alignment  5, c 130:	54.4844	56.4531	84.7812
Length   29, alignment  0, c 130:	47.9219	49.8125	85.125
Length   30, alignment  6, c 130:	48.7812	50	87.125
Length   30, alignment  0, c 130:	53.9062	56.2031	86.8438
Length   31, alignment  7, c 130:	54.125	56.4375	89.5
Length   31, alignment  0, c 130:	59.3906	61.4375	88.8281
Length   14, alignment  1, c 130:	46.7812	49.5	55.2656
Length 1024, alignment  3, c 130:	387.703	388.469	2074.84
Length   64, alignment  4, c 130:	59.2188	59.4375	155.188
Length   25, alignment  2, c 130:	54.0469	55.4531	185.969

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-18 17:41           ` Steve Ellcey
  2013-09-19 15:25             ` Carlos O'Donell
@ 2013-09-20 16:43             ` Joseph S. Myers
  2013-09-20 17:32               ` Steve Ellcey
  1 sibling, 1 reply; 32+ messages in thread
From: Joseph S. Myers @ 2013-09-20 16:43 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Carlos O'Donell, libc-ports

On Wed, 18 Sep 2013, Steve Ellcey wrote:

> 2013-09-18  Steve Ellcey  <sellcey@mips.com>
> 
> 	* sysdeps/mips/memset.S: Change prefetching and add loop unrolling. 
> 	* sysdeps/mips/mips64/memset.S: Remove.

OK, though I believe "proAptive" in a comment should be "proAptiv".

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-20 16:43             ` Joseph S. Myers
@ 2013-09-20 17:32               ` Steve Ellcey
  2013-12-12 22:19                 ` Andrew Pinski
  0 siblings, 1 reply; 32+ messages in thread
From: Steve Ellcey @ 2013-09-20 17:32 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: Carlos O'Donell, libc-ports

On Fri, 2013-09-20 at 16:43 +0000, Joseph S. Myers wrote:
> On Wed, 18 Sep 2013, Steve Ellcey wrote:
> 
> > 2013-09-18  Steve Ellcey  <sellcey@mips.com>
> > 
> > 	* sysdeps/mips/memset.S: Change prefetching and add loop unrolling. 
> > 	* sysdeps/mips/mips64/memset.S: Remove.
> 
> OK, though I believe "proAptive" in a comment should be "proAptiv".

You are right.  I fixed that and checked in the patch.

Steve Ellcey
sellcey@mips.com


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-09-20 17:32               ` Steve Ellcey
@ 2013-12-12 22:19                 ` Andrew Pinski
  2013-12-13  0:01                   ` Carlos O'Donell
  0 siblings, 1 reply; 32+ messages in thread
From: Andrew Pinski @ 2013-12-12 22:19 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Joseph S. Myers, Carlos O'Donell, libc-ports

On Fri, Sep 20, 2013 at 10:32 AM, Steve Ellcey <sellcey@mips.com> wrote:
> On Fri, 2013-09-20 at 16:43 +0000, Joseph S. Myers wrote:
>> On Wed, 18 Sep 2013, Steve Ellcey wrote:
>>
>> > 2013-09-18  Steve Ellcey  <sellcey@mips.com>
>> >
>> >     * sysdeps/mips/memset.S: Change prefetching and add loop unrolling.
>> >     * sysdeps/mips/mips64/memset.S: Remove.
>>
>> OK, though I believe "proAptive" in a comment should be "proAptiv".
>
> You are right.  I fixed that and checked in the patch.


I noticed this patch causes some performance regressions on Octeon,
which has 128-byte cache lines.
Changing PREFETCH_CHUNK/PREFETCH_FOR_STORE to assume a 128-byte cache
line gives us the performance back and improves on the original code
by at least 15%.
That is:
#  define PREFETCH_CHUNK 128
#  define PREFETCH_FOR_STORE(chunk, reg) \
    pref PREFETCH_STORE_HINT, (chunk)*128(reg);

Thanks,
Andrew Pinski

>
> Steve Ellcey
> sellcey@mips.com
>
>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-12-12 22:19                 ` Andrew Pinski
@ 2013-12-13  0:01                   ` Carlos O'Donell
  2013-12-13  0:14                     ` Steve Ellcey
  0 siblings, 1 reply; 32+ messages in thread
From: Carlos O'Donell @ 2013-12-13  0:01 UTC (permalink / raw)
  To: Andrew Pinski
  Cc: Steve Ellcey, Joseph S. Myers, Carlos O'Donell, libc-ports

On Thu, Dec 12, 2013 at 5:19 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Fri, Sep 20, 2013 at 10:32 AM, Steve Ellcey <sellcey@mips.com> wrote:
>> On Fri, 2013-09-20 at 16:43 +0000, Joseph S. Myers wrote:
>>> On Wed, 18 Sep 2013, Steve Ellcey wrote:
>>>
>>> > 2013-09-18  Steve Ellcey  <sellcey@mips.com>
>>> >
>>> >     * sysdeps/mips/memset.S: Change prefetching and add loop unrolling.
>>> >     * sysdeps/mips/mips64/memset.S: Remove.
>>>
>>> OK, though I believe "proAptive" in a comment should be "proAptiv".
>>
>> You are right.  I fixed that and checked in the patch.
>
>
> I noticed this patch causes some performance regressions on Octeon due
> to having 128 byte cache lines.
> Changing PREFETCH_CHUNK/PREFETCH_FOR_STORE to assume 128 byte cache
> line gives us the performance back and improves over the original code
> at least 15%.
> That is:
> #  define PREFETCH_CHUNK 128
> #  define PREFETCH_FOR_STORE(chunk, reg) \
>     pref PREFETCH_STORE_HINT, (chunk)*128(reg);

Submit a patch for that?

We have microbenchmarks now, but the next hardest
part is going to be archiving data by device so that
the community can help track performance and point
out regressions like this.

Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-12-13  0:01                   ` Carlos O'Donell
@ 2013-12-13  0:14                     ` Steve Ellcey
  2013-12-13  0:22                       ` Andrew Pinski
  2013-12-13  4:40                       ` Carlos O'Donell
  0 siblings, 2 replies; 32+ messages in thread
From: Steve Ellcey @ 2013-12-13  0:14 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: Andrew Pinski, Joseph S. Myers, Carlos O'Donell, libc-ports

On Thu, 2013-12-12 at 19:01 -0500, Carlos O'Donell wrote:

> > I noticed this patch causes some performance regressions on Octeon due
> > to having 128 byte cache lines.
> > Changing PREFETCH_CHUNK/PREFETCH_FOR_STORE to assume 128 byte cache
> > line gives us the performance back and improves over the original code
> > at least 15%.
> > That is:
> > #  define PREFETCH_CHUNK 128
> > #  define PREFETCH_FOR_STORE(chunk, reg) \
> >     pref PREFETCH_STORE_HINT, (chunk)*128(reg);
> 
> Submit a patch for that?
> 
> We have microbenchmarks now, but the next hardest
> part is going to be archiving data by device so that
> the community can help track performance and point
> out regressions like this.
> 
> Cheers,
> Carlos.

Unless the change is under some kind of ifdef for Octeon, changing this
will probably slow down other MIPS chips.  Most of them have 32-byte
cache lines.

Steve Ellcey
sellcey@mips.com


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-12-13  0:14                     ` Steve Ellcey
@ 2013-12-13  0:22                       ` Andrew Pinski
  2013-12-13  4:40                       ` Carlos O'Donell
  1 sibling, 0 replies; 32+ messages in thread
From: Andrew Pinski @ 2013-12-13  0:22 UTC (permalink / raw)
  To: Steve Ellcey
  Cc: Carlos O'Donell, Joseph S. Myers, Carlos O'Donell, libc-ports

On Thu, Dec 12, 2013 at 4:14 PM, Steve Ellcey <sellcey@mips.com> wrote:
> On Thu, 2013-12-12 at 19:01 -0500, Carlos O'Donell wrote:
>
>> > I noticed this patch causes some performance regressions on Octeon due
>> > to having 128 byte cache lines.
>> > Changing PREFETCH_CHUNK/PREFETCH_FOR_STORE to assume 128 byte cache
>> > line gives us the performance back and improves over the original code
>> > at least 15%.
>> > That is:
>> > #  define PREFETCH_CHUNK 128
>> > #  define PREFETCH_FOR_STORE(chunk, reg) \
>> >     pref PREFETCH_STORE_HINT, (chunk)*128(reg);
>>
>> Submit a patch for that?
>>
>> We have microbenchmarks now, but the next hardest
>> part is going to be archiving data by device so that
>> the community can help track performance and point
>> out regressions like this.
>>
>> Cheers,
>> Carlos.
>
> Unless the change is under some kind of ifdef for Octeon changing this
> will probably slow down other MIPS chips.  Most of them have 32 byte
> cache lines.

I think once ifunc support has been finalized, it would be useful to
build a memset optimized for Octeon, but until then I think it might be
best to disable the prefetching in the generic MIPS code.

Thanks,
Andrew Pinski


>
> Steve Ellcey
> sellcey@mips.com
>
>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [patch, mips] Improved memset for MIPS
  2013-12-13  0:14                     ` Steve Ellcey
  2013-12-13  0:22                       ` Andrew Pinski
@ 2013-12-13  4:40                       ` Carlos O'Donell
  1 sibling, 0 replies; 32+ messages in thread
From: Carlos O'Donell @ 2013-12-13  4:40 UTC (permalink / raw)
  To: Steve Ellcey, Carlos O'Donell
  Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On 12/12/2013 07:14 PM, Steve Ellcey wrote:
> On Thu, 2013-12-12 at 19:01 -0500, Carlos O'Donell wrote:
> 
>>> I noticed this patch causes some performance regressions on Octeon due
>>> to having 128 byte cache lines.
>>> Changing PREFETCH_CHUNK/PREFETCH_FOR_STORE to assume 128 byte cache
>>> line gives us the performance back and improves over the original code
>>> at least 15%.
>>> That is:
>>> #  define PREFETCH_CHUNK 128
>>> #  define PREFETCH_FOR_STORE(chunk, reg) \
>>>     pref PREFETCH_STORE_HINT, (chunk)*128(reg);
>>
>> Submit a patch for that?
>>
>> We have microbenchmarks now, but the next hardest
>> part is going to be archiving data by device so that
>> the community can help track performance and point
>> out regressions like this.
>>
>> Cheers,
>> Carlos.
> 
> Unless the change is under some kind of ifdef for Octeon changing this
> will probably slow down other MIPS chips.  Most of them have 32 byte
> cache lines.

Absolutely. I don't suggest he just change it, but Andrew would have to add
enough framework for Octeon to be enabled with an optimal implementation.
For example, you could compile an alternate version with 128-byte cache
line support and select it via IFUNC based on AT_HWCAP?

Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2013-12-13  4:40 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-09-05 17:06 [patch, mips] Improved memset for MIPS Steve Ellcey 
2013-09-06  0:40 ` Mike Frysinger
2013-09-06 15:42   ` Steve Ellcey
2013-09-06  4:18 ` Carlos O'Donell
2013-09-06 16:03   ` Steve Ellcey
2013-09-06 17:12     ` Carlos O'Donell
2013-09-06 23:33       ` Steve Ellcey
2013-09-07  2:38         ` Carlos O'Donell
2013-09-10 20:31           ` Steve Ellcey
2013-09-10 21:01             ` Carlos O'Donell
2013-09-10 21:14               ` Steve Ellcey
2013-09-10 22:35                 ` Carlos O'Donell
2013-09-10 22:38                   ` Carlos O'Donell
2013-09-07  5:46         ` Andreas Schwab
2013-09-06 14:31 ` Joseph S. Myers
2013-09-06 15:58   ` Steve Ellcey
2013-09-06 16:09     ` Joseph S. Myers
2013-09-06 16:50       ` Steve Ellcey
2013-09-06 16:59         ` Joseph S. Myers
2013-09-06 17:43           ` Steve Ellcey
2013-09-06 18:57             ` Brooks Moses
2013-09-18 17:41           ` Steve Ellcey
2013-09-19 15:25             ` Carlos O'Donell
2013-09-19 17:02               ` Steve Ellcey
2013-09-20 16:43             ` Joseph S. Myers
2013-09-20 17:32               ` Steve Ellcey
2013-12-12 22:19                 ` Andrew Pinski
2013-12-13  0:01                   ` Carlos O'Donell
2013-12-13  0:14                     ` Steve Ellcey
2013-12-13  0:22                       ` Andrew Pinski
2013-12-13  4:40                       ` Carlos O'Donell
2013-09-06 16:59       ` Steve Ellcey

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).