From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <libc-ports-return-3348-listarch-libc-ports=sources.redhat.com@sourceware.org>
Received: (qmail 16539 invoked by alias); 3 Nov 2012 03:42:44 -0000
Received: (qmail 16531 invoked by uid 22791); 3 Nov 2012 03:42:43 -0000
X-SWARE-Spam-Status: No, hits=-1.6 required=5.0	tests=AWL,BAYES_00,DATE_IN_PAST_06_12,KHOP_SPAMHAUS_DROP,RP_MATCHES_RCVD,TW_BW,TW_CP,TW_EG
X-Spam-Check-By: sourceware.org
Received: from usmamail.tilera.com (HELO USMAMAIL.TILERA.COM) (12.216.194.151)    by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Sat, 03 Nov 2012 03:42:35 +0000
Received: from farm-0002.internal.tilera.com (10.2.0.32) by USMAEXCH2.tad.internal.tilera.com (10.3.0.33) with Microsoft SMTP Server (TLS) id 14.0.694.0; Fri, 2 Nov 2012 23:42:34 -0400
Received: (from cmetcalf@localhost)	by farm-0002.internal.tilera.com (8.14.4/8.12.11/Submit) id qA33gX9m008629;	Fri, 2 Nov 2012 23:42:33 -0400
Message-ID: <201211030342.qA33gX9m008629@farm-0002.internal.tilera.com>
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Sat, 03 Nov 2012 03:42:00 -0000
Subject: [PATCH] Optimize tile (mostly tilegx) memcpy and memmove performance.
To: <libc-ports@sourceware.org>
MIME-Version: 1.0
Content-Type: text/plain
X-IsSubscribed: yes
Mailing-List: contact libc-ports-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-ports.sourceware.org>
List-Subscribe: <mailto:libc-ports-subscribe@sourceware.org>
List-Post: <mailto:libc-ports@sourceware.org>
List-Help: <mailto:libc-ports-help@sourceware.org>, <http://sourceware.org/lists.html#faqs>
Sender: libc-ports-owner@sourceware.org
X-SW-Source: 2012-11/txt/msg00007.txt.bz2

2012-11-02  Chris Metcalf  <cmetcalf@tilera.com>

	* sysdeps/tile/tilegx/memcpy.c (__memcpy): Optimize.
	* sysdeps/tile/memcopy.h: New file.
	* sysdeps/tile/wordcopy.c: New file.

- Override <memcopy.h> so we use full 8-byte word copies on tilegx32
  for memmove, then use op_t in memcpy instead of the previous
  locally-defined word_t just to avoid proliferating identical types.
- Fix bug in memcpy prefetch that caused us to never prefetch past
  the first cache line.
- Optimize misaligned memcpy by inlining _wordcopy_fwd_dest_aligned
  instead of just doing a dumb word-at-a-time copy.
- Make memcpy safe for forward copies by doing all the loads from
  a given cache line prior to doing a wh64 (cache line zero-fill)
  on the destination.  Remove now-redundant src == dst check.
- Copy and optimize the generic wordcopy.c routines to use the tile
  "double align" instruction instead of the MERGE macro; to avoid
  offset addressing mode (which tile doesn't have) by rewriting the
  pointer math to load and store with a zero index; and to use
  post-increment addresses in the inner loops to improve scheduling.
---
 ports/sysdeps/tile/memcopy.h       |   27 +++
 ports/sysdeps/tile/tilegx/memcpy.c |  200 ++++++++++------
 ports/sysdeps/tile/wordcopy.c      |  449 ++++++++++++++++++++++++++++++++++++
 4 files changed, 615 insertions(+), 67 deletions(-)
 create mode 100644 ports/sysdeps/tile/memcopy.h
 create mode 100644 ports/sysdeps/tile/wordcopy.c

diff --git a/ports/sysdeps/tile/memcopy.h b/ports/sysdeps/tile/memcopy.h
new file mode 100644
index 0000000..2bc3fce
--- /dev/null
+++ b/ports/sysdeps/tile/memcopy.h
@@ -0,0 +1,27 @@
+/* memcopy.h -- definitions for memory copy functions.  Tile version.
+   Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/generic/memcopy.h>
+#include <bits/wordsize.h>
+
+/* Support more efficient copying on tilegx32, which supports
+   long long as a native 64-bit type.  */
+#if defined (__tilegx__) && __WORDSIZE == 32
+# undef op_t
+# define op_t	unsigned long long int
+#endif
diff --git a/ports/sysdeps/tile/tilegx/memcpy.c b/ports/sysdeps/tile/tilegx/memcpy.c
index dd6e30d..5b015f3 100644
--- a/ports/sysdeps/tile/tilegx/memcpy.c
+++ b/ports/sysdeps/tile/tilegx/memcpy.c
@@ -19,11 +19,9 @@
 #include <string.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <memcopy.h>
 #include <arch/chip.h>
 
-/* Must be 8 bytes in size. */
-#define word_t uint64_t
-
 /* How many cache lines ahead should we prefetch? */
 #define PREFETCH_LINES_AHEAD 3
 
@@ -34,8 +32,8 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
   const char *__restrict src1 = (const char *) srcv;
   const char *__restrict src1_end;
   const char *__restrict prefetch;
-  word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
-  word_t final; /* Final bytes to write to trailing word, if any */
+  op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
+  op_t final; /* Final bytes to write to trailing word, if any */
   long i;
 
   if (n < 16)
@@ -55,101 +53,169 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
     {
       __insn_prefetch (prefetch);
       prefetch += CHIP_L2_LINE_SIZE ();
-      prefetch = (prefetch > src1_end) ? prefetch : src1;
+      prefetch = (prefetch < src1_end) ? prefetch : src1;
     }
 
   /* Copy bytes until dst is word-aligned. */
-  for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
+  for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--)
     *dst1++ = *src1++;
 
   /* 8-byte pointer to destination memory. */
-  dst8 = (word_t *) dst1;
+  dst8 = (op_t *) dst1;
 
-  if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
+  if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0))
     {
-      /* Misaligned copy.  Copy 8 bytes at a time, but don't bother
-         with other fanciness.
-         TODO: Consider prefetching and using wh64 as well.  */
+      /* Misaligned copy.  Use glibc's _wordcopy_fwd_dest_aligned, but
+         inline it to avoid prologue/epilogue.  TODO: Consider
+         prefetching and using wh64 as well.  */
+      void * srci;
+      op_t a0, a1, a2, a3;
+      long int dstp = (long int) dst1;
+      long int srcp = (long int) src1;
+      long int len = n / OPSIZ;
 
-      /* Create an aligned src8. */
-      const word_t *__restrict src8 =
-        (const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
-      word_t b;
+      /* Save the initial source pointer so we know the number of
+         bytes to shift for merging two unaligned results.  */
+      srci = (void *) srcp;
 
-      word_t a = *src8++;
-      for (; n >= sizeof (word_t); n -= sizeof (word_t))
-        {
-          b = *src8++;
-          a = __insn_dblalign (a, b, src1);
-          *dst8++ = a;
-          a = b;
-        }
+      /* Make SRCP aligned by rounding it down to the beginning of the
+         `op_t' it points in the middle of.  */
+      srcp &= -OPSIZ;
+
+      switch (len % 4)
+	{
+	case 2:
+	  a1 = ((op_t *) srcp)[0];
+	  a2 = ((op_t *) srcp)[1];
+	  len += 2;
+	  srcp += 2 * OPSIZ;
+	  goto do1;
+	case 3:
+	  a0 = ((op_t *) srcp)[0];
+	  a1 = ((op_t *) srcp)[1];
+	  len += 1;
+	  srcp += 2 * OPSIZ;
+	  goto do2;
+	case 0:
+	  if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	    return dstv;
+	  a3 = ((op_t *) srcp)[0];
+	  a0 = ((op_t *) srcp)[1];
+	  len += 0;
+	  srcp += 2 * OPSIZ;
+	  goto do3;
+	case 1:
+	  a2 = ((op_t *) srcp)[0];
+	  a3 = ((op_t *) srcp)[1];
+	  srcp += 2 * OPSIZ;
+	  len -= 1;
+	  if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	    goto do0;
+	  goto do4;			/* No-op.  */
+	}
 
+      do
+	{
+	do4:
+	  a0 = ((op_t *) srcp)[0];
+	  a2 = __insn_dblalign (a2, a3, srci);
+	  ((op_t *) dstp)[0] = a2;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	do3:
+	  a1 = ((op_t *) srcp)[0];
+	  a3 = __insn_dblalign (a3, a0, srci);
+	  ((op_t *) dstp)[0] = a3;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	do2:
+	  a2 = ((op_t *) srcp)[0];
+	  a0 = __insn_dblalign (a0, a1, srci);
+	  ((op_t *) dstp)[0] = a0;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	do1:
+	  a3 = ((op_t *) srcp)[0];
+	  a1 = __insn_dblalign (a1, a2, srci);
+	  ((op_t *) dstp)[0] = a1;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	  len -= 4;
+	}
+      while (len != 0);
+
+      /* This is the right position for do0.  Please don't move
+         it into the loop.  */
+    do0:
+      ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci);
+
+      n = n % OPSIZ;
       if (n == 0)
-        return dstv;
+	return dstv;
 
-      b = ((const char *) src8 <= src1_end) ? *src8 : 0;
+      a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0;
 
-      /* Final source bytes to write to trailing partial word, if any. */
-      final = __insn_dblalign (a, b, src1);
+      final = __insn_dblalign (a3, a0, srci);
+      dst8 = (op_t *)(dstp + OPSIZ);
     }
   else
     {
       /* Aligned copy. */
 
-      const word_t *__restrict src8 = (const word_t *) src1;
+      const op_t *__restrict src8 = (const op_t *) src1;
 
       /* src8 and dst8 are both word-aligned. */
       if (n >= CHIP_L2_LINE_SIZE ())
         {
           /* Copy until 'dst' is cache-line-aligned. */
           for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
-               n -= sizeof (word_t))
+               n -= sizeof (op_t))
             *dst8++ = *src8++;
 
-          /* If copying to self, return.  The test is cheap enough
-             that we do it despite the fact that the memcpy() contract
-             doesn't require us to support overlapping dst and src.
-             This is the most common case of overlap, and any close
-             overlap will cause corruption due to the wh64 below.
-             This case is particularly important since the compiler
-             will emit memcpy() calls for aggregate copies even if it
-             can't prove that src != dst.  */
-          if (__builtin_expect (dst8 == src8, 0))
-            return dstv;
-
           for (; n >= CHIP_L2_LINE_SIZE ();)
-            {
-              __insn_wh64 (dst8);
-
-              /* Prefetch and advance to next line to prefetch, but
-                 don't go past the end.  */
-              __insn_prefetch (prefetch);
-              prefetch += CHIP_L2_LINE_SIZE ();
-              prefetch = (prefetch > src1_end) ? prefetch :
-                (const char *) src8;
-
-              /* Copy an entire cache line.  Manually unrolled to
-                 avoid idiosyncracies of compiler unrolling.  */
-#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
-              COPY_WORD (0);
-              COPY_WORD (1);
-              COPY_WORD (2);
-              COPY_WORD (3);
-              COPY_WORD (4);
-              COPY_WORD (5);
-              COPY_WORD (6);
-              COPY_WORD (7);
+	    {
+	      op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+	      /* Prefetch and advance to next line to prefetch, but
+		 don't go past the end.  */
+	      __insn_prefetch (prefetch);
+	      prefetch += CHIP_L2_LINE_SIZE ();
+	      prefetch = (prefetch < src1_end) ? prefetch :
+		(const char *) src8;
+
+	      /* Do all the loads before wh64.  This is necessary if
+		 [src8, src8+7] and [dst8, dst8+7] share the same
+		 cache line and dst8 <= src8, as can be the case when
+		 called from memmove, or with code tested on x86 whose
+		 memcpy always works with forward copies.  */
+	      tmp0 = *src8++;
+	      tmp1 = *src8++;
+	      tmp2 = *src8++;
+	      tmp3 = *src8++;
+	      tmp4 = *src8++;
+	      tmp5 = *src8++;
+	      tmp6 = *src8++;
+	      tmp7 = *src8++;
+
+	      __insn_wh64 (dst8);
+
+	      *dst8++ = tmp0;
+	      *dst8++ = tmp1;
+	      *dst8++ = tmp2;
+	      *dst8++ = tmp3;
+	      *dst8++ = tmp4;
+	      *dst8++ = tmp5;
+	      *dst8++ = tmp6;
+	      *dst8++ = tmp7;
+
+	      n -= 64;
+	    }
 #if CHIP_L2_LINE_SIZE() != 64
 # error "Fix code that assumes particular L2 cache line size."
 #endif
-
-              dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
-              src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
-            }
         }
 
-      for (; n >= sizeof (word_t); n -= sizeof (word_t))
+      for (; n >= sizeof (op_t); n -= sizeof (op_t))
         *dst8++ = *src8++;
 
       if (__builtin_expect (n == 0, 1))
diff --git a/ports/sysdeps/tile/wordcopy.c b/ports/sysdeps/tile/wordcopy.c
new file mode 100644
index 0000000..f978d8f
--- /dev/null
+++ b/ports/sysdeps/tile/wordcopy.c
@@ -0,0 +1,449 @@
+/* wordcopy.c -- subroutines for memory copy functions.  Tile version.
+   Copyright (C) 1991-2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* To optimize for tile, we make the following changes from the
+   default glibc version:
+   - Use the double align instruction instead of the MERGE macro.
+   - Since we don't have offset addressing mode, make sure the loads /
+     stores in the inner loop always have indices of 0.
+   - Use post-increment addresses in the inner loops, which yields
+     better scheduling.  */
+
+/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...!  */
+
+#include <stddef.h>
+#include <memcopy.h>
+
+/* Provide the appropriate dblalign builtin to shift two registers
+   based on the alignment of a pointer held in a third register.  */
+#ifdef __tilegx__
+#define DBLALIGN __insn_dblalign
+#else
+#define DBLALIGN __insn_dword_align
+#endif
+
+/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
+   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+   Both SRCP and DSTP should be aligned for memory operations on `op_t's.  */
+
+void
+_wordcopy_fwd_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  op_t a0, a1;
+
+  switch (len % 8)
+    {
+    case 2:
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 6;
+      goto do1;
+    case 3:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 5;
+      goto do2;
+    case 4:
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 4;
+      goto do3;
+    case 5:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 3;
+      goto do4;
+    case 6:
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 2;
+      goto do5;
+    case 7:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 1;
+      goto do6;
+
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      goto do7;
+    case 1:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do8;			/* No-op.  */
+    }
+
+  do
+    {
+    do8:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do7:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do6:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do5:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do3:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do2:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do1:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+
+      len -= 8;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = a1;
+}
+
+/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
+   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+   DSTP should be aligned for memory operations on `op_t's, but SRCP must
+   *not* be aligned.  */
+
+void
+_wordcopy_fwd_dest_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  void * srci;
+  op_t a0, a1, a2, a3;
+
+  /* Save the initial source pointer so we know the number of bytes to
+     shift for merging two unaligned results.  */
+  srci = (void *) srcp;
+
+  /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
+     it points in the middle of.  */
+  srcp &= -OPSIZ;
+
+  switch (len % 4)
+    {
+    case 2:
+      a1 = ((op_t *) srcp)[0];
+      a2 = ((op_t *) srcp)[1];
+      len += 2;
+      srcp += 2 * OPSIZ;
+      goto do1;
+    case 3:
+      a0 = ((op_t *) srcp)[0];
+      a1 = ((op_t *) srcp)[1];
+      len += 1;
+      srcp += 2 * OPSIZ;
+      goto do2;
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a3 = ((op_t *) srcp)[0];
+      a0 = ((op_t *) srcp)[1];
+      len += 0;
+      srcp += 2 * OPSIZ;
+      goto do3;
+    case 1:
+      a2 = ((op_t *) srcp)[0];
+      a3 = ((op_t *) srcp)[1];
+      srcp += 2 * OPSIZ;
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do4;			/* No-op.  */
+    }
+
+  do
+    {
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      a2 = DBLALIGN (a2, a3, srci);
+      ((op_t *) dstp)[0] = a2;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do3:
+      a1 = ((op_t *) srcp)[0];
+      a3 = DBLALIGN (a3, a0, srci);
+      ((op_t *) dstp)[0] = a3;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do2:
+      a2 = ((op_t *) srcp)[0];
+      a0 = DBLALIGN (a0, a1, srci);
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do1:
+      a3 = ((op_t *) srcp)[0];
+      a1 = DBLALIGN (a1, a2, srci);
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+      len -= 4;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = DBLALIGN (a2, a3, srci);
+}
+
+/* _wordcopy_bwd_aligned -- Copy block finishing right before
+   SRCP to block finishing right before DSTP with LEN `op_t' words
+   (not LEN bytes!).  Both SRCP and DSTP should be aligned for memory
+   operations on `op_t's.  */
+
+void
+_wordcopy_bwd_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  op_t a0, a1;
+  long int srcp1;
+
+  srcp1 = srcp - 1 * OPSIZ;
+  srcp -= 2 * OPSIZ;
+  dstp -= 1 * OPSIZ;
+
+  switch (len % 8)
+    {
+    case 2:
+      a0 = ((op_t *) srcp1)[0];
+      len += 6;
+      goto do1;
+    case 3:
+      a1 = ((op_t *) srcp1)[0];
+      len += 5;
+      goto do2;
+    case 4:
+      a0 = ((op_t *) srcp1)[0];
+      len += 4;
+      goto do3;
+    case 5:
+      a1 = ((op_t *) srcp1)[0];
+      len += 3;
+      goto do4;
+    case 6:
+      a0 = ((op_t *) srcp1)[0];
+      len += 2;
+      goto do5;
+    case 7:
+      a1 = ((op_t *) srcp1)[0];
+      len += 1;
+      goto do6;
+
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a0 = ((op_t *) srcp1)[0];
+      goto do7;
+    case 1:
+      a1 = ((op_t *) srcp1)[0];
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do8;			/* No-op.  */
+    }
+
+  do
+    {
+    do8:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do7:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do6:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do5:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do3:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do2:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do1:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+
+      len -= 8;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = a1;
+}
+
+/* _wordcopy_bwd_dest_aligned -- Copy block finishing right
+   before SRCP to block finishing right before DSTP with LEN `op_t'
+   words (not LEN bytes!).  DSTP should be aligned for memory
+   operations on `op_t', but SRCP must *not* be aligned.  */
+
+void
+_wordcopy_bwd_dest_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  void * srci;
+  op_t a0, a1, a2, a3;
+  op_t b0, b1, b2, b3;
+
+  /* Save the initial source pointer so we know the number of bytes to
+     shift for merging two unaligned results.  */
+  srci = (void *) srcp;
+
+  /* Make SRCP aligned by rounding it down to the beginning of the op_t
+     it points in the middle of.  */
+  srcp &= -OPSIZ;
+  srcp += OPSIZ;
+
+  switch (len % 4)
+    {
+    case 2:
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b2 = ((op_t *) srcp)[2];
+      b1 = a1 = ((op_t *) srcp)[1];
+      len += 2;
+      goto do1;
+    case 3:
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b3 = ((op_t *) srcp)[2];
+      b2 = a2 = ((op_t *) srcp)[1];
+      len += 1;
+      goto do2;
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b0 = ((op_t *) srcp)[2];
+      b3 = a3 = ((op_t *) srcp)[1];
+      goto do3;
+    case 1:
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b1 = ((op_t *) srcp)[2];
+      b0 = a0 = ((op_t *) srcp)[1];
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do4;			/* No-op.  */
+    }
+
+  do
+    {
+    do4:
+      b3 = a3 = ((op_t *) srcp)[0];
+      a0 = DBLALIGN (a0, b1, srci);
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do3:
+      b2 = a2 = ((op_t *) srcp)[0];
+      a3 = DBLALIGN (a3, b0, srci);
+      ((op_t *) dstp)[0] = a3;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do2:
+      b1 = a1 = ((op_t *) srcp)[0];
+      a2 = DBLALIGN (a2, b3, srci);
+      ((op_t *) dstp)[0] = a2;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do1:
+      b0 = a0 = ((op_t *) srcp)[0];
+      a1 = DBLALIGN (a1, b2, srci);
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+
+      len -= 4;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  a0 = DBLALIGN (a0, b1, srci);
+  ((op_t *) dstp)[0] = a0;
+}
-- 
1.7.10.3