From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 16539 invoked by alias); 3 Nov 2012 03:42:44 -0000 Received: (qmail 16531 invoked by uid 22791); 3 Nov 2012 03:42:43 -0000 X-SWARE-Spam-Status: No, hits=-1.6 required=5.0 tests=AWL,BAYES_00,DATE_IN_PAST_06_12,KHOP_SPAMHAUS_DROP,RP_MATCHES_RCVD,TW_BW,TW_CP,TW_EG X-Spam-Check-By: sourceware.org Received: from usmamail.tilera.com (HELO USMAMAIL.TILERA.COM) (12.216.194.151) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Sat, 03 Nov 2012 03:42:35 +0000 Received: from farm-0002.internal.tilera.com (10.2.0.32) by USMAEXCH2.tad.internal.tilera.com (10.3.0.33) with Microsoft SMTP Server (TLS) id 14.0.694.0; Fri, 2 Nov 2012 23:42:34 -0400 Received: (from cmetcalf@localhost) by farm-0002.internal.tilera.com (8.14.4/8.12.11/Submit) id qA33gX9m008629; Fri, 2 Nov 2012 23:42:33 -0400 Message-ID: <201211030342.qA33gX9m008629@farm-0002.internal.tilera.com> From: Chris Metcalf Date: Sat, 03 Nov 2012 03:42:00 -0000 Subject: [PATCH] Optimize tile (mostly tilegx) memcpy and memmove performance. To: MIME-Version: 1.0 Content-Type: text/plain X-IsSubscribed: yes Mailing-List: contact libc-ports-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: libc-ports-owner@sourceware.org X-SW-Source: 2012-11/txt/msg00007.txt.bz2 2012-11-02 Chris Metcalf * sysdeps/tile/tilegx/memcpy.c (__memcpy): Optimize. * sysdeps/tile/memcopy.h: New file. * sysdeps/tile/wordcopy.c: New file. - Override so we use full 8-byte word copies on tilegx32 for memmove, then use op_t in memcpy instead of the previous locally-defined word_t just to avoid proliferating identical types. - Fix bug in memcpy prefetch that caused us to never prefetch past the first cache line. - Optimize misaligned memcpy by inlining _wordcopy_fwd_dest_aligned instead of just doing a dumb word-at-a-time copy. - Make memcpy safe for forward copies by doing all the loads from a given cache line prior to doing a wh64 (cache line zero-fill) on the destination. Remove now-redundant src == dst check. - Copy and optimize the generic wordcopy.c routines to use the tile "double align" instruction instead of the MERGE macro; to avoid offset addressing mode (which tile doesn't have) by rewriting the pointer math to load and store with a zero index; and to use post-increment addresses in the inner loops to improve scheduling. --- ports/sysdeps/tile/memcopy.h | 27 +++ ports/sysdeps/tile/tilegx/memcpy.c | 200 ++++++++++------ ports/sysdeps/tile/wordcopy.c | 449 ++++++++++++++++++++++++++++++++++++ 4 files changed, 615 insertions(+), 67 deletions(-) create mode 100644 ports/sysdeps/tile/memcopy.h create mode 100644 ports/sysdeps/tile/wordcopy.c diff --git a/ports/sysdeps/tile/memcopy.h b/ports/sysdeps/tile/memcopy.h new file mode 100644 index 0000000..2bc3fce --- /dev/null +++ b/ports/sysdeps/tile/memcopy.h @@ -0,0 +1,27 @@ +/* memcopy.h -- definitions for memory copy functions. Tile version. + Copyright (C) 2012 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +/* Support more efficient copying on tilegx32, which supports + long long as a native 64-bit type. */ +#if defined (__tilegx__) && __WORDSIZE == 32 +# undef op_t +# define op_t unsigned long long int +#endif diff --git a/ports/sysdeps/tile/tilegx/memcpy.c b/ports/sysdeps/tile/tilegx/memcpy.c index dd6e30d..5b015f3 100644 --- a/ports/sysdeps/tile/tilegx/memcpy.c +++ b/ports/sysdeps/tile/tilegx/memcpy.c @@ -19,11 +19,9 @@ #include #include #include +#include #include -/* Must be 8 bytes in size. */ -#define word_t uint64_t - /* How many cache lines ahead should we prefetch? */ #define PREFETCH_LINES_AHEAD 3 @@ -34,8 +32,8 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n) const char *__restrict src1 = (const char *) srcv; const char *__restrict src1_end; const char *__restrict prefetch; - word_t *__restrict dst8; /* 8-byte pointer to destination memory. */ - word_t final; /* Final bytes to write to trailing word, if any */ + op_t *__restrict dst8; /* 8-byte pointer to destination memory. */ + op_t final; /* Final bytes to write to trailing word, if any */ long i; if (n < 16) @@ -55,101 +53,169 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n) { __insn_prefetch (prefetch); prefetch += CHIP_L2_LINE_SIZE (); - prefetch = (prefetch > src1_end) ? prefetch : src1; + prefetch = (prefetch < src1_end) ? prefetch : src1; } /* Copy bytes until dst is word-aligned. */ - for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--) + for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--) *dst1++ = *src1++; /* 8-byte pointer to destination memory. */ - dst8 = (word_t *) dst1; + dst8 = (op_t *) dst1; - if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0)) + if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0)) { - /* Misaligned copy. Copy 8 bytes at a time, but don't bother - with other fanciness. - TODO: Consider prefetching and using wh64 as well. */ + /* Misaligned copy. Use glibc's _wordcopy_fwd_dest_aligned, but + inline it to avoid prologue/epilogue. TODO: Consider + prefetching and using wh64 as well. */ + void * srci; + op_t a0, a1, a2, a3; + long int dstp = (long int) dst1; + long int srcp = (long int) src1; + long int len = n / OPSIZ; - /* Create an aligned src8. */ - const word_t *__restrict src8 = - (const word_t *) ((uintptr_t) src1 & -sizeof (word_t)); - word_t b; + /* Save the initial source pointer so we know the number of + bytes to shift for merging two unaligned results. */ + srci = (void *) srcp; - word_t a = *src8++; - for (; n >= sizeof (word_t); n -= sizeof (word_t)) - { - b = *src8++; - a = __insn_dblalign (a, b, src1); - *dst8++ = a; - a = b; - } + /* Make SRCP aligned by rounding it down to the beginning of the + `op_t' it points in the middle of. */ + srcp &= -OPSIZ; + + switch (len % 4) + { + case 2: + a1 = ((op_t *) srcp)[0]; + a2 = ((op_t *) srcp)[1]; + len += 2; + srcp += 2 * OPSIZ; + goto do1; + case 3: + a0 = ((op_t *) srcp)[0]; + a1 = ((op_t *) srcp)[1]; + len += 1; + srcp += 2 * OPSIZ; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return dstv; + a3 = ((op_t *) srcp)[0]; + a0 = ((op_t *) srcp)[1]; + len += 0; + srcp += 2 * OPSIZ; + goto do3; + case 1: + a2 = ((op_t *) srcp)[0]; + a3 = ((op_t *) srcp)[1]; + srcp += 2 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do4; /* No-op. */ + } + do + { + do4: + a0 = ((op_t *) srcp)[0]; + a2 = __insn_dblalign (a2, a3, srci); + ((op_t *) dstp)[0] = a2; + srcp += OPSIZ; + dstp += OPSIZ; + do3: + a1 = ((op_t *) srcp)[0]; + a3 = __insn_dblalign (a3, a0, srci); + ((op_t *) dstp)[0] = a3; + srcp += OPSIZ; + dstp += OPSIZ; + do2: + a2 = ((op_t *) srcp)[0]; + a0 = __insn_dblalign (a0, a1, srci); + ((op_t *) dstp)[0] = a0; + srcp += OPSIZ; + dstp += OPSIZ; + do1: + a3 = ((op_t *) srcp)[0]; + a1 = __insn_dblalign (a1, a2, srci); + ((op_t *) dstp)[0] = a1; + srcp += OPSIZ; + dstp += OPSIZ; + len -= 4; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci); + + n = n % OPSIZ; if (n == 0) - return dstv; + return dstv; - b = ((const char *) src8 <= src1_end) ? *src8 : 0; + a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0; - /* Final source bytes to write to trailing partial word, if any. */ - final = __insn_dblalign (a, b, src1); + final = __insn_dblalign (a3, a0, srci); + dst8 = (op_t *)(dstp + OPSIZ); } else { /* Aligned copy. */ - const word_t *__restrict src8 = (const word_t *) src1; + const op_t *__restrict src8 = (const op_t *) src1; /* src8 and dst8 are both word-aligned. */ if (n >= CHIP_L2_LINE_SIZE ()) { /* Copy until 'dst' is cache-line-aligned. */ for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1); - n -= sizeof (word_t)) + n -= sizeof (op_t)) *dst8++ = *src8++; - /* If copying to self, return. The test is cheap enough - that we do it despite the fact that the memcpy() contract - doesn't require us to support overlapping dst and src. - This is the most common case of overlap, and any close - overlap will cause corruption due to the wh64 below. - This case is particularly important since the compiler - will emit memcpy() calls for aggregate copies even if it - can't prove that src != dst. */ - if (__builtin_expect (dst8 == src8, 0)) - return dstv; - for (; n >= CHIP_L2_LINE_SIZE ();) - { - __insn_wh64 (dst8); - - /* Prefetch and advance to next line to prefetch, but - don't go past the end. */ - __insn_prefetch (prefetch); - prefetch += CHIP_L2_LINE_SIZE (); - prefetch = (prefetch > src1_end) ? prefetch : - (const char *) src8; - - /* Copy an entire cache line. Manually unrolled to - avoid idiosyncracies of compiler unrolling. */ -#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; }) - COPY_WORD (0); - COPY_WORD (1); - COPY_WORD (2); - COPY_WORD (3); - COPY_WORD (4); - COPY_WORD (5); - COPY_WORD (6); - COPY_WORD (7); + { + op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + /* Prefetch and advance to next line to prefetch, but + don't go past the end. */ + __insn_prefetch (prefetch); + prefetch += CHIP_L2_LINE_SIZE (); + prefetch = (prefetch < src1_end) ? prefetch : + (const char *) src8; + + /* Do all the loads before wh64. This is necessary if + [src8, src8+7] and [dst8, dst8+7] share the same + cache line and dst8 <= src8, as can be the case when + called from memmove, or with code tested on x86 whose + memcpy always works with forward copies. */ + tmp0 = *src8++; + tmp1 = *src8++; + tmp2 = *src8++; + tmp3 = *src8++; + tmp4 = *src8++; + tmp5 = *src8++; + tmp6 = *src8++; + tmp7 = *src8++; + + __insn_wh64 (dst8); + + *dst8++ = tmp0; + *dst8++ = tmp1; + *dst8++ = tmp2; + *dst8++ = tmp3; + *dst8++ = tmp4; + *dst8++ = tmp5; + *dst8++ = tmp6; + *dst8++ = tmp7; + + n -= 64; + } #if CHIP_L2_LINE_SIZE() != 64 # error "Fix code that assumes particular L2 cache line size." #endif - - dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t); - src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t); - } } - for (; n >= sizeof (word_t); n -= sizeof (word_t)) + for (; n >= sizeof (op_t); n -= sizeof (op_t)) *dst8++ = *src8++; if (__builtin_expect (n == 0, 1)) diff --git a/ports/sysdeps/tile/wordcopy.c b/ports/sysdeps/tile/wordcopy.c new file mode 100644 index 0000000..f978d8f --- /dev/null +++ b/ports/sysdeps/tile/wordcopy.c @@ -0,0 +1,449 @@ +/* wordcopy.c -- subroutines for memory copy functions. Tile version. + Copyright (C) 1991-2012 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* To optimize for tile, we make the following changes from the + default glibc version: + - Use the double align instruction instead of the MERGE macro. + - Since we don't have offset addressing mode, make sure the loads / + stores in the inner loop always have indices of 0. + - Use post-increment addresses in the inner loops, which yields + better scheduling. */ + +/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...! */ + +#include +#include + +/* Provide the appropriate dblalign builtin to shift two registers + based on the alignment of a pointer held in a third register. */ +#ifdef __tilegx__ +#define DBLALIGN __insn_dblalign +#else +#define DBLALIGN __insn_dword_align +#endif + +/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). + Both SRCP and DSTP should be aligned for memory operations on `op_t's. */ + +void +_wordcopy_fwd_aligned (dstp, srcp, len) + long int dstp; + long int srcp; + size_t len; +{ + op_t a0, a1; + + switch (len % 8) + { + case 2: + a0 = ((op_t *) srcp)[0]; + srcp += OPSIZ; + len += 6; + goto do1; + case 3: + a1 = ((op_t *) srcp)[0]; + srcp += OPSIZ; + len += 5; + goto do2; + case 4: + a0 = ((op_t *) srcp)[0]; + srcp += OPSIZ; + len += 4; + goto do3; + case 5: + a1 = ((op_t *) srcp)[0]; + srcp += OPSIZ; + len += 3; + goto do4; + case 6: + a0 = ((op_t *) srcp)[0]; + srcp += OPSIZ; + len += 2; + goto do5; + case 7: + a1 = ((op_t *) srcp)[0]; + srcp += OPSIZ; + len += 1; + goto do6; + + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + a0 = ((op_t *) srcp)[0]; + srcp += OPSIZ; + goto do7; + case 1: + a1 = ((op_t *) srcp)[0]; + srcp += OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do8; /* No-op. */ + } + + do + { + do8: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + srcp += OPSIZ; + dstp += OPSIZ; + do7: + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a0; + srcp += OPSIZ; + dstp += OPSIZ; + do6: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + srcp += OPSIZ; + dstp += OPSIZ; + do5: + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a0; + srcp += OPSIZ; + dstp += OPSIZ; + do4: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + srcp += OPSIZ; + dstp += OPSIZ; + do3: + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a0; + srcp += OPSIZ; + dstp += OPSIZ; + do2: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + srcp += OPSIZ; + dstp += OPSIZ; + do1: + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a0; + srcp += OPSIZ; + dstp += OPSIZ; + + len -= 8; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[0] = a1; +} + +/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). + DSTP should be aligned for memory operations on `op_t's, but SRCP must + *not* be aligned. */ + +void +_wordcopy_fwd_dest_aligned (dstp, srcp, len) + long int dstp; + long int srcp; + size_t len; +{ + void * srci; + op_t a0, a1, a2, a3; + + /* Save the initial source pointer so we know the number of bytes to + shift for merging two unaligned results. */ + srci = (void *) srcp; + + /* Make SRCP aligned by rounding it down to the beginning of the `op_t' + it points in the middle of. */ + srcp &= -OPSIZ; + + switch (len % 4) + { + case 2: + a1 = ((op_t *) srcp)[0]; + a2 = ((op_t *) srcp)[1]; + len += 2; + srcp += 2 * OPSIZ; + goto do1; + case 3: + a0 = ((op_t *) srcp)[0]; + a1 = ((op_t *) srcp)[1]; + len += 1; + srcp += 2 * OPSIZ; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + a3 = ((op_t *) srcp)[0]; + a0 = ((op_t *) srcp)[1]; + len += 0; + srcp += 2 * OPSIZ; + goto do3; + case 1: + a2 = ((op_t *) srcp)[0]; + a3 = ((op_t *) srcp)[1]; + srcp += 2 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do4; /* No-op. */ + } + + do + { + do4: + a0 = ((op_t *) srcp)[0]; + a2 = DBLALIGN (a2, a3, srci); + ((op_t *) dstp)[0] = a2; + srcp += OPSIZ; + dstp += OPSIZ; + do3: + a1 = ((op_t *) srcp)[0]; + a3 = DBLALIGN (a3, a0, srci); + ((op_t *) dstp)[0] = a3; + srcp += OPSIZ; + dstp += OPSIZ; + do2: + a2 = ((op_t *) srcp)[0]; + a0 = DBLALIGN (a0, a1, srci); + ((op_t *) dstp)[0] = a0; + srcp += OPSIZ; + dstp += OPSIZ; + do1: + a3 = ((op_t *) srcp)[0]; + a1 = DBLALIGN (a1, a2, srci); + ((op_t *) dstp)[0] = a1; + srcp += OPSIZ; + dstp += OPSIZ; + len -= 4; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[0] = DBLALIGN (a2, a3, srci); +} + +/* _wordcopy_bwd_aligned -- Copy block finishing right before + SRCP to block finishing right before DSTP with LEN `op_t' words + (not LEN bytes!). Both SRCP and DSTP should be aligned for memory + operations on `op_t's. */ + +void +_wordcopy_bwd_aligned (dstp, srcp, len) + long int dstp; + long int srcp; + size_t len; +{ + op_t a0, a1; + long int srcp1; + + srcp1 = srcp - 1 * OPSIZ; + srcp -= 2 * OPSIZ; + dstp -= 1 * OPSIZ; + + switch (len % 8) + { + case 2: + a0 = ((op_t *) srcp1)[0]; + len += 6; + goto do1; + case 3: + a1 = ((op_t *) srcp1)[0]; + len += 5; + goto do2; + case 4: + a0 = ((op_t *) srcp1)[0]; + len += 4; + goto do3; + case 5: + a1 = ((op_t *) srcp1)[0]; + len += 3; + goto do4; + case 6: + a0 = ((op_t *) srcp1)[0]; + len += 2; + goto do5; + case 7: + a1 = ((op_t *) srcp1)[0]; + len += 1; + goto do6; + + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + a0 = ((op_t *) srcp1)[0]; + goto do7; + case 1: + a1 = ((op_t *) srcp1)[0]; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do8; /* No-op. */ + } + + do + { + do8: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + srcp -= OPSIZ; + dstp -= OPSIZ; + do7: + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a0; + srcp -= OPSIZ; + dstp -= OPSIZ; + do6: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + srcp -= OPSIZ; + dstp -= OPSIZ; + do5: + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a0; + srcp -= OPSIZ; + dstp -= OPSIZ; + do4: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + srcp -= OPSIZ; + dstp -= OPSIZ; + do3: + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a0; + srcp -= OPSIZ; + dstp -= OPSIZ; + do2: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + srcp -= OPSIZ; + dstp -= OPSIZ; + do1: + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a0; + srcp -= OPSIZ; + dstp -= OPSIZ; + + len -= 8; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[0] = a1; +} + +/* _wordcopy_bwd_dest_aligned -- Copy block finishing right + before SRCP to block finishing right before DSTP with LEN `op_t' + words (not LEN bytes!). DSTP should be aligned for memory + operations on `op_t', but SRCP must *not* be aligned. */ + +void +_wordcopy_bwd_dest_aligned (dstp, srcp, len) + long int dstp; + long int srcp; + size_t len; +{ + void * srci; + op_t a0, a1, a2, a3; + op_t b0, b1, b2, b3; + + /* Save the initial source pointer so we know the number of bytes to + shift for merging two unaligned results. */ + srci = (void *) srcp; + + /* Make SRCP aligned by rounding it down to the beginning of the op_t + it points in the middle of. */ + srcp &= -OPSIZ; + srcp += OPSIZ; + + switch (len % 4) + { + case 2: + srcp -= 3 * OPSIZ; + dstp -= 1 * OPSIZ; + b2 = ((op_t *) srcp)[2]; + b1 = a1 = ((op_t *) srcp)[1]; + len += 2; + goto do1; + case 3: + srcp -= 3 * OPSIZ; + dstp -= 1 * OPSIZ; + b3 = ((op_t *) srcp)[2]; + b2 = a2 = ((op_t *) srcp)[1]; + len += 1; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + srcp -= 3 * OPSIZ; + dstp -= 1 * OPSIZ; + b0 = ((op_t *) srcp)[2]; + b3 = a3 = ((op_t *) srcp)[1]; + goto do3; + case 1: + srcp -= 3 * OPSIZ; + dstp -= 1 * OPSIZ; + b1 = ((op_t *) srcp)[2]; + b0 = a0 = ((op_t *) srcp)[1]; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do4; /* No-op. */ + } + + do + { + do4: + b3 = a3 = ((op_t *) srcp)[0]; + a0 = DBLALIGN (a0, b1, srci); + ((op_t *) dstp)[0] = a0; + srcp -= OPSIZ; + dstp -= OPSIZ; + do3: + b2 = a2 = ((op_t *) srcp)[0]; + a3 = DBLALIGN (a3, b0, srci); + ((op_t *) dstp)[0] = a3; + srcp -= OPSIZ; + dstp -= OPSIZ; + do2: + b1 = a1 = ((op_t *) srcp)[0]; + a2 = DBLALIGN (a2, b3, srci); + ((op_t *) dstp)[0] = a2; + srcp -= OPSIZ; + dstp -= OPSIZ; + do1: + b0 = a0 = ((op_t *) srcp)[0]; + a1 = DBLALIGN (a1, b2, srci); + ((op_t *) dstp)[0] = a1; + srcp -= OPSIZ; + dstp -= OPSIZ; + + len -= 4; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + a0 = DBLALIGN (a0, b1, srci); + ((op_t *) dstp)[0] = a0; +} -- 1.7.10.3