diff mbox

[1/3] lib: introduce some memory copy macros and functions

Message ID 4C7E2CA9.9060702@cn.fujitsu.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Miao Xie Sept. 1, 2010, 10:36 a.m. UTC
None
diff mbox

Patch

diff --git a/include/linux/memcopy.h b/include/linux/memcopy.h
new file mode 100644
index 0000000..9c65ac8
--- /dev/null
+++ b/include/linux/memcopy.h
@@ -0,0 +1,226 @@ 
+/*
+ * memcopy.h -- definitions for memory copy functions.  Generic C version.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2.1 of the License, or (at your option)
+ * any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+ * Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * The code is derived from the GNU C Library.
+ * Copyright (C) 1991, 1992, 1993, 1997, 2004 Free Software Foundation, Inc.
+ */
+#ifndef _LINUX_MEMCOPY_H_
+#define _LINUX_MEMCOPY_H_
+
+/*
+ * The strategy of the memory functions is:
+ *
+ *   1. Copy bytes until the destination pointer is aligned.
+ *
+ *   2. Copy words in unrolled loops.  If the source and destination
+ *      are not aligned in the same way, use word memory operations,
+ *      but shift and merge two read words before writing.
+ *
+ *   3. Copy the few remaining bytes.
+ *
+ * This is fast on processors that have at least 10 registers for
+ * allocation by GCC, and that can access memory at reg+const in one
+ * instruction.
+ */
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/byteorder.h>
+
+/*
+ * The macros defined in this file are:
+ *
+ * BYTE_COPY_FWD(dst_beg_ptr, src_beg_ptr, nbytes_to_copy)
+ * 
+ * BYTE_COPY_BWD(dst_end_ptr, src_end_ptr, nbytes_to_copy)
+ * 
+ * WORD_COPY_FWD(dst_beg_ptr, src_beg_ptr, nbytes_remaining, nbytes_to_copy)
+ * 
+ * WORD_COPY_BWD(dst_end_ptr, src_end_ptr, nbytes_remaining, nbytes_to_copy)
+ * 
+ * MERGE(old_word, sh_1, new_word, sh_2)
+ * 
+ * MEM_COPY_FWD(dst_beg_ptr, src_beg_ptr, nbytes_to_copy)
+ * 
+ * MEM_COPY_BWD(dst_end_ptr, src_end_ptr, nbytes_to_copy)
+ */
+
+#define OP_T_THRESHOLD	16
+
+/*
+ * Type to use for aligned memory operations.
+ * This should normally be the biggest type supported by a single load
+ * and store.
+ */
+#define	op_t	unsigned long int
+#define OPSIZ	(sizeof(op_t))
+
+/* Type to use for unaligned operations.  */
+typedef unsigned char byte;
+
+#ifndef MERGE
+# ifdef __LITTLE_ENDIAN
+#  define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
+# elif defined(__BIG_ENDIAN)
+#  define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2)))
+# else
+#  error "Macro MERGE() hasn't defined!"
+# endif
+#endif
+
+/*
+ * Copy exactly NBYTES bytes from SRC_BP to DST_BP,
+ * without any assumptions about alignment of the pointers.
+ */
+#ifndef BYTE_COPY_FWD
+#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes)				      \
+do {									      \
+	size_t __nbytes = (nbytes);					      \
+	while (__nbytes > 0) {						      \
+		byte __x = ((byte *) src_bp)[0];			      \
+		src_bp += 1;						      \
+		__nbytes -= 1;						      \
+		((byte *) dst_bp)[0] = __x;				      \
+		dst_bp += 1;						      \
+	}								      \
+} while (0)
+#endif
+
+/*
+ * Copy exactly NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR,
+ * beginning at the bytes right before the pointers and continuing towards
+ * smaller addresses.  Don't assume anything about alignment of the
+ * pointers.
+ */
+#ifndef BYTE_COPY_BWD
+#define BYTE_COPY_BWD(dst_ep, src_ep, nbytes)				      \
+do {									      \
+	size_t __nbytes = (nbytes);					      \
+	while (__nbytes > 0) {						      \
+		byte __x;						      \
+		src_ep -= 1;						      \
+		__x = ((byte *) src_ep)[0];				      \
+		dst_ep -= 1;						      \
+		__nbytes -= 1;						      \
+		((byte *) dst_ep)[0] = __x;				      \
+	}								      \
+} while (0)
+#endif
+/*
+ * Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with
+ * the assumption that DST_BP is aligned on an OPSIZ multiple.  If
+ * not all bytes could be easily copied, store remaining number of bytes
+ * in NBYTES_LEFT, otherwise store 0.
+ */
+extern void _wordcopy_fwd_aligned(long int, long int, size_t);
+extern void _wordcopy_fwd_dest_aligned(long int, long int, size_t);
+#ifndef WORD_COPY_FWD
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)		      \
+do {									      \
+	if (src_bp % OPSIZ == 0)					      \
+		_wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);     \
+	else								      \
+		_wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);\
+									      \
+	src_bp += (nbytes) & -OPSIZ;					      \
+	dst_bp += (nbytes) & -OPSIZ;					      \
+	(nbytes_left) = (nbytes) % OPSIZ;				      \
+} while (0)
+#endif
+
+/*
+ * Copy *up to* NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR,
+ * beginning at the words (of type op_t) right before the pointers and
+ * continuing towards smaller addresses.  May take advantage of that
+ * DST_END_PTR is aligned on an OPSIZ multiple.  If not all bytes could be
+ * easily copied, store remaining number of bytes in NBYTES_REMAINING,
+ * otherwise store 0.
+ */
+extern void _wordcopy_bwd_aligned(long int, long int, size_t);
+extern void _wordcopy_bwd_dest_aligned(long int, long int, size_t);
+#ifndef WORD_COPY_BWD
+#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes)		      \
+do {									      \
+	if (src_ep % OPSIZ == 0)					      \
+		_wordcopy_bwd_aligned (dst_ep, src_ep, (nbytes) / OPSIZ);     \
+	else								      \
+		_wordcopy_bwd_dest_aligned (dst_ep, src_ep, (nbytes) / OPSIZ);\
+									      \
+	src_ep -= (nbytes) & -OPSIZ;					      \
+	dst_ep -= (nbytes) & -OPSIZ;					      \
+	(nbytes_left) = (nbytes) % OPSIZ;				      \
+} while (0)
+#endif
+
+/* Copy memory from the beginning to the end */
+#ifndef MEM_COPY_FWD
+static __always_inline void mem_copy_fwd(unsigned long dstp,
+					unsigned long srcp,
+					size_t count)
+{
+	/* If there not too few bytes to copy, use word copy. */
+	if (count >= OP_T_THRESHOLD) {
+		/* Copy just a few bytes to make dstp aligned. */
+		count -= (-dstp) % OPSIZ;
+		BYTE_COPY_FWD(dstp, srcp, (-dstp) % OPSIZ);
+
+		/*
+		 * Copy from srcp to dstp taking advantage of the known
+		 * alignment of dstp. Number if bytes remaining is put in
+		 * the third argument.
+		 */
+		WORD_COPY_FWD(dstp, srcp, count, count);
+
+		/* Fall out and copy the tail. */
+	}
+
+	/* There are just a few bytes to copy. Use byte memory operations. */
+	BYTE_COPY_FWD(dstp, srcp, count);
+}
+#endif
+
+/* Copy memory from the end to the beginning.  */
+#ifndef MEM_COPY_BWD
+static __always_inline void mem_copy_bwd(unsigned long dstp,
+					unsigned long srcp,
+					size_t count)
+{
+	srcp += count;
+	dstp += count;
+
+	/* If there not too few bytes to copy, use word copy. */
+	if (count >= OP_T_THRESHOLD) {
+		/* Copy just a few bytes to make dstp aligned. */
+		count -= dstp % OPSIZ;
+		BYTE_COPY_BWD(dstp, srcp, dstp % OPSIZ);
+
+		/*
+		 * Copy from srcp to dstp taking advantage of the known
+		 * alignment of dstp. Number if bytes remaining is put in
+		 * the third argument.
+		 */
+		WORD_COPY_BWD(dstp, srcp, count, count);
+
+		/* Fall out and copy the tail. */
+	}
+
+	/* There are just a few bytes to copy. Use byte memory operations. */
+	BYTE_COPY_BWD (dstp, srcp, count);
+}
+#endif
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index e6a3763..10a319e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -12,7 +12,8 @@  lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o show_mem.o \
-	 is_single_threaded.o plist.o decompress.o flex_array.o
+	 is_single_threaded.o plist.o decompress.o flex_array.o \
+	 memcopy.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/memcopy.c b/lib/memcopy.c
new file mode 100644
index 0000000..70fb6b2
--- /dev/null
+++ b/lib/memcopy.c
@@ -0,0 +1,403 @@ 
+/*
+ * memcopy.c -- subroutines for memory copy functions.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2.1 of the License, or (at your option)
+ * any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+ * Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * The code is derived from the GNU C Library.
+ * Copyright (C) 1991, 1992, 1993, 1997, 2004 Free Software Foundation, Inc.
+ */
+
+/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...!  */
+
+#include <linux/memcopy.h>
+
+/*
+ * _wordcopy_fwd_aligned -- Copy block beginning at SRCP to block beginning
+ * at DSTP with LEN `op_t' words (not LEN bytes!).
+ * Both SRCP and DSTP should be aligned for memory operations on `op_t's.
+ */
+void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
+{
+	op_t a0, a1;
+
+	switch (len % 8) {
+	case 2:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 6 * OPSIZ;
+		dstp -= 7 * OPSIZ;
+		len += 6;
+		goto do1;
+	case 3:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 5 * OPSIZ;
+		dstp -= 6 * OPSIZ;
+		len += 5;
+		goto do2;
+	case 4:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 4 * OPSIZ;
+		dstp -= 5 * OPSIZ;
+		len += 4;
+		goto do3;
+	case 5:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 3 * OPSIZ;
+		dstp -= 4 * OPSIZ;
+		len += 3;
+		goto do4;
+	case 6:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 2 * OPSIZ;
+		dstp -= 3 * OPSIZ;
+		len += 2;
+		goto do5;
+	case 7:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 1 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		len += 1;
+		goto do6;
+	case 0:
+		if (OP_T_THRESHOLD <= 3 * OPSIZ && len == 0)
+			return;
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 0 * OPSIZ;
+		dstp -= 1 * OPSIZ;
+		goto do7;
+	case 1:
+		a1 = ((op_t *) srcp)[0];
+		srcp -=-1 * OPSIZ;
+		dstp -= 0 * OPSIZ;
+		len -= 1;
+		if (OP_T_THRESHOLD <= 3 * OPSIZ && len == 0)
+			goto do0;
+		goto do8;			/* No-op.  */
+	}
+
+	do {
+do8:
+		a0 = ((op_t *) srcp)[0];
+		((op_t *) dstp)[0] = a1;
+do7:
+		a1 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = a0;
+do6:
+		a0 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = a1;
+do5:
+		a1 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = a0;
+do4:
+		a0 = ((op_t *) srcp)[4];
+		((op_t *) dstp)[4] = a1;
+do3:
+		a1 = ((op_t *) srcp)[5];
+		((op_t *) dstp)[5] = a0;
+do2:
+		a0 = ((op_t *) srcp)[6];
+		((op_t *) dstp)[6] = a1;
+do1:
+		a1 = ((op_t *) srcp)[7];
+		((op_t *) dstp)[7] = a0;
+
+		srcp += 8 * OPSIZ;
+		dstp += 8 * OPSIZ;
+		len -= 8;
+	} while (len != 0);
+
+	/*
+	 * This is the right position for do0.  Please don't move it into
+	 * the loop.
+	 */
+do0:
+	((op_t *) dstp)[0] = a1;
+}
+
+/*
+ * _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to block
+ * beginning at DSTP with LEN `op_t' words (not LEN bytes!). DSTP should
+ * be aligned for memory operations on `op_t's, but SRCP must *not* be aligned.
+ */
+
+void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
+{
+	op_t a0, a1, a2, a3;
+	int sh_1, sh_2;
+
+	/*
+	 * Calculate how to shift a word read at the memory operation aligned
+	 * srcp to make it aligned for copy.
+	 */
+	sh_1 = 8 * (srcp % OPSIZ);
+	sh_2 = 8 * OPSIZ - sh_1;
+
+	/*
+	 * Make SRCP aligned by rounding it down to the beginning of the `op_t'
+	 * it points in the middle of.
+	 */
+	srcp &= -OPSIZ;
+
+	switch (len % 4) {
+	case 2:
+		a1 = ((op_t *) srcp)[0];
+		a2 = ((op_t *) srcp)[1];
+		srcp -= 1 * OPSIZ;
+		dstp -= 3 * OPSIZ;
+		len += 2;
+		goto do1;
+	case 3:
+		a0 = ((op_t *) srcp)[0];
+		a1 = ((op_t *) srcp)[1];
+		srcp -= 0 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		len += 1;
+		goto do2;
+	case 0:
+		if (OP_T_THRESHOLD <= 3 * OPSIZ && len == 0)
+			return;
+		a3 = ((op_t *) srcp)[0];
+		a0 = ((op_t *) srcp)[1];
+		srcp -=-1 * OPSIZ;
+		dstp -= 1 * OPSIZ;
+		len += 0;
+		goto do3;
+	case 1:
+		a2 = ((op_t *) srcp)[0];
+		a3 = ((op_t *) srcp)[1];
+		srcp -=-2 * OPSIZ;
+		dstp -= 0 * OPSIZ;
+		len -= 1;
+		if (OP_T_THRESHOLD <= 3 * OPSIZ && len == 0)
+			goto do0;
+		goto do4;			/* No-op. */
+	}
+
+	do {
+do4:
+		a0 = ((op_t *) srcp)[0];
+		((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
+do3:
+		a1 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2);
+do2:
+		a2 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2);
+do1:
+		a3 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2);
+
+		srcp += 4 * OPSIZ;
+		dstp += 4 * OPSIZ;
+		len -= 4;
+	} while (len != 0);
+
+	/*
+	 * This is the right position for do0.  Please don't move it into
+	 * the loop.
+	 */
+do0:
+	((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
+}
+
+/*
+ * _wordcopy_bwd_aligned -- Copy block finishing right before
+ * SRCP to block finishing right before DSTP with LEN `op_t' words (not LEN
+ * bytes!).  Both SRCP and DSTP should be aligned for memory operations
+ * on `op_t's.
+ */
+void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len)
+{
+	op_t a0, a1;
+
+	switch (len % 8) {
+	case 2:
+		srcp -= 2 * OPSIZ;
+		dstp -= 1 * OPSIZ;
+		a0 = ((op_t *) srcp)[1];
+		len += 6;
+		goto do1;
+	case 3:
+		srcp -= 3 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		a1 = ((op_t *) srcp)[2];
+		len += 5;
+		goto do2;
+	case 4:
+		srcp -= 4 * OPSIZ;
+		dstp -= 3 * OPSIZ;
+		a0 = ((op_t *) srcp)[3];
+		len += 4;
+		goto do3;
+	case 5:
+		srcp -= 5 * OPSIZ;
+		dstp -= 4 * OPSIZ;
+		a1 = ((op_t *) srcp)[4];
+		len += 3;
+		goto do4;
+	case 6:
+		srcp -= 6 * OPSIZ;
+		dstp -= 5 * OPSIZ;
+		a0 = ((op_t *) srcp)[5];
+		len += 2;
+		goto do5;
+	case 7:
+		srcp -= 7 * OPSIZ;
+		dstp -= 6 * OPSIZ;
+		a1 = ((op_t *) srcp)[6];
+		len += 1;
+		goto do6;
+	case 0:
+		if (OP_T_THRESHOLD <= 3 * OPSIZ && len == 0)
+			return;
+		srcp -= 8 * OPSIZ;
+		dstp -= 7 * OPSIZ;
+		a0 = ((op_t *) srcp)[7];
+		goto do7;
+	case 1:
+		srcp -= 9 * OPSIZ;
+		dstp -= 8 * OPSIZ;
+		a1 = ((op_t *) srcp)[8];
+		len -= 1;
+		if (OP_T_THRESHOLD <= 3 * OPSIZ && len == 0)
+			goto do0;
+		goto do8;			/* No-op.  */
+	}
+
+	do {
+do8:
+		a0 = ((op_t *) srcp)[7];
+		((op_t *) dstp)[7] = a1;
+do7:
+		a1 = ((op_t *) srcp)[6];
+		((op_t *) dstp)[6] = a0;
+do6:
+		a0 = ((op_t *) srcp)[5];
+		((op_t *) dstp)[5] = a1;
+do5:
+		a1 = ((op_t *) srcp)[4];
+		((op_t *) dstp)[4] = a0;
+do4:
+		a0 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = a1;
+do3:
+		a1 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = a0;
+do2:
+		a0 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = a1;
+do1:
+		a1 = ((op_t *) srcp)[0];
+		((op_t *) dstp)[0] = a0;
+
+		srcp -= 8 * OPSIZ;
+		dstp -= 8 * OPSIZ;
+		len -= 8;
+	} while (len != 0);
+
+	/*
+	 * This is the right position for do0.  Please don't move it into
+	 * the loop.
+	 */
+do0:
+	((op_t *) dstp)[7] = a1;
+}
+
+/*
+ * _wordcopy_bwd_dest_aligned -- Copy block finishing right before SRCP to
+ * block finishing right before DSTP with LEN `op_t' words (not LEN bytes!).
+ * DSTP should be aligned for memory operations on `op_t', but SRCP must *not*
+ * be aligned.
+ */
+void _wordcopy_bwd_dest_aligned (long int dstp, long int srcp, size_t len)
+{
+	op_t a0, a1, a2, a3;
+	int sh_1, sh_2;
+
+	/*
+	 * Calculate how to shift a word read at the memory operation aligned
+	 * srcp to make it aligned for copy.
+	 */
+
+	sh_1 = 8 * (srcp % OPSIZ);
+	sh_2 = 8 * OPSIZ - sh_1;
+
+	/*
+	 * Make srcp aligned by rounding it down to the beginning of the op_t
+	 * it points in the middle of.
+	 */
+	srcp &= -OPSIZ;
+	srcp += OPSIZ;
+
+	switch (len % 4) {
+	case 2:
+		srcp -= 3 * OPSIZ;
+		dstp -= 1 * OPSIZ;
+		a2 = ((op_t *) srcp)[2];
+		a1 = ((op_t *) srcp)[1];
+		len += 2;
+		goto do1;
+	case 3:
+		srcp -= 4 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		a3 = ((op_t *) srcp)[3];
+		a2 = ((op_t *) srcp)[2];
+		len += 1;
+		goto do2;
+	case 0:
+		if (OP_T_THRESHOLD <= 3 * OPSIZ && len == 0)
+			return;
+		srcp -= 5 * OPSIZ;
+		dstp -= 3 * OPSIZ;
+		a0 = ((op_t *) srcp)[4];
+		a3 = ((op_t *) srcp)[3];
+		goto do3;
+	case 1:
+		srcp -= 6 * OPSIZ;
+		dstp -= 4 * OPSIZ;
+		a1 = ((op_t *) srcp)[5];
+		a0 = ((op_t *) srcp)[4];
+		len -= 1;
+		if (OP_T_THRESHOLD <= 3 * OPSIZ && len == 0)
+			goto do0;
+		goto do4;			/* No-op.  */
+	}
+
+	do {
+do4:
+		a3 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = MERGE (a0, sh_1, a1, sh_2);
+do3:
+		a2 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = MERGE (a3, sh_1, a0, sh_2);
+do2:
+		a1 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = MERGE (a2, sh_1, a3, sh_2);
+do1:
+		a0 = ((op_t *) srcp)[0];
+		((op_t *) dstp)[0] = MERGE (a1, sh_1, a2, sh_2);
+
+		srcp -= 4 * OPSIZ;
+		dstp -= 4 * OPSIZ;
+		len -= 4;
+	} while (len != 0);
+
+	/*
+	 * This is the right position for do0.  Please don't move it into
+	 * the loop.
+	 */
+do0:
+	((op_t *) dstp)[3] = MERGE (a0, sh_1, a1, sh_2);
+}
+