string.c 6.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
/* A memcpy for CRIS.
   Copyright (C) 1994-2005 Axis Communications.
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. Neither the name of Axis Communications nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE.  */

/* FIXME: This file should really only be used for reference, because
   the result depends on gcc generating the code we expect rather than
   the code we describe.  An assembly file should be used instead.  */

#include <stddef.h>

/* Byte count at or above which memcpy uses its movem block-copy loop
   instead of ordinary code.  The break even between movem and move16
   is really at 38.7 * 2, but the block loop moves 44 bytes per
   iteration, so the threshold is rounded up to the next multiple of
   44; below it, ordinary code is used.  */
#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2)

/* No name ambiguities in this file, so ask the assembler to accept
   register names written without a prefix; the inline assembly in
   this file names registers such as r10 and sp bare.  */
__asm__ (".syntax no_register_prefix");

void *
memcpy(void *pdst, const void *psrc, size_t pn)
44
{
45
  /* Now we want the parameters put in special registers.
46
     Make sure the compiler is able to make something useful of this.
47
     As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
48

49 50
     If gcc was allright, it really would need no temporaries, and no
     stack space to save stuff on.  */
51 52

  register void *return_dst __asm__ ("r10") = pdst;
53 54
  register unsigned char *dst __asm__ ("r13") = pdst;
  register unsigned const char *src __asm__ ("r11") = psrc;
55 56 57 58 59 60 61 62 63 64 65
  register int n __asm__ ("r12") = pn;

  /* When src is aligned but not dst, this makes a few extra needless
     cycles.  I believe it would take as many to check that the
     re-alignment was unnecessary.  */
  if (((unsigned long) dst & 3) != 0
      /* Don't align if we wouldn't copy more than a few bytes; so we
	 don't have to check further for overflows.  */
      && n >= 3)
  {
    if ((unsigned long) dst & 1)
66 67 68 69 70 71
      {
	n--;
	*dst = *src;
	src++;
	dst++;
      }
72 73

    if ((unsigned long) dst & 2)
74 75 76 77 78 79
      {
	n -= 2;
	*(short *) dst = *(short *) src;
	src += 2;
	dst += 2;
      }
80 81
  }

82 83 84 85 86 87 88 89 90 91 92 93 94 95
  /* Decide which copying method to use.  */
  if (n >= MEMCPY_BY_BLOCK_THRESHOLD)
    {
      /* It is not optimal to tell the compiler about clobbering any
	 registers; that will move the saving/restoring of those registers
	 to the function prologue/epilogue, and make non-movem sizes
	 suboptimal.  */
      __asm__ volatile
	("\
	 ;; GCC does promise correct register allocations, but let's	\n\
	 ;; make sure it keeps its promises.				\n\
	 .ifnc %0-%1-%2,$r13-$r11-$r12					\n\
	 .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\"	\n\
	 .endif								\n\
96
									\n\
97 98 99 100
	 ;; Save the registers we'll use in the movem process		\n\
	 ;; on the stack.						\n\
	 subq	11*4,sp							\n\
	 movem	r10,[sp]						\n\
101
									\n\
102 103 104 105
	 ;; Now we've got this:						\n\
	 ;; r11 - src							\n\
	 ;; r13 - dst							\n\
	 ;; r12 - n							\n\
106
									\n\
107 108
	 ;; Update n for the first loop.				\n\
	 subq	 44,r12							\n\
109
0:									\n\
110 111 112 113 114 115 116 117 118 119
"
#ifdef __arch_common_v10_v32
	 /* Cater to branch offset difference between v32 and v10.  We
	    assume the branch below has an 8-bit offset.  */
"	 setf\n"
#endif
"	 movem	[r11+],r10						\n\
	 subq	44,r12							\n\
	 bge	 0b							\n\
	 movem	r10,[r13+]						\n\
120
									\n\
121 122
	 ;; Compensate for last loop underflowing n.			\n\
	 addq	44,r12							\n\
123
									\n\
124 125
	 ;; Restore registers from stack.				\n\
	 movem [sp+],r10"
126

127 128
	 /* Outputs.  */
	 : "=r" (dst), "=r" (src), "=r" (n)
129

130 131 132
	 /* Inputs.  */
	 : "0" (dst), "1" (src), "2" (n));
    }
133

134 135 136 137 138 139
  while (n >= 16)
    {
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
140

141 142
      n -= 16;
    }
143 144

  switch (n)
145
    {
146 147
    case 0:
      break;
148

149
    case 1:
150
      *dst = *src;
151
      break;
152

153
    case 2:
154
      *(short *) dst = *(short *) src;
155
      break;
156

157
    case 3:
158 159
      *(short *) dst = *(short *) src; dst += 2; src += 2;
      *dst = *src;
160
      break;
161

162
    case 4:
163
      *(long *) dst = *(long *) src;
164
      break;
165

166
    case 5:
167 168
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *dst = *src;
169
      break;
170

171
    case 6:
172 173
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(short *) dst = *(short *) src;
174
      break;
175

176
    case 7:
177 178 179
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(short *) dst = *(short *) src; dst += 2; src += 2;
      *dst = *src;
180
      break;
181

182
    case 8:
183 184
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src;
185
      break;
186

187
    case 9:
188 189 190
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *dst = *src;
191
      break;
192

193
    case 10:
194 195 196
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(short *) dst = *(short *) src;
197
      break;
198

199
    case 11:
200 201 202 203
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(short *) dst = *(short *) src; dst += 2; src += 2;
      *dst = *src;
204
      break;
205

206
    case 12:
207 208 209
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src;
210
      break;
211

212
    case 13:
213 214 215 216
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *dst = *src;
217
      break;
218

219
    case 14:
220 221 222 223
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(short *) dst = *(short *) src;
224
      break;
225

226
    case 15:
227 228 229 230 231
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(long *) dst = *(long *) src; dst += 4; src += 4;
      *(short *) dst = *(short *) src; dst += 2; src += 2;
      *dst = *src;
232
      break;
233
    }
234

235 236
  return return_dst;
}