ARMEBS4  revision-26.06.2015
memcpy.c
Go to the documentation of this file.
1 /**
2  * \file memcpy.c
3  *
4  * This file provides a faster alternative to the memcpy function provided
5  * by newlib < 1.19.
6  * Newlib versions from 1.20 onward use a handwritten, faster version.
7  */
8 
9 #if 0
10 /* Copyright (c) 2009 CodeSourcery, Inc.
11  * All rights reserved.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions are met:
15  * * Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  * * Redistributions in binary form must reproduce the above copyright
18  * notice, this list of conditions and the following disclaimer in the
19  * documentation and/or other materials provided with the distribution.
20  * * Neither the name of CodeSourcery nor the
21  * names of its contributors may be used to endorse or promote products
22  * derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
25  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
26  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27  * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
28  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
31  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 #include <string.h>
37 #include <stdint.h>
38 #include <stddef.h>
39 
/*
 * Building blocks for the word-at-a-time copy loops.
 *
 * Each *_REF(ADDRESS, OFFSET) macro yields an lvalue of the given width
 * located OFFSET bytes past ADDRESS.  Each *_COPY(OUT, IN, OFFSET) macro
 * copies one such element from IN to OUT at the same byte offset.
 *
 * Every replacement list is wrapped in its own parentheses so that an
 * expansion can be embedded in a larger expression (for example followed
 * by `& mask` or preceded by `2 *`) without operator-precedence surprises.
 */

/* Standard operations for word-sized values. */
#define WORD_REF(ADDRESS, OFFSET) \
  (*((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))
#define WORD_COPY(OUT, IN, OFFSET) \
  (WORD_REF(OUT, OFFSET) = WORD_REF(IN, OFFSET))

/* Same operations for double-word values.  DWORD_TYPE and DWORD_SIZE are
   only defined in the 32-bit fallback configuration below, so these two
   macros must not be expanded in the NEON or 64-bit configurations. */
#define DWORD_REF(ADDRESS, OFFSET) \
  (*((DWORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))

#define DWORD_COPY(OUT, IN, OFFSET) \
  (DWORD_REF(OUT, OFFSET) = DWORD_REF(IN, OFFSET))

/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these. */
#if defined(__ARM_NEON__)
  #include <arm_neon.h>

  #define WORD_TYPE uint8x16_t
  #define WORD_SIZE 16
  #define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to
   be slower. */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
  || defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
  #define WORD_TYPE uint64_t
  #define WORD_SIZE 8
  #define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On everything else, we use 32-bit loads and stores, and
   do not use prefetching (MAYBE_PREFETCH expands to nothing). */
#else
  #define WORD_TYPE uint32_t
  #define WORD_SIZE 4
  #define MAYBE_PREFETCH(IN)
  #define DWORD_TYPE uint64_t
  #define DWORD_SIZE 8
#endif

/* On all ARM platforms, 'SHORTWORD' is a 32-bit value. */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4
#define SHORTWORD_REF(ADDRESS, OFFSET) \
  (*((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))
#define SHORTWORD_COPY(OUT, IN, OFFSET) \
  (SHORTWORD_REF(OUT, OFFSET) = SHORTWORD_REF(IN, OFFSET))

/* SHORTWORD_SHIFT(IN0, IN1, OFFSET) builds one shortword out of two
   consecutive shortwords IN0 and IN1, skipping the first OFFSET bytes of
   IN0 and taking the first OFFSET bytes of IN1.
   Shifting directionality depends on endianness.
   OFFSET must be in the range 1 .. SHORTWORD_SIZE - 1: with OFFSET == 0
   the second shift would be by the full width of the type, which is
   undefined behavior in C. */
#ifdef __ARMEB__
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) << ((OFFSET)*8)) | ((IN1) >> (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#else
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) >> ((OFFSET)*8)) | ((IN1) << (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#endif
96 
/*
 * memcpy - copy N bytes from IN to OUT and return the original value of OUT.
 *
 * Outline of the code below:
 *   - Size-optimized builds (PREFER_SIZE_OVER_SPEED or __OPTIMIZE_SIZE__)
 *     use a plain byte loop.
 *   - Otherwise, copies shorter than SHORTWORD_SIZE are done bytewise and
 *     return at once; longer ones first advance OUT bytewise until it is
 *     SHORTWORD_SIZE-aligned.
 *   - If IN is then SHORTWORD_SIZE-aligned too, data is moved in
 *     WORD_SIZE / SHORTWORD_SIZE units through unrolled loops.
 *   - If IN is not, aligned source shortwords are loaded and recombined
 *     with SHORTWORD_SHIFT before being stored.
 *   - A final byte loop copies whatever remains.
 *
 * OUT and IN are advanced with ++ and += directly.  Assuming _PTR is
 * void * (TODO confirm against the newlib _ansi.h in use), this relies on
 * the GNU C extension that allows arithmetic on void pointers with an
 * element size of 1; ISO C does not define it.
 */
97 _PTR
98 _DEFUN (memcpy, (OUT, IN, N),
99  _PTR OUT _AND
100  _CONST _PTR IN _AND
101  size_t N)
102 {
  /* Remember the destination start: OUT itself is advanced while copying. */
103  void* OUT0 = OUT;
104 
105 #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  /* Size-optimized variant: one byte per iteration, nothing else. */
106  const char* OUT_end = (char*)OUT + N;
107  while ((char*)OUT < OUT_end) {
108  *((char*)OUT) = *((char*)IN);
109  OUT++;
110  IN++;
111  }
112 
113  return OUT0;
114 #else
115  /* Handle short copies (fewer than SHORTWORD_SIZE bytes) bytewise and
     return immediately.  OUT has not been modified yet, so returning
     OUT is the same as returning OUT0. */
116  if (__builtin_expect(N < SHORTWORD_SIZE, 1)) {
117  size_t i = 0;
118  while (i < N) {
119  ((char*)OUT)[i] = ((char*)IN)[i];
120  i++;
121  }
122  return OUT;
123  }
124 
125  const char* OUT_end = (char*)OUT + N;
126 
127  /* Align OUT to SHORTWORD_SIZE.  N >= SHORTWORD_SIZE at this point, so
     the at most SHORTWORD_SIZE - 1 bytes copied here never exceed N. */
128  while ((uintptr_t)OUT % SHORTWORD_SIZE != 0) {
129  *(char*) (OUT++) = *(char*) (IN++);
130  }
  /* Compiled out: DWORD_SIZE-aligned copy path with 16-times unrolling. */
131 #if 0
132  /* Align OUT to DWORD_SIZE. */
133  while ((uintptr_t)OUT % DWORD_SIZE != 0) {
134  *(char*) (OUT++) = *(char*) (IN++);
135  }
136 
137  if ((uintptr_t) IN % DWORD_SIZE == 0) {
138  while (OUT_end - (char*)OUT >= DWORD_SIZE * 16) {
139  DWORD_COPY(OUT, IN, 0);
140  DWORD_COPY(OUT, IN, DWORD_SIZE * 1);
141  DWORD_COPY(OUT, IN, DWORD_SIZE * 2);
142  DWORD_COPY(OUT, IN, DWORD_SIZE * 3);
143  DWORD_COPY(OUT, IN, DWORD_SIZE * 4);
144  DWORD_COPY(OUT, IN, DWORD_SIZE * 5);
145  DWORD_COPY(OUT, IN, DWORD_SIZE * 6);
146  DWORD_COPY(OUT, IN, DWORD_SIZE * 7);
147  DWORD_COPY(OUT, IN, DWORD_SIZE * 8);
148  DWORD_COPY(OUT, IN, DWORD_SIZE * 9);
149  DWORD_COPY(OUT, IN, DWORD_SIZE * 10);
150  DWORD_COPY(OUT, IN, DWORD_SIZE * 11);
151  DWORD_COPY(OUT, IN, DWORD_SIZE * 12);
152  DWORD_COPY(OUT, IN, DWORD_SIZE * 13);
153  DWORD_COPY(OUT, IN, DWORD_SIZE * 14);
154  DWORD_COPY(OUT, IN, DWORD_SIZE * 15);
155  OUT += DWORD_SIZE * 16;
156  IN += DWORD_SIZE * 16;
157  }
158  }
159 #endif
  /* OUT is SHORTWORD_SIZE-aligned from here on; branch on whether IN
     shares that alignment. */
160  if ((uintptr_t) IN % SHORTWORD_SIZE == 0) {
161 
162 #if WORD_SIZE > SHORTWORD_SIZE
163  /* Align OUT to WORD_SIZE in steps of SHORTWORD_SIZE. */
164  if (__builtin_expect(OUT_end - (char*)OUT >= WORD_SIZE, 0)) {
165  while ((uintptr_t)OUT % WORD_SIZE != 0) {
166  SHORTWORD_COPY(OUT, IN, 0);
167  OUT += SHORTWORD_SIZE;
168  IN += SHORTWORD_SIZE;
169  }
170 
171  if ((uintptr_t) IN % WORD_SIZE == 0) {
172 #endif /* WORD_SIZE > SHORTWORD_SIZE */
173 
174 #if defined(__ARM_NEON__)
175  /* Testing on Cortex-A8 indicates that the following idiom
176  produces faster assembly code when doing vector copies,
177  but not when doing regular copies. */
  /* In this variant OUT and IN stay fixed while the byte offset i
     advances; both pointers are bumped by i once the loops are done.
     N is reused to hold the number of bytes still to copy. */
178  size_t i = 0;
179  N = OUT_end - (char*)OUT;
180  MAYBE_PREFETCH(IN + 64);
181  MAYBE_PREFETCH(IN + 128);
182  MAYBE_PREFETCH(IN + 192);
183  if (N >= 640) {
184  MAYBE_PREFETCH(IN + 256);
185  MAYBE_PREFETCH(IN + 320);
186  MAYBE_PREFETCH(IN + 384);
187  MAYBE_PREFETCH(IN + 448);
188  MAYBE_PREFETCH(IN + 512);
189  MAYBE_PREFETCH(IN + 576);
190  MAYBE_PREFETCH(IN + 640);
191  MAYBE_PREFETCH(IN + 704);
192  /* We phrase the loop condition in this way so that the
193  i + WORD_SIZE * 16 value can be reused to increment i. */
194  while (i + WORD_SIZE * 16 <= N - 640) {
195  MAYBE_PREFETCH(IN + 768);
196  MAYBE_PREFETCH(IN + 832);
197  MAYBE_PREFETCH(IN + 896);
198  MAYBE_PREFETCH(IN + 960);
199  WORD_COPY(OUT, IN, i);
200  WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
201  WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
202  WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
203  WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
204  WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
205  WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
206  WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
207  WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
208  WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
209  WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
210  WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
211  WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
212  WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
213  WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
214  WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
215  i += WORD_SIZE * 16;
216  }
217  }
  /* Remaining data: 16 words, then 4 words, then 1 word at a time. */
218  while (i + WORD_SIZE * 16 <= N) {
219  WORD_COPY(OUT, IN, i);
220  WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
221  WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
222  WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
223  WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
224  WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
225  WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
226  WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
227  WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
228  WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
229  WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
230  WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
231  WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
232  WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
233  WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
234  WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
235  i += WORD_SIZE * 16;
236  }
237  while (i + WORD_SIZE * 4 <= N) {
238  WORD_COPY(OUT, IN, i);
239  WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
240  WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
241  WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
242  i += WORD_SIZE * 4;
243  }
244  while (i + WORD_SIZE <= N) {
245  WORD_COPY(OUT, IN, i);
246  i += WORD_SIZE;
247  }
248  OUT += i;
249  IN += i;
250 #else /* not defined(__ARM_NEON__) */
251  /* Note: 16-times unrolling is about 20% faster than 4-times
252  unrolling on both ARM Cortex-A8 and Cortex-M3. */
253  MAYBE_PREFETCH(IN + 64);
254  MAYBE_PREFETCH(IN + 128);
255  MAYBE_PREFETCH(IN + 192);
256  while (OUT_end - (char*)OUT >= WORD_SIZE * 16) {
257  MAYBE_PREFETCH(IN + 256);
258  MAYBE_PREFETCH(IN + 320);
259  WORD_COPY(OUT, IN, 0);
260  WORD_COPY(OUT, IN, WORD_SIZE * 1);
261  WORD_COPY(OUT, IN, WORD_SIZE * 2);
262  WORD_COPY(OUT, IN, WORD_SIZE * 3);
263  WORD_COPY(OUT, IN, WORD_SIZE * 4);
264  WORD_COPY(OUT, IN, WORD_SIZE * 5);
265  WORD_COPY(OUT, IN, WORD_SIZE * 6);
266  WORD_COPY(OUT, IN, WORD_SIZE * 7);
267  WORD_COPY(OUT, IN, WORD_SIZE * 8);
268  WORD_COPY(OUT, IN, WORD_SIZE * 9);
269  WORD_COPY(OUT, IN, WORD_SIZE * 10);
270  WORD_COPY(OUT, IN, WORD_SIZE * 11);
271  WORD_COPY(OUT, IN, WORD_SIZE * 12);
272  WORD_COPY(OUT, IN, WORD_SIZE * 13);
273  WORD_COPY(OUT, IN, WORD_SIZE * 14);
274  WORD_COPY(OUT, IN, WORD_SIZE * 15);
275  OUT += WORD_SIZE * 16;
276  IN += WORD_SIZE * 16;
277  }
278  while (WORD_SIZE * 4 <= OUT_end - (char*)OUT) {
279  WORD_COPY(OUT, IN, 0);
280  WORD_COPY(OUT, IN, WORD_SIZE * 1);
281  WORD_COPY(OUT, IN, WORD_SIZE * 2);
282  WORD_COPY(OUT, IN, WORD_SIZE * 3);
283  OUT += WORD_SIZE * 4;
284  IN += WORD_SIZE * 4;
285  }
286  while (WORD_SIZE <= OUT_end - (char*)OUT) {
287  WORD_COPY(OUT, IN, 0);
288  OUT += WORD_SIZE;
289  IN += WORD_SIZE;
290  }
291 #endif /* not defined(__ARM_NEON__) */
292 
293 #if WORD_SIZE > SHORTWORD_SIZE
294  } else { /* if IN is not WORD_SIZE aligned */
295  while (SHORTWORD_SIZE * 4 <= OUT_end - (char*)OUT) {
296  SHORTWORD_COPY(OUT, IN, 0);
297  SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 1);
298  SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 2);
299  SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 3);
300  OUT += SHORTWORD_SIZE * 4;
301  IN += SHORTWORD_SIZE * 4;
302  }
303  } /* end if IN is not WORD_SIZE aligned */
304  } /* end if N >= WORD_SIZE */
305 
  /* Shortword-sized leftovers of the aligned case. */
306  while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
307  SHORTWORD_COPY(OUT, IN, 0);
308  OUT += SHORTWORD_SIZE;
309  IN += SHORTWORD_SIZE;
310  }
311 #endif /* WORD_SIZE > SHORTWORD_SIZE */
312 
313  } else { /* if IN is not SHORTWORD_SIZE aligned */
  /* misalign is 1 .. SHORTWORD_SIZE - 1 here, because the zero case took
     the branch above. */
314  ptrdiff_t misalign = (uintptr_t)IN % SHORTWORD_SIZE;
315 
  /* temp1 holds the aligned shortword that contains the byte at IN; that
     load starts misalign bytes before IN.
     NOTE(review): the shortword loads in this branch touch bytes outside
     [IN, IN + N), although only within aligned shortwords that overlap
     the source - confirm this is acceptable on the target. */
316  SHORTWORD_TYPE temp1, temp2;
317  temp1 = SHORTWORD_REF(IN, -misalign);
318 
319  /* Benchmarking indicates that unrolling this loop doesn't
320  produce a measurable performance improvement on ARM. */
321  while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
322  IN += SHORTWORD_SIZE;
323  temp2 = SHORTWORD_REF(IN, -misalign);
324  SHORTWORD_REF(OUT, 0) = SHORTWORD_SHIFT(temp1, temp2, misalign);
325  temp1 = temp2;
326  OUT += SHORTWORD_SIZE;
327  }
328 
329  } /* end if IN is not SHORTWORD_SIZE aligned */
330 
  /* Tail: copy the bytes that did not fill a whole shortword. */
331  while ((char*)OUT < OUT_end) {
332  *((char*)OUT) = *((char*)IN);
333  OUT++;
334  IN++;
335  }
336 
337  return OUT0;
338 #endif
339 }
340 #endif
void * memcpy(void *dest, const void *src, size_t n)