Mirror of https://sourceware.org/git/glibc.git (synced 2025-11-03 20:53:13 +03:00)
			
		
		
		
I've moved the TILE-Gx and TILEPro ports to the main sysdeps hierarchy,
along with the linux-generic ports infrastructure.  Beyond the README
update, the move was just:
    git mv ports/sysdeps/tile sysdeps/tile
    git mv ports/sysdeps/unix/sysv/linux/tile \
      sysdeps/unix/sysv/linux/tile
    git mv ports/sysdeps/unix/sysv/linux/generic \
      sysdeps/unix/sysv/linux/generic
I updated the relevant ChangeLogs along the lines of the ARM move
in commit c6bfe5c4d7 and tested the 64-bit tilegx build to confirm that
there were no changes in "objdump -dr" output in the shared objects.
		
	
		
			
				
	
	
		
			273 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			273 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <memcopy.h>
#include <arch/chip.h>

/* How many cache lines ahead of the copy should the source be
   prefetched?  */
#define PREFETCH_LINES_AHEAD 3

void *
 | 
						|
__memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
 | 
						|
{
 | 
						|
  char *__restrict dst1 = (char *) dstv;
 | 
						|
  const char *__restrict src1 = (const char *) srcv;
 | 
						|
  const char *__restrict src1_end;
 | 
						|
  const char *__restrict prefetch;
 | 
						|
  op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
 | 
						|
  op_t final; /* Final bytes to write to trailing word, if any */
 | 
						|
  long i;
 | 
						|
 | 
						|
  if (n < 16)
 | 
						|
    {
 | 
						|
      for (; n; n--)
 | 
						|
        *dst1++ = *src1++;
 | 
						|
      return dstv;
 | 
						|
    }
 | 
						|
 | 
						|
  /* Locate the end of source memory we will copy.  Don't prefetch
 | 
						|
     past this.  */
 | 
						|
  src1_end = src1 + n - 1;
 | 
						|
 | 
						|
  /* Prefetch ahead a few cache lines, but not past the end. */
 | 
						|
  prefetch = src1;
 | 
						|
  for (i = 0; i < PREFETCH_LINES_AHEAD; i++)
 | 
						|
    {
 | 
						|
      __insn_prefetch (prefetch);
 | 
						|
      prefetch += CHIP_L2_LINE_SIZE ();
 | 
						|
      prefetch = (prefetch < src1_end) ? prefetch : src1;
 | 
						|
    }
 | 
						|
 | 
						|
  /* Copy bytes until dst is word-aligned. */
 | 
						|
  for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--)
 | 
						|
    *dst1++ = *src1++;
 | 
						|
 | 
						|
  /* 8-byte pointer to destination memory. */
 | 
						|
  dst8 = (op_t *) dst1;
 | 
						|
 | 
						|
  if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0))
 | 
						|
    {
 | 
						|
      /* Misaligned copy.  Use glibc's _wordcopy_fwd_dest_aligned, but
 | 
						|
         inline it to avoid prologue/epilogue.  TODO: Consider
 | 
						|
         prefetching and using wh64 as well.  */
 | 
						|
      void * srci;
 | 
						|
      op_t a0, a1, a2, a3;
 | 
						|
      long int dstp = (long int) dst1;
 | 
						|
      long int srcp = (long int) src1;
 | 
						|
      long int len = n / OPSIZ;
 | 
						|
 | 
						|
      /* Save the initial source pointer so we know the number of
 | 
						|
         bytes to shift for merging two unaligned results.  */
 | 
						|
      srci = (void *) srcp;
 | 
						|
 | 
						|
      /* Make SRCP aligned by rounding it down to the beginning of the
 | 
						|
         `op_t' it points in the middle of.  */
 | 
						|
      srcp &= -OPSIZ;
 | 
						|
 | 
						|
      switch (len % 4)
 | 
						|
	{
 | 
						|
	case 2:
 | 
						|
	  a1 = ((op_t *) srcp)[0];
 | 
						|
	  a2 = ((op_t *) srcp)[1];
 | 
						|
	  len += 2;
 | 
						|
	  srcp += 2 * OPSIZ;
 | 
						|
	  goto do1;
 | 
						|
	case 3:
 | 
						|
	  a0 = ((op_t *) srcp)[0];
 | 
						|
	  a1 = ((op_t *) srcp)[1];
 | 
						|
	  len += 1;
 | 
						|
	  srcp += 2 * OPSIZ;
 | 
						|
	  goto do2;
 | 
						|
	case 0:
 | 
						|
	  if (OP_T_THRES <= 3 * OPSIZ && len == 0)
 | 
						|
	    return dstv;
 | 
						|
	  a3 = ((op_t *) srcp)[0];
 | 
						|
	  a0 = ((op_t *) srcp)[1];
 | 
						|
	  len += 0;
 | 
						|
	  srcp += 2 * OPSIZ;
 | 
						|
	  goto do3;
 | 
						|
	case 1:
 | 
						|
	  a2 = ((op_t *) srcp)[0];
 | 
						|
	  a3 = ((op_t *) srcp)[1];
 | 
						|
	  srcp += 2 * OPSIZ;
 | 
						|
	  len -= 1;
 | 
						|
	  if (OP_T_THRES <= 3 * OPSIZ && len == 0)
 | 
						|
	    goto do0;
 | 
						|
	  goto do4;			/* No-op.  */
 | 
						|
	}
 | 
						|
 | 
						|
      do
 | 
						|
	{
 | 
						|
	do4:
 | 
						|
	  a0 = ((op_t *) srcp)[0];
 | 
						|
	  a2 = __insn_dblalign (a2, a3, srci);
 | 
						|
	  ((op_t *) dstp)[0] = a2;
 | 
						|
	  srcp += OPSIZ;
 | 
						|
	  dstp += OPSIZ;
 | 
						|
	do3:
 | 
						|
	  a1 = ((op_t *) srcp)[0];
 | 
						|
	  a3 = __insn_dblalign (a3, a0, srci);
 | 
						|
	  ((op_t *) dstp)[0] = a3;
 | 
						|
	  srcp += OPSIZ;
 | 
						|
	  dstp += OPSIZ;
 | 
						|
	do2:
 | 
						|
	  a2 = ((op_t *) srcp)[0];
 | 
						|
	  a0 = __insn_dblalign (a0, a1, srci);
 | 
						|
	  ((op_t *) dstp)[0] = a0;
 | 
						|
	  srcp += OPSIZ;
 | 
						|
	  dstp += OPSIZ;
 | 
						|
	do1:
 | 
						|
	  a3 = ((op_t *) srcp)[0];
 | 
						|
	  a1 = __insn_dblalign (a1, a2, srci);
 | 
						|
	  ((op_t *) dstp)[0] = a1;
 | 
						|
	  srcp += OPSIZ;
 | 
						|
	  dstp += OPSIZ;
 | 
						|
	  len -= 4;
 | 
						|
	}
 | 
						|
      while (len != 0);
 | 
						|
 | 
						|
      /* This is the right position for do0.  Please don't move
 | 
						|
         it into the loop.  */
 | 
						|
    do0:
 | 
						|
      ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci);
 | 
						|
 | 
						|
      n = n % OPSIZ;
 | 
						|
      if (n == 0)
 | 
						|
	return dstv;
 | 
						|
 | 
						|
      a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0;
 | 
						|
 | 
						|
      final = __insn_dblalign (a3, a0, srci);
 | 
						|
      dst8 = (op_t *)(dstp + OPSIZ);
 | 
						|
    }
 | 
						|
  else
 | 
						|
    {
 | 
						|
      /* Aligned copy. */
 | 
						|
 | 
						|
      const op_t *__restrict src8 = (const op_t *) src1;
 | 
						|
 | 
						|
      /* src8 and dst8 are both word-aligned. */
 | 
						|
      if (n >= CHIP_L2_LINE_SIZE ())
 | 
						|
        {
 | 
						|
          /* Copy until 'dst' is cache-line-aligned. */
 | 
						|
          for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
 | 
						|
               n -= sizeof (op_t))
 | 
						|
            *dst8++ = *src8++;
 | 
						|
 | 
						|
          for (; n >= CHIP_L2_LINE_SIZE ();)
 | 
						|
	    {
 | 
						|
	      op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 | 
						|
 | 
						|
	      /* Prefetch and advance to next line to prefetch, but
 | 
						|
		 don't go past the end.  */
 | 
						|
	      __insn_prefetch (prefetch);
 | 
						|
	      prefetch += CHIP_L2_LINE_SIZE ();
 | 
						|
	      prefetch = (prefetch < src1_end) ? prefetch :
 | 
						|
		(const char *) src8;
 | 
						|
 | 
						|
	      /* Do all the loads before wh64.  This is necessary if
 | 
						|
		 [src8, src8+7] and [dst8, dst8+7] share the same
 | 
						|
		 cache line and dst8 <= src8, as can be the case when
 | 
						|
		 called from memmove, or with code tested on x86 whose
 | 
						|
		 memcpy always works with forward copies.  */
 | 
						|
	      tmp0 = *src8++;
 | 
						|
	      tmp1 = *src8++;
 | 
						|
	      tmp2 = *src8++;
 | 
						|
	      tmp3 = *src8++;
 | 
						|
	      tmp4 = *src8++;
 | 
						|
	      tmp5 = *src8++;
 | 
						|
	      tmp6 = *src8++;
 | 
						|
	      tmp7 = *src8++;
 | 
						|
 | 
						|
	      __insn_wh64 (dst8);
 | 
						|
 | 
						|
	      *dst8++ = tmp0;
 | 
						|
	      *dst8++ = tmp1;
 | 
						|
	      *dst8++ = tmp2;
 | 
						|
	      *dst8++ = tmp3;
 | 
						|
	      *dst8++ = tmp4;
 | 
						|
	      *dst8++ = tmp5;
 | 
						|
	      *dst8++ = tmp6;
 | 
						|
	      *dst8++ = tmp7;
 | 
						|
 | 
						|
	      n -= 64;
 | 
						|
	    }
 | 
						|
#if CHIP_L2_LINE_SIZE() != 64
 | 
						|
# error "Fix code that assumes particular L2 cache line size."
 | 
						|
#endif
 | 
						|
        }
 | 
						|
 | 
						|
      for (; n >= sizeof (op_t); n -= sizeof (op_t))
 | 
						|
        *dst8++ = *src8++;
 | 
						|
 | 
						|
      if (__builtin_expect (n == 0, 1))
 | 
						|
        return dstv;
 | 
						|
 | 
						|
      final = *src8;
 | 
						|
    }
 | 
						|
 | 
						|
  /* n != 0 if we get here.  Write out any trailing bytes. */
 | 
						|
  dst1 = (char *) dst8;
 | 
						|
#ifndef __BIG_ENDIAN__
 | 
						|
  if (n & 4)
 | 
						|
    {
 | 
						|
      *(uint32_t *) dst1 = final;
 | 
						|
      dst1 += 4;
 | 
						|
      final >>= 32;
 | 
						|
      n &= 3;
 | 
						|
    }
 | 
						|
  if (n & 2)
 | 
						|
    {
 | 
						|
      *(uint16_t *) dst1 = final;
 | 
						|
      dst1 += 2;
 | 
						|
      final >>= 16;
 | 
						|
      n &= 1;
 | 
						|
    }
 | 
						|
  if (n)
 | 
						|
    *(uint8_t *) dst1 = final;
 | 
						|
#else
 | 
						|
  if (n & 4)
 | 
						|
    {
 | 
						|
      *(uint32_t *) dst1 = final >> 32;
 | 
						|
      dst1 += 4;
 | 
						|
    }
 | 
						|
  else
 | 
						|
    {
 | 
						|
      final >>= 32;
 | 
						|
    }
 | 
						|
  if (n & 2)
 | 
						|
    {
 | 
						|
      *(uint16_t *) dst1 = final >> 16;
 | 
						|
      dst1 += 2;
 | 
						|
    }
 | 
						|
  else
 | 
						|
    {
 | 
						|
      final >>= 16;
 | 
						|
    }
 | 
						|
  if (n & 1)
 | 
						|
    *(uint8_t *) dst1 = final >> 8;
 | 
						|
#endif
 | 
						|
 | 
						|
  return dstv;
 | 
						|
}
 | 
						|
weak_alias (__memcpy, memcpy)
 | 
						|
libc_hidden_builtin_def (memcpy)
 |