5 files changed, 1 insertions, 537 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 9adfd76..c4211cb 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -7,7 +7,7 @@ lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
 	strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o
 
 lib-$(CONFIG_TILEGX) += memcpy_user_64.o
-lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o memcpy_tile64.o
+lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o
 lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o
 
 obj-$(CONFIG_MODULES) += exports.o
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 42eacb1..5d91d18 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -20,50 +20,12 @@
 #include <linux/atomic.h>
 #include <arch/chip.h>
 
-/* See <asm/atomic_32.h> */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/*
- * A block of memory containing locks for atomic ops. Each instance of this
- * struct will be homed on a different CPU.
- */
-struct atomic_locks_on_cpu {
-	int lock[ATOMIC_HASH_L2_SIZE];
-} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4)));
-
-static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool);
-
-/* The locks we'll use until __init_atomic_per_cpu is called. */
-static struct atomic_locks_on_cpu __initdata initial_atomic_locks;
-
-/* Hash into this vector to get a pointer to lock for the given atomic. */
-struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
-	__write_once = {
-	[0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks)
-};
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /* This page is remapped on startup to be hash-for-home. */
 int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 int *__atomic_hashed_lock(volatile void *v)
 {
 	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	unsigned long i =
-		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
-	unsigned long n = __insn_crc32_32(0, i);
-
-	/* Grab high bits for L1 index. */
-	unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT);
-	/* Grab low bits for L2 index. */
-	unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1);
-
-	return &atomic_lock_ptr[l1_index]->lock[l2_index];
-#else
 	/*
 	 * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.
 	 * Using mm works here because atomic_locks is page aligned.
@@ -72,26 +34,13 @@ int *__atomic_hashed_lock(volatile void *v)
 				      (unsigned long)atomic_locks,
 				      2, (ATOMIC_HASH_SHIFT + 2) - 1);
 	return (int *)ptr;
-#endif
 }
 
 #ifdef CONFIG_SMP
 /* Return whether the passed pointer is a valid atomic lock pointer. */
 static int is_atomic_lock(int *p)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	int i;
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-
-		if (p >= &atomic_lock_ptr[i]->lock[0] &&
-		    p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) {
-			return 1;
-		}
-	}
-	return 0;
-#else
 	return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE];
-#endif
 }
 
 void __atomic_fault_unlock(int *irqlock_word)
@@ -210,43 +159,6 @@ struct __get_user __atomic_bad_address(int __user *addr)
 
 void __init __init_atomic_per_cpu(void)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-	unsigned int i;
-	int actual_cpu;
-
-	/*
-	 * Before this is called from setup, we just have one lock for
-	 * all atomic objects/operations.  Here we replace the
-	 * elements of atomic_lock_ptr so that they point at per_cpu
-	 * integers.  This seemingly over-complex approach stems from
-	 * the fact that DEFINE_PER_CPU defines an entry for each cpu
-	 * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1.  But
-	 * for efficient hashing of atomics to their locks we want a
-	 * compile time constant power of 2 for the size of this
-	 * table, so we use ATOMIC_HASH_SIZE.
-	 *
-	 * Here we populate atomic_lock_ptr from the per cpu
-	 * atomic_lock_pool, interspersing by actual cpu so that
-	 * subsequent elements are homed on consecutive cpus.
-	 */
-
-	actual_cpu = cpumask_first(cpu_possible_mask);
-
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-		/*
-		 * Preincrement to slightly bias against using cpu 0,
-		 * which has plenty of stuff homed on it already.
-		 */
-		actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask);
-		if (actual_cpu >= nr_cpu_ids)
-			actual_cpu = cpumask_first(cpu_possible_mask);
-
-		atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu);
-	}
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 	/* Validate power-of-two and "bigger than cpus" assumption */
 	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
 	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
@@ -270,6 +182,4 @@ void __init __init_atomic_per_cpu(void)
 	 * That should not produce more indices than ATOMIC_HASH_SIZE.
 	 */
 	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
-
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 }
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 8ba7626..a2771ae 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -22,14 +22,6 @@
 
 #include <linux/linkage.h>
 
-/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-#define memcpy __memcpy_asm
-#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
-#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
-#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
-#endif
-
 #define IS_MEMCPY	  0
 #define IS_COPY_FROM_USER  1
 #define IS_COPY_FROM_USER_ZEROING  2
@@ -159,12 +151,9 @@ EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
 
 	{ addi r3, r1, 60; andi r9, r9, -64 }
 
-#if CHIP_HAS_WH64()
 	/* No need to prefetch dst, we'll just do the wh64
 	 * right before we copy a line.
 	 */
-#endif
-
 EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, .; move r27, lr }
@@ -172,21 +161,6 @@ EX:	{ lw r6, r3; addi r3, r3, 64 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, . }
 EX:	{ lw r7, r3; addi r3, r3, 64 }
-#if !CHIP_HAS_WH64()
-	/* Prefetch the dest */
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	/* Use a real load to cause a TLB miss if necessary.  We aren't using
-	 * r28, so this should be fine.
-	 */
-EX:	{ lw r28, r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bz zero, .Lbig_loop2 }
 
@@ -287,13 +261,8 @@ EX:	{ lw r7, r3; addi r3, r3, 64 }
 	/* Fill second L1D line. */
 EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
 
-#if CHIP_HAS_WH64()
 	/* Prepare destination line for writing. */
 EX:	{ wh64 r9; addi r9, r9, 64 }
-#else
-	/* Prefetch dest line */
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Load seven words that are L1D hits to cover wh64 L2 usage. */
 
 	/* Load the three remaining words from the last L1D line, which
@@ -331,16 +300,7 @@ EX:	{ lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */
 EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
 EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
 EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
-#if CHIP_HAS_WH64()
 EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
-#else
-	/* Back up the r9 to a cache line we are already storing to
-	 * if it gets past the end of the dest vector.  Strictly speaking,
-	 * we don't need to back up to the start of a cache line, but it's free
-	 * and tidy, so why not?
-	 */
-EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
-#endif
 	/* Store second L1D line. */
 EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
 EX:	{ sw r0, r19; addi r0, r0, 4 }                  /* store(WORD_5) */
@@ -404,7 +364,6 @@ EX:	{ sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }
 
 .Ldest_is_word_aligned:
 
-#if CHIP_HAS_DWORD_ALIGN()
 EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4}
 	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
 
@@ -512,26 +471,6 @@ EX:	{ swadd r0, r13, 4; addi r2, r2, -32 }
 	/* Move r1 back to the point where it corresponds to r0. */
 	{ addi r1, r1, -4 }
 
-#else /* !CHIP_HAS_DWORD_ALIGN() */
-
-	/* Compute right/left shift counts and load initial source words. */
-	{ andi r5, r1, -4; andi r3, r1, 3 }
-EX:	{ lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
-EX:	{ lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
-
-	/* Load and store one word at a time, using shifts and ORs
-	 * to correct for the misaligned src.
-	 */
-.Lcopy_unaligned_src_loop:
-	{ shr r6, r6, r3; shl r8, r7, r4 }
-EX:	{ lw r7, r5; or r8, r8, r6; move r6, r7 }
-EX:	{ sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
-	{ addi r5, r5, 4; slti_u r8, r2, 8 }
-	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
-
-	{ bz r2, .Lcopy_unaligned_done }
-#endif /* !CHIP_HAS_DWORD_ALIGN() */
-
 	/* Fall through */
 
 /*
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
deleted file mode 100644
index 0290c22..0000000
--- a/arch/tile/lib/memcpy_tile64.c
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <asm/fixmap.h>
-#include <asm/kmap_types.h>
-#include <asm/tlbflush.h>
-#include <hv/hypervisor.h>
-#include <arch/chip.h>
-
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-
-/* Defined in memcpy.S */
-extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
-extern unsigned long __copy_to_user_inatomic_asm(
-	void __user *to, const void *from, unsigned long n);
-extern unsigned long __copy_from_user_inatomic_asm(
-	void *to, const void __user *from, unsigned long n);
-extern unsigned long __copy_from_user_zeroing_asm(
-	void *to, const void __user *from, unsigned long n);
-
-typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
-
-/* Size above which to consider TLB games for performance */
-#define LARGE_COPY_CUTOFF 2048
-
-/* Communicate to the simulator what we are trying to do. */
-#define sim_allow_multiple_caching(b) \
-  __insn_mtspr(SPR_SIM_CONTROL, \
-   SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
-
-/*
- * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
- *
- * We set up our own source and destination PTEs that we fully control.
- * This is the only way to guarantee that we don't race with another
- * thread that is modifying the PTE; we can't afford to try the
- * copy_{to,from}_user() technique of catching the interrupt, since
- * we must run with interrupts disabled to avoid the risk of some
- * other code seeing the incoherent data in our cache.  (Recall that
- * our cache is indexed by PA, so even if the other code doesn't use
- * our kmap_atomic virtual addresses, they'll still hit in cache using
- * the normal VAs that aren't supposed to hit in cache.)
- */
-static void memcpy_multicache(void *dest, const void *source,
-			      pte_t dst_pte, pte_t src_pte, int len)
-{
-	int idx;
-	unsigned long flags, newsrc, newdst;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	int type0, type1;
-	int cpu = smp_processor_id();
-
-	/*
-	 * Disable interrupts so that we don't recurse into memcpy()
-	 * in an interrupt handler, nor accidentally reference
-	 * the PA of the source from an interrupt routine.  Also
-	 * notify the simulator that we're playing games so we don't
-	 * generate spurious coherency warnings.
-	 */
-	local_irq_save(flags);
-	sim_allow_multiple_caching(1);
-
-	/* Set up the new dest mapping */
-	type0 = kmap_atomic_idx_push();
-	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
-	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
-	ptep = pte_offset_kernel(pmdp, newdst);
-	if (pte_val(*ptep) != pte_val(dst_pte)) {
-		set_pte(ptep, dst_pte);
-		local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
-	}
-
-	/* Set up the new source mapping */
-	type1 = kmap_atomic_idx_push();
-	idx += (type0 - type1);
-	src_pte = hv_pte_set_nc(src_pte);
-	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */
-	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
-	ptep = pte_offset_kernel(pmdp, newsrc);
-	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/* Actually move the data. */
-	__memcpy_asm((void *)newdst, (const void *)newsrc, len);
-
-	/*
-	 * Remap the source as locally-cached and not OLOC'ed so that
-	 * we can inval without also invaling the remote cpu's cache.
-	 * This also avoids known errata with inv'ing cacheable oloc data.
-	 */
-	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
-	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
-	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/*
-	 * Do the actual invalidation, covering the full L2 cache line
-	 * at the end since __memcpy_asm() is somewhat aggressive.
-	 */
-	__inv_buffer((void *)newsrc, len);
-
-	/*
-	 * We're done: notify the simulator that all is back to normal,
-	 * and re-enable interrupts and pre-emption.
-	 */
-	kmap_atomic_idx_pop();
-	kmap_atomic_idx_pop();
-	sim_allow_multiple_caching(0);
-	local_irq_restore(flags);
-}
-
-/*
- * Identify large copies from remotely-cached memory, and copy them
- * via memcpy_multicache() if they look good, otherwise fall back
- * to the particular kind of copying passed as the memcpy_t function.
- */
-static unsigned long fast_copy(void *dest, const void *source, int len,
-			       memcpy_t func)
-{
-	int cpu = get_cpu();
-	unsigned long retval;
-
-	/*
-	 * Check if it's big enough to bother with.  We may end up doing a
-	 * small copy via TLB manipulation if we're near a page boundary,
-	 * but presumably we'll make it up when we hit the second page.
-	 */
-	while (len >= LARGE_COPY_CUTOFF) {
-		int copy_size, bytes_left_on_page;
-		pte_t *src_ptep, *dst_ptep;
-		pte_t src_pte, dst_pte;
-		struct page *src_page, *dst_page;
-
-		/* Is the source page oloc'ed to a remote cpu? */
-retry_source:
-		src_ptep = virt_to_pte(current->mm, (unsigned long)source);
-		if (src_ptep == NULL)
-			break;
-		src_pte = *src_ptep;
-		if (!hv_pte_get_present(src_pte) ||
-		    !hv_pte_get_readable(src_pte) ||
-		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
-			break;
-		if (get_remote_cache_cpu(src_pte) == cpu)
-			break;
-		src_page = pfn_to_page(pte_pfn(src_pte));
-		get_page(src_page);
-		if (pte_val(src_pte) != pte_val(*src_ptep)) {
-			put_page(src_page);
-			goto retry_source;
-		}
-		if (pte_huge(src_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = pte_pfn(src_pte);
-			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			src_pte = pfn_pte(pfn, src_pte);
-			src_pte = pte_mksmall(src_pte);
-		}
-
-		/* Is the destination page writable? */
-retry_dest:
-		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
-		if (dst_ptep == NULL) {
-			put_page(src_page);
-			break;
-		}
-		dst_pte = *dst_ptep;
-		if (!hv_pte_get_present(dst_pte) ||
-		    !hv_pte_get_writable(dst_pte)) {
-			put_page(src_page);
-			break;
-		}
-		dst_page = pfn_to_page(pte_pfn(dst_pte));
-		if (dst_page == src_page) {
-			/*
-			 * Source and dest are on the same page; this
-			 * potentially exposes us to incoherence if any
-			 * part of src and dest overlap on a cache line.
-			 * Just give up rather than trying to be precise.
-			 */
-			put_page(src_page);
-			break;
-		}
-		get_page(dst_page);
-		if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
-			put_page(dst_page);
-			goto retry_dest;
-		}
-		if (pte_huge(dst_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = pte_pfn(dst_pte);
-			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			dst_pte = pfn_pte(pfn, dst_pte);
-			dst_pte = pte_mksmall(dst_pte);
-		}
-
-		/* All looks good: create a cachable PTE and copy from it */
-		copy_size = len;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);
-
-		/* Release the pages */
-		put_page(dst_page);
-		put_page(src_page);
-
-		/* Continue on the next page */
-		dest += copy_size;
-		source += copy_size;
-		len -= copy_size;
-	}
-
-	retval = func(dest, source, len);
-	put_cpu();
-	return retval;
-}
-
-void *memcpy(void *to, const void *from, __kernel_size_t n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return (void *)__memcpy_asm(to, from, n);
-	else
-		return (void *)fast_copy(to, from, n, __memcpy_asm);
-}
-
-unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
-				      unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_to_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
-					unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
-				       unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_zeroing_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
-}
-
-#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 9a7837d..2042bfe 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -23,11 +23,7 @@ void *memset(void *s, int c, size_t n)
 	int n32;
 	uint32_t v16, v32;
 	uint8_t *out8 = s;
-#if !CHIP_HAS_WH64()
-	int ahead32;
-#else
 	int to_align32;
-#endif
 
 	/* Experimentation shows that a trivial tight loop is a win up until
 	 * around a size of 20, where writing a word at a time starts to win.
@@ -58,21 +54,6 @@ void *memset(void *s, int c, size_t n)
 		return s;
 	}
 
-#if !CHIP_HAS_WH64()
-	/* Use a spare issue slot to start prefetching the first cache
-	 * line early. This instruction is free as the store can be buried
-	 * in otherwise idle issue slots doing ALU ops.
-	 */
-	__insn_prefetch(out8);
-
-	/* We prefetch the end so that a short memset that spans two cache
-	 * lines gets some prefetching benefit. Again we believe this is free
-	 * to issue.
-	 */
-	__insn_prefetch(&out8[n - 1]);
-#endif /* !CHIP_HAS_WH64() */
-
-
 	/* Align 'out8'. We know n >= 3 so this won't write past the end. */
 	while (((uintptr_t) out8 & 3) != 0) {
 		*out8++ = c;
@@ -93,90 +74,6 @@ void *memset(void *s, int c, size_t n)
 	/* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
 
-#if !CHIP_HAS_WH64()
-
-	ahead32 = CACHE_LINE_SIZE_IN_WORDS;
-
-	/* We already prefetched the first and last cache lines, so
-	 * we only need to do more prefetching if we are storing
-	 * to more than two cache lines.
-	 */
-	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
-		int i;
-
-		/* Prefetch the next several cache lines.
-		 * This is the setup code for the software-pipelined
-		 * loop below.
-		 */
-#define MAX_PREFETCH 5
-		ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
-		if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
-			ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
-
-		for (i = CACHE_LINE_SIZE_IN_WORDS;
-		     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
-			__insn_prefetch(&out32[i]);
-	}
-
-	if (n32 > ahead32) {
-		while (1) {
-			int j;
-
-			/* Prefetch by reading one word several cache lines
-			 * ahead.  Since loads are non-blocking this will
-			 * cause the full cache line to be read while we are
-			 * finishing earlier cache lines.  Using a store
-			 * here causes microarchitectural performance
-			 * problems where a victimizing store miss goes to
-			 * the head of the retry FIFO and locks the pipe for
-			 * a few cycles.  So a few subsequent stores in this
-			 * loop go into the retry FIFO, and then later
-			 * stores see other stores to the same cache line
-			 * are already in the retry FIFO and themselves go
-			 * into the retry FIFO, filling it up and grinding
-			 * to a halt waiting for the original miss to be
-			 * satisfied.
-			 */
-			__insn_prefetch(&out32[ahead32]);
-
-#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
-#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
-#endif
-
-			n32 -= CACHE_LINE_SIZE_IN_WORDS;
-
-			/* Save icache space by only partially unrolling
-			 * this loop.
-			 */
-			for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-			}
-
-			/* To save compiled code size, reuse this loop even
-			 * when we run out of prefetching to do by dropping
-			 * ahead32 down.
-			 */
-			if (n32 <= ahead32) {
-				/* Not even a full cache line left,
-				 * so stop now.
-				 */
-				if (n32 < CACHE_LINE_SIZE_IN_WORDS)
-					break;
-
-				/* Choose a small enough value that we don't
-				 * prefetch past the end.  There's no sense
-				 * in touching cache lines we don't have to.
-				 */
-				ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
-			}
-		}
-	}
-
-#else /* CHIP_HAS_WH64() */
-
 	/* Determine how many words we need to emit before the 'out32'
 	 * pointer becomes aligned modulo the cache line size.
 	 */
@@ -233,8 +130,6 @@ void *memset(void *s, int c, size_t n)
 		n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
 	}
 
-#endif /* CHIP_HAS_WH64() */
-
 	/* Now handle any leftover values. */
 	if (n32 != 0) {
 		do {