Merge git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile

Pull tile updates from Chris Metcalf: "These changes cover a range of new arch/tile features and optimizations. They've been through LKML review and on linux-next for a month or so. There's also one bug-fix that just missed 3.4, which I've marked for stable." Fixed up trivial conflict in arch/tile/Kconfig (new added tile Kconfig entries clashing with the generic timer/clockevents changes). * git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile: tile: default to tilegx_defconfig for ARCH=tile tile: fix bug where fls(0) was not returning 0 arch/tile: mark TILEGX as not EXPERIMENTAL tile/mm/fault.c: Port OOM changes to handle_page_fault arch/tile: add descriptive text if the kernel reports a bad trap arch/tile: allow querying cpu module information from the hypervisor arch/tile: fix hardwall for tilegx and generalize for idn and ipi arch/tile: support multiple huge page sizes dynamically mm: add new arch_make_huge_pte() method for tile support arch/tile: support kexec() for tilegx arch/tile: support <asm/cachectl.h> header for cacheflush() syscall arch/tile: Allow tilegx to build with either 16K or 64K page size arch/tile: optimize get_user/put_user and friends arch/tile: support building big-endian kernel arch/tile: allow building Linux with transparent huge pages enabled arch/tile: use interrupt critical sections less

Merge git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile
Pull tile updates from Chris Metcalf: "These changes cover a range of new arch/tile features and optimizations. They've been through LKML review and on linux-next for a month or so. There's also one bug-fix that just missed 3.4, which I've marked for stable." Fixed up trivial conflict in arch/tile/Kconfig (new added tile Kconfig entries clashing with the generic timer/clockevents changes). * git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile: tile: default to tilegx_defconfig for ARCH=tile tile: fix bug where fls(0) was not returning 0 arch/tile: mark TILEGX as not EXPERIMENTAL tile/mm/fault.c: Port OOM changes to handle_page_fault arch/tile: add descriptive text if the kernel reports a bad trap arch/tile: allow querying cpu module information from the hypervisor arch/tile: fix hardwall for tilegx and generalize for idn and ipi arch/tile: support multiple huge page sizes dynamically mm: add new arch_make_huge_pte() method for tile support arch/tile: support kexec() for tilegx arch/tile: support <asm/cachectl.h> header for cacheflush() syscall arch/tile: Allow tilegx to build with either 16K or 64K page size arch/tile: optimize get_user/put_user and friends arch/tile: support building big-endian kernel arch/tile: allow building Linux with transparent huge pages enabled arch/tile: use interrupt critical sections less
fa2af6e4 · Linus Torvalds · 109b9b04 · 1fcb78e9 · fa2af6e4 · fa2af6e4
74 changed file
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -48,6 +48,14 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK
 config SYS_SUPPORTS_HUGETLBFS
 	def_bool y

+# Support for additional huge page sizes besides HPAGE_SIZE.
+# The software support is currently only present in the TILE-Gx
+# hypervisor. TILEPro in any case does not support page sizes
+# larger than the default HPAGE_SIZE.
+config HUGETLB_SUPER_PAGES
+	depends on HUGETLB_PAGE && TILEGX
+	def_bool y
+
 # FIXME: tilegx can implement a more efficient rwsem.
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
@@ -107,16 +115,14 @@ config HVC_TILE
 	select HVC_DRIVER
 	def_bool y

-# Please note: TILE-Gx support is not yet finalized; this is
-# the preliminary support.  TILE-Gx drivers are only provided
-# with the alpha or beta test versions for Tilera customers.
 config TILEGX
-	depends on EXPERIMENTAL
 	bool "Building with TILE-Gx (64-bit) compiler and toolchain"

+config TILEPRO
+	def_bool !TILEGX
+
 config 64BIT
-	depends on TILEGX
-	def_bool y
+	def_bool TILEGX

 config ARCH_DEFCONFIG
 	string
@@ -137,6 +143,31 @@ config NR_CPUS
 	  smaller kernel memory footprint results from using a smaller
 	  value on chips with fewer tiles.

+if TILEGX
+
+choice
+	prompt "Kernel page size"
+	default PAGE_SIZE_64KB
+	help
+	  This lets you select the page size of the kernel.  For best
+	  performance on memory-intensive applications, a page size of 64KB
+	  is recommended.  For workloads involving many small files, many
+	  connections, etc., it may be better to select 16KB, which uses
+	  memory more efficiently at some cost in TLB performance.
+
+	  Note that this option is TILE-Gx specific; currently
+	  TILEPro page size is set by rebuilding the hypervisor.
+
+config PAGE_SIZE_16KB
+	bool "16KB"
+
+config PAGE_SIZE_64KB
+	bool "64KB"
+
+endchoice
+
+endif
+
 source "kernel/Kconfig.hz"

 config KEXEC

--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -34,7 +34,12 @@ LIBGCC_PATH     := \
  $(shell $(CC) $(KBUILD_CFLAGS) $(KCFLAGS) -print-libgcc-file-name)

 # Provide the path to use for "make defconfig".
-KBUILD_DEFCONFIG := $(ARCH)_defconfig
+# We default to the newer TILE-Gx architecture if only "tile" is given.
+ifeq ($(ARCH),tile)
+        KBUILD_DEFCONFIG := tilegx_defconfig
+else
+        KBUILD_DEFCONFIG := $(ARCH)_defconfig
+endif

 # Used as a file extension when useful, e.g. head_$(BITS).o
 # Not needed for (e.g.) "$(CC) -m32" since the compiler automatically

--- a/arch/tile/include/arch/spr_def_32.h
+++ b/arch/tile/include/arch/spr_def_32.h
@@ -65,6 +65,31 @@
 #define SPR_EX_CONTEXT_2_1__ICS_RMASK 0x1
 #define SPR_EX_CONTEXT_2_1__ICS_MASK  0x4
 #define SPR_FAIL 0x4e09
+#define SPR_IDN_AVAIL_EN 0x3e05
+#define SPR_IDN_CA_DATA 0x0b00
+#define SPR_IDN_DATA_AVAIL 0x0b03
+#define SPR_IDN_DEADLOCK_TIMEOUT 0x3406
+#define SPR_IDN_DEMUX_CA_COUNT 0x0a05
+#define SPR_IDN_DEMUX_COUNT_0 0x0a06
+#define SPR_IDN_DEMUX_COUNT_1 0x0a07
+#define SPR_IDN_DEMUX_CTL 0x0a08
+#define SPR_IDN_DEMUX_QUEUE_SEL 0x0a0a
+#define SPR_IDN_DEMUX_STATUS 0x0a0b
+#define SPR_IDN_DEMUX_WRITE_FIFO 0x0a0c
+#define SPR_IDN_DIRECTION_PROTECT 0x2e05
+#define SPR_IDN_PENDING 0x0a0e
+#define SPR_IDN_REFILL_EN 0x0e05
+#define SPR_IDN_SP_FIFO_DATA 0x0a0f
+#define SPR_IDN_SP_FIFO_SEL 0x0a10
+#define SPR_IDN_SP_FREEZE 0x0a11
+#define SPR_IDN_SP_FREEZE__SP_FRZ_MASK  0x1
+#define SPR_IDN_SP_FREEZE__DEMUX_FRZ_MASK  0x2
+#define SPR_IDN_SP_FREEZE__NON_DEST_EXT_MASK  0x4
+#define SPR_IDN_SP_STATE 0x0a12
+#define SPR_IDN_TAG_0 0x0a13
+#define SPR_IDN_TAG_1 0x0a14
+#define SPR_IDN_TAG_VALID 0x0a15
+#define SPR_IDN_TILE_COORD 0x0a16
 #define SPR_INTCTRL_0_STATUS 0x4a07
 #define SPR_INTCTRL_1_STATUS 0x4807
 #define SPR_INTCTRL_2_STATUS 0x4607
@@ -87,12 +112,36 @@
 #define SPR_INTERRUPT_MASK_SET_1_1 0x480e
 #define SPR_INTERRUPT_MASK_SET_2_0 0x460c
 #define SPR_INTERRUPT_MASK_SET_2_1 0x460d
+#define SPR_MPL_AUX_PERF_COUNT_SET_0 0x6000
+#define SPR_MPL_AUX_PERF_COUNT_SET_1 0x6001
+#define SPR_MPL_AUX_PERF_COUNT_SET_2 0x6002
 #define SPR_MPL_DMA_CPL_SET_0 0x5800
 #define SPR_MPL_DMA_CPL_SET_1 0x5801
 #define SPR_MPL_DMA_CPL_SET_2 0x5802
 #define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
 #define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
 #define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
+#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
+#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
+#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
+#define SPR_MPL_IDN_AVAIL_SET_0 0x3e00
+#define SPR_MPL_IDN_AVAIL_SET_1 0x3e01
+#define SPR_MPL_IDN_AVAIL_SET_2 0x3e02
+#define SPR_MPL_IDN_CA_SET_0 0x3a00
+#define SPR_MPL_IDN_CA_SET_1 0x3a01
+#define SPR_MPL_IDN_CA_SET_2 0x3a02
+#define SPR_MPL_IDN_COMPLETE_SET_0 0x1200
+#define SPR_MPL_IDN_COMPLETE_SET_1 0x1201
+#define SPR_MPL_IDN_COMPLETE_SET_2 0x1202
+#define SPR_MPL_IDN_FIREWALL_SET_0 0x2e00
+#define SPR_MPL_IDN_FIREWALL_SET_1 0x2e01
+#define SPR_MPL_IDN_FIREWALL_SET_2 0x2e02
+#define SPR_MPL_IDN_REFILL_SET_0 0x0e00
+#define SPR_MPL_IDN_REFILL_SET_1 0x0e01
+#define SPR_MPL_IDN_REFILL_SET_2 0x0e02
+#define SPR_MPL_IDN_TIMER_SET_0 0x3400
+#define SPR_MPL_IDN_TIMER_SET_1 0x3401
+#define SPR_MPL_IDN_TIMER_SET_2 0x3402
 #define SPR_MPL_INTCTRL_0_SET_0 0x4a00
 #define SPR_MPL_INTCTRL_0_SET_1 0x4a01
 #define SPR_MPL_INTCTRL_0_SET_2 0x4a02
@@ -102,6 +151,9 @@
 #define SPR_MPL_INTCTRL_2_SET_0 0x4600
 #define SPR_MPL_INTCTRL_2_SET_1 0x4601
 #define SPR_MPL_INTCTRL_2_SET_2 0x4602
+#define SPR_MPL_PERF_COUNT_SET_0 0x4200
+#define SPR_MPL_PERF_COUNT_SET_1 0x4201
+#define SPR_MPL_PERF_COUNT_SET_2 0x4202
 #define SPR_MPL_SN_ACCESS_SET_0 0x0800
 #define SPR_MPL_SN_ACCESS_SET_1 0x0801
 #define SPR_MPL_SN_ACCESS_SET_2 0x0802
@@ -181,6 +233,7 @@
 #define SPR_UDN_DEMUX_STATUS 0x0c0d
 #define SPR_UDN_DEMUX_WRITE_FIFO 0x0c0e
 #define SPR_UDN_DIRECTION_PROTECT 0x3005
+#define SPR_UDN_PENDING 0x0c10
 #define SPR_UDN_REFILL_EN 0x1005
 #define SPR_UDN_SP_FIFO_DATA 0x0c11
 #define SPR_UDN_SP_FIFO_SEL 0x0c12
@@ -195,6 +248,9 @@
 #define SPR_UDN_TAG_3 0x0c18
 #define SPR_UDN_TAG_VALID 0x0c19
 #define SPR_UDN_TILE_COORD 0x0c1a
+#define SPR_WATCH_CTL 0x4209
+#define SPR_WATCH_MASK 0x420a
+#define SPR_WATCH_VAL 0x420b

 #endif /* !defined(__ARCH_SPR_DEF_H__) */


--- a/arch/tile/include/arch/spr_def_64.h
+++ b/arch/tile/include/arch/spr_def_64.h
@@ -52,6 +52,13 @@
 #define SPR_EX_CONTEXT_2_1__ICS_RMASK 0x1
 #define SPR_EX_CONTEXT_2_1__ICS_MASK  0x4
 #define SPR_FAIL 0x2707
+#define SPR_IDN_AVAIL_EN 0x1a05
+#define SPR_IDN_DATA_AVAIL 0x0a80
+#define SPR_IDN_DEADLOCK_TIMEOUT 0x1806
+#define SPR_IDN_DEMUX_COUNT_0 0x0a05
+#define SPR_IDN_DEMUX_COUNT_1 0x0a06
+#define SPR_IDN_DIRECTION_PROTECT 0x1405
+#define SPR_IDN_PENDING 0x0a08
 #define SPR_ILL_TRANS_REASON__I_STREAM_VA_RMASK 0x1
 #define SPR_INTCTRL_0_STATUS 0x2505
 #define SPR_INTCTRL_1_STATUS 0x2405
@@ -88,9 +95,27 @@
 #define SPR_IPI_MASK_SET_0 0x1f0a
 #define SPR_IPI_MASK_SET_1 0x1e0a
 #define SPR_IPI_MASK_SET_2 0x1d0a
+#define SPR_MPL_AUX_PERF_COUNT_SET_0 0x2100
+#define SPR_MPL_AUX_PERF_COUNT_SET_1 0x2101
+#define SPR_MPL_AUX_PERF_COUNT_SET_2 0x2102
 #define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
 #define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
 #define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
+#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
+#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
+#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
+#define SPR_MPL_IDN_AVAIL_SET_0 0x1a00
+#define SPR_MPL_IDN_AVAIL_SET_1 0x1a01
+#define SPR_MPL_IDN_AVAIL_SET_2 0x1a02
+#define SPR_MPL_IDN_COMPLETE_SET_0 0x0500
+#define SPR_MPL_IDN_COMPLETE_SET_1 0x0501
+#define SPR_MPL_IDN_COMPLETE_SET_2 0x0502
+#define SPR_MPL_IDN_FIREWALL_SET_0 0x1400
+#define SPR_MPL_IDN_FIREWALL_SET_1 0x1401
+#define SPR_MPL_IDN_FIREWALL_SET_2 0x1402
+#define SPR_MPL_IDN_TIMER_SET_0 0x1800
+#define SPR_MPL_IDN_TIMER_SET_1 0x1801
+#define SPR_MPL_IDN_TIMER_SET_2 0x1802
 #define SPR_MPL_INTCTRL_0_SET_0 0x2500
 #define SPR_MPL_INTCTRL_0_SET_1 0x2501
 #define SPR_MPL_INTCTRL_0_SET_2 0x2502
@@ -100,6 +125,21 @@
 #define SPR_MPL_INTCTRL_2_SET_0 0x2300
 #define SPR_MPL_INTCTRL_2_SET_1 0x2301
 #define SPR_MPL_INTCTRL_2_SET_2 0x2302
+#define SPR_MPL_IPI_0 0x1f04
+#define SPR_MPL_IPI_0_SET_0 0x1f00
+#define SPR_MPL_IPI_0_SET_1 0x1f01
+#define SPR_MPL_IPI_0_SET_2 0x1f02
+#define SPR_MPL_IPI_1 0x1e04
+#define SPR_MPL_IPI_1_SET_0 0x1e00
+#define SPR_MPL_IPI_1_SET_1 0x1e01
+#define SPR_MPL_IPI_1_SET_2 0x1e02
+#define SPR_MPL_IPI_2 0x1d04
+#define SPR_MPL_IPI_2_SET_0 0x1d00
+#define SPR_MPL_IPI_2_SET_1 0x1d01
+#define SPR_MPL_IPI_2_SET_2 0x1d02
+#define SPR_MPL_PERF_COUNT_SET_0 0x2000
+#define SPR_MPL_PERF_COUNT_SET_1 0x2001
+#define SPR_MPL_PERF_COUNT_SET_2 0x2002
 #define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
 #define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
 #define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
@@ -167,6 +207,9 @@
 #define SPR_UDN_DEMUX_COUNT_2 0x0b07
 #define SPR_UDN_DEMUX_COUNT_3 0x0b08
 #define SPR_UDN_DIRECTION_PROTECT 0x1505
+#define SPR_UDN_PENDING 0x0b0a
+#define SPR_WATCH_MASK 0x200a
+#define SPR_WATCH_VAL 0x200b

 #endif /* !defined(__ARCH_SPR_DEF_H__) */


--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -2,6 +2,7 @@ include include/asm-generic/Kbuild.asm

 header-y += ../arch/

+header-y += cachectl.h
 header-y += ucontext.h
 header-y += hardwall.h

@@ -21,7 +22,6 @@ generic-y += ipcbuf.h
 generic-y += irq_regs.h
 generic-y += kdebug.h
 generic-y += local.h
-generic-y += module.h
 generic-y += msgbuf.h
 generic-y += mutex.h
 generic-y += param.h

--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -303,7 +303,14 @@ void __init_atomic_per_cpu(void);
 void __atomic_fault_unlock(int *lock_ptr);
 #endif

+/* Return a pointer to the lock for the given address. */
+int *__atomic_hashed_lock(volatile void *v);
+
 /* Private helper routines in lib/atomic_asm_32.S */
+struct __get_user {
+	unsigned long val;
+	int err;
+};
 extern struct __get_user __atomic_cmpxchg(volatile int *p,
 					  int *lock, int o, int n);
 extern struct __get_user __atomic_xchg(volatile int *p, int *lock, int n);
@@ -319,6 +326,9 @@ extern u64 __atomic64_xchg_add(volatile u64 *p, int *lock, u64 n);
 extern u64 __atomic64_xchg_add_unless(volatile u64 *p,
 				      int *lock, u64 o, u64 n);

+/* Return failure from the atomic wrappers. */
+struct __get_user __atomic_bad_address(int __user *addr);
+
 #endif /* !__ASSEMBLY__ */

 #endif /* _ASM_TILE_ATOMIC_32_H */
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@@ -77,6 +77,11 @@ static inline int ffs(int x)
 	return __builtin_ffs(x);
 }

+static inline int fls64(__u64 w)
+{
+	return (sizeof(__u64) * 8) - __builtin_clzll(w);
+}
+
 /**
 * fls - find last set bit in word
 * @x: the word to search
@@ -90,12 +95,7 @@ static inline int ffs(int x)
 */
 static inline int fls(int x)
 {
-	return (sizeof(int) * 8) - __builtin_clz(x);
-}
-
-static inline int fls64(__u64 w)
-{
-	return (sizeof(__u64) * 8) - __builtin_clzll(w);
+	return fls64((unsigned int) x);
 }

 static inline unsigned int __arch_hweight32(unsigned int w)

--- a/arch/tile/include/asm/byteorder.h
+++ b/arch/tile/include/asm/byteorder.h
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#if defined (__BIG_ENDIAN__)
+#include <linux/byteorder/big_endian.h>
+#elif defined (__LITTLE_ENDIAN__)
 #include <linux/byteorder/little_endian.h>
+#else
+#error "__BIG_ENDIAN__ or __LITTLE_ENDIAN__ must be defined."
+#endif
--- a/arch/tile/include/asm/cachectl.h
+++ b/arch/tile/include/asm/cachectl.h
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_CACHECTL_H
+#define _ASM_TILE_CACHECTL_H
+
+/*
+ * Options for cacheflush system call.
+ *
+ * The ICACHE flush is performed on all cores currently running the
+ * current process's address space.  The intent is for user
+ * applications to be able to modify code, invoke the system call,
+ * then allow arbitrary other threads in the same address space to see
+ * the newly-modified code.  Passing a length of CHIP_L1I_CACHE_SIZE()
+ * or more invalidates the entire icache on all cores in the address
+ * spaces.  (Note: currently this option invalidates the entire icache
+ * regardless of the requested address and length, but we may choose
+ * to honor the arguments at some point.)
+ *
+ * Flush and invalidation of memory can normally be performed with the
+ * __insn_flush(), __insn_inv(), and __insn_finv() instructions from
+ * userspace.  The DCACHE option to the system call allows userspace
+ * to flush the entire L1+L2 data cache from the core.  In this case,
+ * the address and length arguments are not used.  The DCACHE flush is
+ * restricted to the current core, not all cores in the address space.
+ */
+#define	ICACHE	(1<<0)		/* invalidate L1 instruction cache */
+#define	DCACHE	(1<<1)		/* flush and invalidate data cache */
+#define	BCACHE	(ICACHE|DCACHE)	/* flush both caches               */
+
+#endif	/* _ASM_TILE_CACHECTL_H */
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -242,9 +242,6 @@ long compat_sys_fallocate(int fd, int mode,
 long compat_sys_sched_rr_get_interval(compat_pid_t pid,
 				      struct compat_timespec __user *interval);

-/* Tilera Linux syscalls that don't have "compat" versions. */
-#define compat_sys_flush_cache sys_flush_cache
-
 /* These are the intvec_64.S trampolines. */
 long _compat_sys_execve(const char __user *path,
 			const compat_uptr_t __user *argv,

--- a/arch/tile/include/asm/elf.h
+++ b/arch/tile/include/asm/elf.h
@@ -44,7 +44,11 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
 #else
 #define ELF_CLASS	ELFCLASS32
 #endif
+#ifdef __BIG_ENDIAN__
+#define ELF_DATA	ELFDATA2MSB
+#else
 #define ELF_DATA	ELFDATA2LSB
+#endif

 /*
 * There seems to be a bug in how compat_binfmt_elf.c works: it
@@ -59,6 +63,7 @@ enum { ELF_ARCH = CHIP_ELF_TYPE() };
 */
 #define elf_check_arch(x)  \
 	((x)->e_ident[EI_CLASS] == ELF_CLASS && \
+	 (x)->e_ident[EI_DATA] == ELF_DATA && \
 	 (x)->e_machine == CHIP_ELF_TYPE())

 /* The module loader only handles a few relocation types. */

--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -28,29 +28,81 @@
 #include <linux/futex.h>
 #include <linux/uaccess.h>
 #include <linux/errno.h>
+#include <asm/atomic.h>

-extern struct __get_user futex_set(u32 __user *v, int i);
-extern struct __get_user futex_add(u32 __user *v, int n);
-extern struct __get_user futex_or(u32 __user *v, int n);
-extern struct __get_user futex_andn(u32 __user *v, int n);
-extern struct __get_user futex_cmpxchg(u32 __user *v, int o, int n);
+/*
+ * Support macros for futex operations.  Do not use these macros directly.
+ * They assume "ret", "val", "oparg", and "uaddr" in the lexical context.
+ * __futex_cmpxchg() additionally assumes "oldval".
+ */
+
+#ifdef __tilegx__
+
+#define __futex_asm(OP) \
+	asm("1: {" #OP " %1, %3, %4; movei %0, 0 }\n"		\
+	    ".pushsection .fixup,\"ax\"\n"			\
+	    "0: { movei %0, %5; j 9f }\n"			\
+	    ".section __ex_table,\"a\"\n"			\
+	    ".quad 1b, 0b\n"					\
+	    ".popsection\n"					\
+	    "9:"						\
+	    : "=r" (ret), "=r" (val), "+m" (*(uaddr))		\
+	    : "r" (uaddr), "r" (oparg), "i" (-EFAULT))
+
+#define __futex_set() __futex_asm(exch4)
+#define __futex_add() __futex_asm(fetchadd4)
+#define __futex_or() __futex_asm(fetchor4)
+#define __futex_andn() ({ oparg = ~oparg; __futex_asm(fetchand4); })
+#define __futex_cmpxchg() \
+	({ __insn_mtspr(SPR_CMPEXCH_VALUE, oldval); __futex_asm(cmpexch4); })
+
+#define __futex_xor()						\
+	({							\
+		u32 oldval, n = oparg;				\
+		if ((ret = __get_user(oldval, uaddr)) == 0) {	\
+			do {					\
+				oparg = oldval ^ n;		\
+				__futex_cmpxchg();		\
+			} while (ret == 0 && oldval != val);	\
+		}						\
+	})
+
+/* No need to prefetch, since the atomic ops go to the home cache anyway. */
+#define __futex_prolog()

-#ifndef __tilegx__
-extern struct __get_user futex_xor(u32 __user *v, int n);
 #else
-static inline struct __get_user futex_xor(u32 __user *uaddr, int n)
-{
-	struct __get_user asm_ret = __get_user_4(uaddr);
-	if (!asm_ret.err) {
-		int oldval, newval;
-		do {
-			oldval = asm_ret.val;
-			newval = oldval ^ n;
-			asm_ret = futex_cmpxchg(uaddr, oldval, newval);
-		} while (asm_ret.err == 0 && oldval != asm_ret.val);
+
+#define __futex_call(FN)						\
+	{								\
+		struct __get_user gu = FN((u32 __force *)uaddr, lock, oparg); \
+		val = gu.val;						\
+		ret = gu.err;						\
 	}
-	return asm_ret;
-}
+
+#define __futex_set() __futex_call(__atomic_xchg)
+#define __futex_add() __futex_call(__atomic_xchg_add)
+#define __futex_or() __futex_call(__atomic_or)
+#define __futex_andn() __futex_call(__atomic_andn)
+#define __futex_xor() __futex_call(__atomic_xor)
+
+#define __futex_cmpxchg()						\
+	{								\
+		struct __get_user gu = __atomic_cmpxchg((u32 __force *)uaddr, \
+							lock, oldval, oparg); \
+		val = gu.val;						\
+		ret = gu.err;						\
+	}
+
+/*
+ * Find the lock pointer for the atomic calls to use, and issue a
+ * prefetch to the user address to bring it into cache.  Similar to
+ * __atomic_setup(), but we can't do a read into the L1 since it might
+ * fault; instead we do a prefetch into the L2.
+ */
+#define __futex_prolog()					\
+	int *lock;						\
+	__insn_prefetch(uaddr);					\
+	lock = __atomic_hashed_lock((int __force *)uaddr)
 #endif

 static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
@@ -59,8 +111,12 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	int cmp = (encoded_op >> 24) & 15;
 	int oparg = (encoded_op << 8) >> 20;
 	int cmparg = (encoded_op << 20) >> 20;
-	int ret;
-	struct __get_user asm_ret;
+	int uninitialized_var(val), ret;
+
+	__futex_prolog();
+
+	/* The 32-bit futex code makes this assumption, so validate it here. */
+	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));

 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
@@ -71,46 +127,45 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	pagefault_disable();
 	switch (op) {
 	case FUTEX_OP_SET:
-		asm_ret = futex_set(uaddr, oparg);
+		__futex_set();
 		break;
 	case FUTEX_OP_ADD:
-		asm_ret = futex_add(uaddr, oparg);
+		__futex_add();
 		break;
 	case FUTEX_OP_OR:
-		asm_ret = futex_or(uaddr, oparg);
+		__futex_or();
 		break;
 	case FUTEX_OP_ANDN:
-		asm_ret = futex_andn(uaddr, oparg);
+		__futex_andn();
 		break;
 	case FUTEX_OP_XOR:
-		asm_ret = futex_xor(uaddr, oparg);
+		__futex_xor();
 		break;
 	default:
-		asm_ret.err = -ENOSYS;
+		ret = -ENOSYS;
+		break;
 	}
 	pagefault_enable();

-	ret = asm_ret.err;
-
 	if (!ret) {
 		switch (cmp) {
 		case FUTEX_OP_CMP_EQ:
-			ret = (asm_ret.val == cmparg);
+			ret = (val == cmparg);
 			break;
 		case FUTEX_OP_CMP_NE:
-			ret = (asm_ret.val != cmparg);
+			ret = (val != cmparg);
 			break;
 		case FUTEX_OP_CMP_LT:
-			ret = (asm_ret.val < cmparg);
+			ret = (val < cmparg);
 			break;
 		case FUTEX_OP_CMP_GE:
-			ret = (asm_ret.val >= cmparg);
+			ret = (val >= cmparg);
 			break;
 		case FUTEX_OP_CMP_LE:
-			ret = (asm_ret.val <= cmparg);
+			ret = (val <= cmparg);
 			break;
 		case FUTEX_OP_CMP_GT:
-			ret = (asm_ret.val > cmparg);
+			ret = (val > cmparg);
 			break;
 		default:
 			ret = -ENOSYS;
@@ -120,22 +175,20 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 }

 static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
-						u32 oldval, u32 newval)
+						u32 oldval, u32 oparg)
 {
-	struct __get_user asm_ret;
+	int ret, val;
+
+	__futex_prolog();

 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;

-	asm_ret = futex_cmpxchg(uaddr, oldval, newval);
-	*uval = asm_ret.val;
-	return asm_ret.err;
-}
+	__futex_cmpxchg();

-#ifndef __tilegx__
-/* Return failure from the atomic wrappers. */
-struct __get_user __atomic_bad_address(int __user *addr);
-#endif
+	*uval = val;
+	return ret;
+}

 #endif /* !__ASSEMBLY__ */


--- a/arch/tile/include/asm/hardwall.h
+++ b/arch/tile/include/asm/hardwall.h
@@ -11,12 +11,14 @@
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 *
- * Provide methods for the HARDWALL_FILE for accessing the UDN.
+ * Provide methods for access control of per-cpu resources like
+ * UDN, IDN, or IPI.
 */

 #ifndef _ASM_TILE_HARDWALL_H
 #define _ASM_TILE_HARDWALL_H

+#include <arch/chip.h>
 #include <linux/ioctl.h>

 #define HARDWALL_IOCTL_BASE 0xa2
@@ -24,8 +26,9 @@
 /*
 * The HARDWALL_CREATE() ioctl is a macro with a "size" argument.
 * The resulting ioctl value is passed to the kernel in conjunction
- * with a pointer to a little-endian bitmask of cpus, which must be
- * physically in a rectangular configuration on the chip.
+ * with a pointer to a standard kernel bitmask of cpus.
+ * For network resources (UDN or IDN) the bitmask must physically
+ * represent a rectangular configuration on the chip.
 * The "size" is the number of bytes of cpu mask data.
 */
 #define _HARDWALL_CREATE 1
@@ -44,13 +47,7 @@
 #define HARDWALL_GET_ID \
 _IO(HARDWALL_IOCTL_BASE, _HARDWALL_GET_ID)

-#ifndef __KERNEL__
-
-/* This is the canonical name expected by userspace. */
-#define HARDWALL_FILE "/dev/hardwall"
-
-#else
-
+#ifdef __KERNEL__
 /* /proc hooks for hardwall. */
 struct proc_dir_entry;
 #ifdef CONFIG_HARDWALL
@@ -59,7 +56,6 @@ int proc_pid_hardwall(struct task_struct *task, char *buffer);
 #else
 static inline void proc_tile_hardwall_init(struct proc_dir_entry *root) {}
 #endif
-
 #endif

 #endif /* _ASM_TILE_HARDWALL_H */
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -106,4 +106,25 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }

+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+				       struct page *page, int writable)
+{
+	size_t pagesize = huge_page_size(hstate_vma(vma));
+	if (pagesize != PUD_SIZE && pagesize != PMD_SIZE)
+		entry = pte_mksuper(entry);
+	return entry;
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
+/* Sizes to scale up page size for PTEs with HV_PTE_SUPER bit. */
+enum {
+	HUGE_SHIFT_PGDIR = 0,
+	HUGE_SHIFT_PMD = 1,
+	HUGE_SHIFT_PAGE = 2,
+	HUGE_SHIFT_ENTRIES
+};
+extern int huge_shift[HUGE_SHIFT_ENTRIES];
+#endif
+
 #endif /* _ASM_TILE_HUGETLB_H */
--- a/arch/tile/include/asm/irqflags.h
+++ b/arch/tile/include/asm/irqflags.h
@@ -28,10 +28,10 @@
 */
 #if CHIP_HAS_AUX_PERF_COUNTERS()
 #define LINUX_MASKABLE_INTERRUPTS_HI \
-       (~(INT_MASK_HI(INT_PERF_COUNT) | INT_MASK_HI(INT_AUX_PERF_COUNT)))
+	(~(INT_MASK_HI(INT_PERF_COUNT) | INT_MASK_HI(INT_AUX_PERF_COUNT)))
 #else
 #define LINUX_MASKABLE_INTERRUPTS_HI \
-       (~(INT_MASK_HI(INT_PERF_COUNT)))
+	(~(INT_MASK_HI(INT_PERF_COUNT)))
 #endif

 #else
@@ -90,6 +90,14 @@
 	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_0, (unsigned long)(__m)); \
 	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_1, (unsigned long)(__m>>32)); \
 } while (0)
+#define interrupt_mask_save_mask() \
+	(__insn_mfspr(SPR_INTERRUPT_MASK_SET_K_0) | \
+	 (((unsigned long long)__insn_mfspr(SPR_INTERRUPT_MASK_SET_K_1))<<32))
+#define interrupt_mask_restore_mask(mask) do { \
+	unsigned long long __m = (mask); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_K_0, (unsigned long)(__m)); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_K_1, (unsigned long)(__m>>32)); \
+} while (0)
 #else
 #define interrupt_mask_set(n) \
 	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (1UL << (n)))
@@ -101,6 +109,10 @@
 	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (mask))
 #define interrupt_mask_reset_mask(mask) \
 	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K, (mask))
+#define interrupt_mask_save_mask() \
+	__insn_mfspr(SPR_INTERRUPT_MASK_K)
+#define interrupt_mask_restore_mask(mask) \
+	__insn_mtspr(SPR_INTERRUPT_MASK_K, (mask))
 #endif

 /*
@@ -122,7 +134,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);

 /* Disable all interrupts, including NMIs. */
 #define arch_local_irq_disable_all() \
-	interrupt_mask_set_mask(-1UL)
+	interrupt_mask_set_mask(-1ULL)

 /* Re-enable all maskable interrupts. */
 #define arch_local_irq_enable() \
@@ -179,7 +191,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 #ifdef __tilegx__

 #if INT_MEM_ERROR != 0
-# error Fix IRQ_DISABLED() macro
+# error Fix IRQS_DISABLED() macro
 #endif

 /* Return 0 or 1 to indicate whether interrupts are currently disabled. */
@@ -207,9 +219,10 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 	mtspr   SPR_INTERRUPT_MASK_SET_K, tmp

 /* Enable interrupts. */
-#define IRQ_ENABLE(tmp0, tmp1)					\
+#define IRQ_ENABLE_LOAD(tmp0, tmp1)				\
 	GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0);			\
-	ld      tmp0, tmp0;					\
+	ld      tmp0, tmp0
+#define IRQ_ENABLE_APPLY(tmp0, tmp1)				\
 	mtspr   SPR_INTERRUPT_MASK_RESET_K, tmp0

 #else /* !__tilegx__ */
@@ -253,17 +266,22 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 	mtspr   SPR_INTERRUPT_MASK_SET_K_1, tmp

 /* Enable interrupts. */
-#define IRQ_ENABLE(tmp0, tmp1)					\
+#define IRQ_ENABLE_LOAD(tmp0, tmp1)				\
 	GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0);			\
 	{							\
 	 lw     tmp0, tmp0;					\
 	 addi   tmp1, tmp0, 4					\
 	};							\
-	lw      tmp1, tmp1;					\
+	lw      tmp1, tmp1
+#define IRQ_ENABLE_APPLY(tmp0, tmp1)				\
 	mtspr   SPR_INTERRUPT_MASK_RESET_K_0, tmp0;		\
 	mtspr   SPR_INTERRUPT_MASK_RESET_K_1, tmp1
 #endif

+#define IRQ_ENABLE(tmp0, tmp1)					\
+	IRQ_ENABLE_LOAD(tmp0, tmp1);				\
+	IRQ_ENABLE_APPLY(tmp0, tmp1)
+
 /*
 * Do the CPU's IRQ-state tracing from assembly code. We call a
 * C function, but almost everywhere we do, we don't mind clobbering

--- a/arch/tile/include/asm/kexec.h
+++ b/arch/tile/include/asm/kexec.h
@@ -19,12 +19,24 @@

 #include <asm/page.h>

+#ifndef __tilegx__
 /* Maximum physical address we can use pages from. */
 #define KEXEC_SOURCE_MEMORY_LIMIT TASK_SIZE
 /* Maximum address we can reach in physical address mode. */
 #define KEXEC_DESTINATION_MEMORY_LIMIT TASK_SIZE
 /* Maximum address we can use for the control code buffer. */
 #define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+#else
+/* We need to limit the memory below PGDIR_SIZE since
+ * we only setup page table for [0, PGDIR_SIZE) before final kexec.
+ */
+/* Maximum physical address we can use pages from. */
+#define KEXEC_SOURCE_MEMORY_LIMIT PGDIR_SIZE
+/* Maximum address we can reach in physical address mode. */
+#define KEXEC_DESTINATION_MEMORY_LIMIT PGDIR_SIZE
+/* Maximum address we can use for the control code buffer. */
+#define KEXEC_CONTROL_MEMORY_LIMIT PGDIR_SIZE
+#endif

 #define KEXEC_CONTROL_PAGE_SIZE	PAGE_SIZE


--- a/arch/tile/include/asm/mmu.h
+++ b/arch/tile/include/asm/mmu.h
@@ -21,7 +21,7 @@ struct mm_context {
 	 * Written under the mmap_sem semaphore; read without the
 	 * semaphore but atomically, but it is conservatively set.
 	 */
-	unsigned int priority_cached;
+	unsigned long priority_cached;
 };

 typedef struct mm_context mm_context_t;

--- a/arch/tile/include/asm/mmu_context.h
+++ b/arch/tile/include/asm/mmu_context.h
@@ -30,11 +30,15 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }

-/* Note that arch/tile/kernel/head.S also calls hv_install_context() */
+/*
+ * Note that arch/tile/kernel/head_NN.S and arch/tile/mm/migrate_NN.S
+ * also call hv_install_context().
+ */
 static inline void __install_page_table(pgd_t *pgdir, int asid, pgprot_t prot)
 {
 	/* FIXME: DIRECTIO should not always be set. FIXME. */
-	int rc = hv_install_context(__pa(pgdir), prot, asid, HV_CTX_DIRECTIO);
+	int rc = hv_install_context(__pa(pgdir), prot, asid,
+				    HV_CTX_DIRECTIO | CTX_PAGE_FLAG);
 	if (rc < 0)
 		panic("hv_install_context failed: %d", rc);
 }

--- a/arch/tile/include/asm/module.h
+++ b/arch/tile/include/asm/module.h
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_MODULE_H
+#define _ASM_TILE_MODULE_H
+
+#include <arch/chip.h>
+
+#include <asm-generic/module.h>
+
+/* We can't use modules built with different page sizes. */
+#if defined(CONFIG_PAGE_SIZE_16KB)
+# define MODULE_PGSZ " 16KB"
+#elif defined(CONFIG_PAGE_SIZE_64KB)
+# define MODULE_PGSZ " 64KB"
+#else
+# define MODULE_PGSZ ""
+#endif
+
+/* We don't really support no-SMP so tag if someone tries. */
+#ifdef CONFIG_SMP
+#define MODULE_NOSMP ""
+#else
+#define MODULE_NOSMP " nosmp"
+#endif
+
+#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
+
+#endif /* _ASM_TILE_MODULE_H */
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -20,8 +20,17 @@
 #include <arch/chip.h>

 /* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */
-#define PAGE_SHIFT	HV_LOG2_PAGE_SIZE_SMALL
-#define HPAGE_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
+#if defined(CONFIG_PAGE_SIZE_16KB)
+#define PAGE_SHIFT	14
+#define CTX_PAGE_FLAG	HV_CTX_PG_SM_16K
+#elif defined(CONFIG_PAGE_SIZE_64KB)
+#define PAGE_SHIFT	16
+#define CTX_PAGE_FLAG	HV_CTX_PG_SM_64K
+#else
+#define PAGE_SHIFT	HV_LOG2_DEFAULT_PAGE_SIZE_SMALL
+#define CTX_PAGE_FLAG	0
+#endif
+#define HPAGE_SHIFT	HV_LOG2_DEFAULT_PAGE_SIZE_LARGE

 #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
 #define HPAGE_SIZE	(_AC(1, UL) << HPAGE_SHIFT)
@@ -78,8 +87,7 @@ typedef HV_PTE pgprot_t;
 /*
 * User L2 page tables are managed as one L2 page table per page,
 * because we use the page allocator for them.  This keeps the allocation
- * simple and makes it potentially useful to implement HIGHPTE at some point.
- * However, it's also inefficient, since L2 page tables are much smaller
+ * simple, but it's also inefficient, since L2 page tables are much smaller
 * than pages (currently 2KB vs 64KB).  So we should revisit this.
 */
 typedef struct page *pgtable_t;
@@ -128,7 +136,7 @@ static inline __attribute_const__ int get_order(unsigned long size)

 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)

-#define HUGE_MAX_HSTATE		2
+#define HUGE_MAX_HSTATE		6

 #ifdef CONFIG_HUGETLB_PAGE
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA

--- a/arch/tile/include/asm/pgalloc.h
+++ b/arch/tile/include/asm/pgalloc.h
@@ -19,24 +19,24 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <asm/fixmap.h>
+#include <asm/page.h>
 #include <hv/hypervisor.h>

 /* Bits for the size of the second-level page table. */
-#define L2_KERNEL_PGTABLE_SHIFT \
-  (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL + HV_LOG2_PTE_SIZE)
+#define L2_KERNEL_PGTABLE_SHIFT _HV_LOG2_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
+
+/* How big is a kernel L2 page table? */
+#define L2_KERNEL_PGTABLE_SIZE (1UL << L2_KERNEL_PGTABLE_SHIFT)

 /* We currently allocate user L2 page tables by page (unlike kernel L2s). */
-#if L2_KERNEL_PGTABLE_SHIFT < HV_LOG2_PAGE_SIZE_SMALL
-#define L2_USER_PGTABLE_SHIFT HV_LOG2_PAGE_SIZE_SMALL
+#if L2_KERNEL_PGTABLE_SHIFT < PAGE_SHIFT
+#define L2_USER_PGTABLE_SHIFT PAGE_SHIFT
 #else
 #define L2_USER_PGTABLE_SHIFT L2_KERNEL_PGTABLE_SHIFT
 #endif

 /* How many pages do we need, as an "order", for a user L2 page table? */
-#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - HV_LOG2_PAGE_SIZE_SMALL)
-
-/* How big is a kernel L2 page table? */
-#define L2_KERNEL_PGTABLE_SIZE (1 << L2_KERNEL_PGTABLE_SHIFT)
+#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - PAGE_SHIFT)

 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
@@ -50,14 +50,14 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 static inline void pmd_populate_kernel(struct mm_struct *mm,
 				       pmd_t *pmd, pte_t *ptep)
 {
-	set_pmd(pmd, ptfn_pmd(__pa(ptep) >> HV_LOG2_PAGE_TABLE_ALIGN,
+	set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(__pa(ptep)),
 			      __pgprot(_PAGE_PRESENT)));
 }

 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 				pgtable_t page)
 {
-	set_pmd(pmd, ptfn_pmd(HV_PFN_TO_PTFN(page_to_pfn(page)),
+	set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(PFN_PHYS(page_to_pfn(page))),
 			      __pgprot(_PAGE_PRESENT)));
 }

@@ -68,8 +68,20 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);

-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address);
-extern void pte_free(struct mm_struct *mm, struct page *pte);
+extern pgtable_t pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
+				   int order);
+extern void pgtable_free(struct mm_struct *mm, struct page *pte, int order);
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+				      unsigned long address)
+{
+	return pgtable_alloc_one(mm, address, L2_USER_PGTABLE_ORDER);
+}
+
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+	pgtable_free(mm, pte, L2_USER_PGTABLE_ORDER);
+}

 #define pmd_pgtable(pmd) pmd_page(pmd)

@@ -85,8 +97,13 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 	pte_free(mm, virt_to_page(pte));
 }

-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
-			   unsigned long address);
+extern void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
+			       unsigned long address, int order);
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
+				  unsigned long address)
+{
+	__pgtable_free_tlb(tlb, pte, address, L2_USER_PGTABLE_ORDER);
+}

 #define check_pgt_cache()	do { } while (0)

@@ -104,19 +121,44 @@ void shatter_pmd(pmd_t *pmd);
 void shatter_huge_page(unsigned long addr);

 #ifdef __tilegx__
-/* We share a single page allocator for both L1 and L2 page tables. */
-#if HV_L1_SIZE != HV_L2_SIZE
-# error Rework assumption that L1 and L2 page tables are same size.
-#endif
-#define L1_USER_PGTABLE_ORDER L2_USER_PGTABLE_ORDER
+
 #define pud_populate(mm, pud, pmd) \
  pmd_populate_kernel((mm), (pmd_t *)(pud), (pte_t *)(pmd))
-#define pmd_alloc_one(mm, addr) \
-  ((pmd_t *)page_to_virt(pte_alloc_one((mm), (addr))))
-#define pmd_free(mm, pmdp) \
-  pte_free((mm), virt_to_page(pmdp))
-#define __pmd_free_tlb(tlb, pmdp, address) \
-  __pte_free_tlb((tlb), virt_to_page(pmdp), (address))
+
+/* Bits for the size of the L1 (intermediate) page table. */
+#define L1_KERNEL_PGTABLE_SHIFT _HV_LOG2_L1_SIZE(HPAGE_SHIFT)
+
+/* How big is a kernel L2 page table? */
+#define L1_KERNEL_PGTABLE_SIZE (1UL << L1_KERNEL_PGTABLE_SHIFT)
+
+/* We currently allocate L1 page tables by page. */
+#if L1_KERNEL_PGTABLE_SHIFT < PAGE_SHIFT
+#define L1_USER_PGTABLE_SHIFT PAGE_SHIFT
+#else
+#define L1_USER_PGTABLE_SHIFT L1_KERNEL_PGTABLE_SHIFT
 #endif

+/* How many pages do we need, as an "order", for an L1 page table? */
+#define L1_USER_PGTABLE_ORDER (L1_USER_PGTABLE_SHIFT - PAGE_SHIFT)
+
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+	struct page *p = pgtable_alloc_one(mm, address, L1_USER_PGTABLE_ORDER);
+	return (pmd_t *)page_to_virt(p);
+}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pgtable_free(mm, virt_to_page(pmdp), L1_USER_PGTABLE_ORDER);
+}
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
+				  unsigned long address)
+{
+	__pgtable_free_tlb(tlb, virt_to_page(pmdp), address,
+			   L1_USER_PGTABLE_ORDER);
+}
+
+#endif /* __tilegx__ */
+
 #endif /* _ASM_TILE_PGALLOC_H */
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -27,8 +27,10 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/pfn.h>
 #include <asm/processor.h>
 #include <asm/fixmap.h>
+#include <asm/page.h>

 struct mm_struct;
 struct vm_area_struct;
@@ -69,6 +71,7 @@ extern void set_page_homes(void);

 #define _PAGE_PRESENT           HV_PTE_PRESENT
 #define _PAGE_HUGE_PAGE         HV_PTE_PAGE
+#define _PAGE_SUPER_PAGE        HV_PTE_SUPER
 #define _PAGE_READABLE          HV_PTE_READABLE
 #define _PAGE_WRITABLE          HV_PTE_WRITABLE
 #define _PAGE_EXECUTABLE        HV_PTE_EXECUTABLE
@@ -85,6 +88,7 @@ extern void set_page_homes(void);
 #define _PAGE_ALL (\
  _PAGE_PRESENT | \
  _PAGE_HUGE_PAGE | \
+  _PAGE_SUPER_PAGE | \
  _PAGE_READABLE | \
  _PAGE_WRITABLE | \
  _PAGE_EXECUTABLE | \
@@ -162,7 +166,7 @@ extern void set_page_homes(void);
  (pgprot_t) { ((oldprot).val & ~_PAGE_ALL) | (newprot).val }

 /* Just setting the PFN to zero suffices. */
-#define pte_pgprot(x) hv_pte_set_pfn((x), 0)
+#define pte_pgprot(x) hv_pte_set_pa((x), 0)

 /*
 * For PTEs and PDEs, we must clear the Present bit first when
@@ -187,6 +191,7 @@ static inline void __pte_clear(pte_t *ptep)
 * Undefined behaviour if not..
 */
 #define pte_present hv_pte_get_present
+#define pte_mknotpresent hv_pte_clear_present
 #define pte_user hv_pte_get_user
 #define pte_read hv_pte_get_readable
 #define pte_dirty hv_pte_get_dirty
@@ -194,6 +199,7 @@ static inline void __pte_clear(pte_t *ptep)
 #define pte_write hv_pte_get_writable
 #define pte_exec hv_pte_get_executable
 #define pte_huge hv_pte_get_page
+#define pte_super hv_pte_get_super
 #define pte_rdprotect hv_pte_clear_readable
 #define pte_exprotect hv_pte_clear_executable
 #define pte_mkclean hv_pte_clear_dirty
@@ -206,6 +212,7 @@ static inline void __pte_clear(pte_t *ptep)
 #define pte_mkyoung hv_pte_set_accessed
 #define pte_mkwrite hv_pte_set_writable
 #define pte_mkhuge hv_pte_set_page
+#define pte_mksuper hv_pte_set_super

 #define pte_special(pte) 0
 #define pte_mkspecial(pte) (pte)
@@ -261,7 +268,7 @@ static inline int pte_none(pte_t pte)

 static inline unsigned long pte_pfn(pte_t pte)
 {
-	return hv_pte_get_pfn(pte);
+	return PFN_DOWN(hv_pte_get_pa(pte));
 }

 /* Set or get the remote cache cpu in a pgprot with remote caching. */
@@ -270,7 +277,7 @@ extern int get_remote_cache_cpu(pgprot_t prot);

 static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
 {
-	return hv_pte_set_pfn(prot, pfn);
+	return hv_pte_set_pa(prot, PFN_PHYS(pfn));
 }

 /* Support for priority mappings. */
@@ -312,7 +319,7 @@ extern void check_mm_caching(struct mm_struct *prev, struct mm_struct *next);
 */
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
-	return pfn_pte(hv_pte_get_pfn(pte), newprot);
+	return pfn_pte(pte_pfn(pte), newprot);
 }

 /*
@@ -335,13 +342,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)

-#if defined(CONFIG_HIGHPTE)
-extern pte_t *pte_offset_map(pmd_t *, unsigned long address);
-#define pte_unmap(pte) kunmap_atomic(pte)
-#else
 #define pte_offset_map(dir, address) pte_offset_kernel(dir, address)
 #define pte_unmap(pte) do { } while (0)
-#endif

 /* Clear a non-executable kernel PTE and flush it from the TLB. */
 #define kpte_clear_flush(ptep, vaddr)		\
@@ -410,6 +412,46 @@ static inline unsigned long pmd_index(unsigned long address)
 	return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
 }

+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+					    unsigned long address,
+					    pmd_t *pmdp)
+{
+	return ptep_test_and_clear_young(vma, address, pmdp_ptep(pmdp));
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+				      unsigned long address, pmd_t *pmdp)
+{
+	ptep_set_wrprotect(mm, address, pmdp_ptep(pmdp));
+}
+
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+				       unsigned long address,
+				       pmd_t *pmdp)
+{
+	return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp)));
+}
+
+static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	set_pte(pmdp_ptep(pmdp), pmd_pte(pmdval));
+}
+
+#define set_pmd_at(mm, addr, pmdp, pmdval) __set_pmd(pmdp, pmdval)
+
+/* Create a pmd from a PTFN. */
+static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
+{
+	return pte_pmd(hv_pte_set_ptfn(prot, ptfn));
+}
+
+/* Return the page-table frame number (ptfn) that a pmd_t points at. */
+#define pmd_ptfn(pmd) hv_pte_get_ptfn(pmd_pte(pmd))
+
 /*
 * A given kernel pmd_t maps to a specific virtual address (either a
 * kernel huge page or a kernel pte_t table).  Since kernel pte_t
@@ -430,7 +472,48 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 * OK for pte_lockptr(), since we just end up with potentially one
 * lock being used for several pte_t arrays.
 */
-#define pmd_page(pmd) pfn_to_page(HV_PTFN_TO_PFN(pmd_ptfn(pmd)))
+#define pmd_page(pmd) pfn_to_page(PFN_DOWN(HV_PTFN_TO_CPA(pmd_ptfn(pmd))))
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	__pte_clear(pmdp_ptep(pmdp));
+}
+
+#define pmd_mknotpresent(pmd)	pte_pmd(pte_mknotpresent(pmd_pte(pmd)))
+#define pmd_young(pmd)		pte_young(pmd_pte(pmd))
+#define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
+#define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
+#define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_write(pmd)		pte_write(pmd_pte(pmd))
+#define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
+#define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_huge_page(pmd)	pte_huge(pmd_pte(pmd))
+#define pmd_mkhuge(pmd)		pte_pmd(pte_mkhuge(pmd_pte(pmd)))
+#define __HAVE_ARCH_PMD_WRITE
+
+#define pfn_pmd(pfn, pgprot)	pte_pmd(pfn_pte((pfn), (pgprot)))
+#define pmd_pfn(pmd)		pte_pfn(pmd_pte(pmd))
+#define mk_pmd(page, pgprot)	pfn_pmd(page_to_pfn(page), (pgprot))
+
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+	return pfn_pmd(pmd_pfn(pmd), newprot);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define has_transparent_hugepage() 1
+#define pmd_trans_huge pmd_huge_page
+
+static inline pmd_t pmd_mksplitting(pmd_t pmd)
+{
+	return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
+}
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+	return hv_pte_get_client2(pmd_pte(pmd));
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

 /*
 * The pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
@@ -448,17 +531,13 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
       return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
 }

-static inline int pmd_huge_page(pmd_t pmd)
-{
-	return pmd_val(pmd) & _PAGE_HUGE_PAGE;
-}
-
 #include <asm-generic/pgtable.h>

 /* Support /proc/NN/pgtable API. */
 struct seq_file;
 int arch_proc_pgtable_show(struct seq_file *m, struct mm_struct *mm,
-			   unsigned long vaddr, pte_t *ptep, void **datap);
+			   unsigned long vaddr, unsigned long pagesize,
+			   pte_t *ptep, void **datap);

 #endif /* !__ASSEMBLY__ */


--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -20,11 +20,12 @@
 * The level-1 index is defined by the huge page size.  A PGD is composed
 * of PTRS_PER_PGD pgd_t's and is the top level of the page table.
 */
-#define PGDIR_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
-#define PGDIR_SIZE	HV_PAGE_SIZE_LARGE
+#define PGDIR_SHIFT	HPAGE_SHIFT
+#define PGDIR_SIZE	HPAGE_SIZE
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
-#define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
-#define SIZEOF_PGD	(PTRS_PER_PGD * sizeof(pgd_t))
+#define PTRS_PER_PGD	_HV_L1_ENTRIES(HPAGE_SHIFT)
+#define PGD_INDEX(va)	_HV_L1_INDEX(va, HPAGE_SHIFT)
+#define SIZEOF_PGD	_HV_L1_SIZE(HPAGE_SHIFT)

 /*
 * The level-2 index is defined by the difference between the huge
@@ -33,8 +34,9 @@
 * Note that the hypervisor docs use PTE for what we call pte_t, so
 * this nomenclature is somewhat confusing.
 */
-#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL))
-#define SIZEOF_PTE	(PTRS_PER_PTE * sizeof(pte_t))
+#define PTRS_PER_PTE	_HV_L2_ENTRIES(HPAGE_SHIFT, PAGE_SHIFT)
+#define PTE_INDEX(va)	_HV_L2_INDEX(va, HPAGE_SHIFT, PAGE_SHIFT)
+#define SIZEOF_PTE	_HV_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)

 #ifndef __ASSEMBLY__

@@ -111,24 +113,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return pte;
 }

-static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	set_pte(&pmdp->pud.pgd, pmdval.pud.pgd);
-}
-
-/* Create a pmd from a PTFN. */
-static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
-{
-	return (pmd_t){ { hv_pte_set_ptfn(prot, ptfn) } };
-}
-
-/* Return the page-table frame number (ptfn) that a pmd_t points at. */
-#define pmd_ptfn(pmd) hv_pte_get_ptfn((pmd).pud.pgd)
-
-static inline void pmd_clear(pmd_t *pmdp)
-{
-	__pte_clear(&pmdp->pud.pgd);
-}
+/*
+ * pmds are wrappers around pgds, which are the same as ptes.
+ * It's often convenient to "cast" back and forth and use the pte methods,
+ * which are the methods supplied by the hypervisor.
+ */
+#define pmd_pte(pmd) ((pmd).pud.pgd)
+#define pmdp_ptep(pmdp) (&(pmdp)->pud.pgd)
+#define pte_pmd(pte) ((pmd_t){ { (pte) } })

 #endif /* __ASSEMBLY__ */


--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -21,17 +21,19 @@
 #define PGDIR_SIZE	HV_L1_SPAN
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD	HV_L0_ENTRIES
-#define SIZEOF_PGD	(PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_INDEX(va)	HV_L0_INDEX(va)
+#define SIZEOF_PGD	HV_L0_SIZE

 /*
 * The level-1 index is defined by the huge page size.  A PMD is composed
 * of PTRS_PER_PMD pgd_t's and is the middle level of the page table.
 */
-#define PMD_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
-#define PMD_SIZE	HV_PAGE_SIZE_LARGE
+#define PMD_SHIFT	HPAGE_SHIFT
+#define PMD_SIZE	HPAGE_SIZE
 #define PMD_MASK	(~(PMD_SIZE-1))
-#define PTRS_PER_PMD	(1 << (PGDIR_SHIFT - PMD_SHIFT))
-#define SIZEOF_PMD	(PTRS_PER_PMD * sizeof(pmd_t))
+#define PTRS_PER_PMD	_HV_L1_ENTRIES(HPAGE_SHIFT)
+#define PMD_INDEX(va)	_HV_L1_INDEX(va, HPAGE_SHIFT)
+#define SIZEOF_PMD	_HV_L1_SIZE(HPAGE_SHIFT)

 /*
 * The level-2 index is defined by the difference between the huge
@@ -40,17 +42,19 @@
 * Note that the hypervisor docs use PTE for what we call pte_t, so
 * this nomenclature is somewhat confusing.
 */
-#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL))
-#define SIZEOF_PTE	(PTRS_PER_PTE * sizeof(pte_t))
+#define PTRS_PER_PTE	_HV_L2_ENTRIES(HPAGE_SHIFT, PAGE_SHIFT)
+#define PTE_INDEX(va)	_HV_L2_INDEX(va, HPAGE_SHIFT, PAGE_SHIFT)
+#define SIZEOF_PTE	_HV_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)

 /*
- * Align the vmalloc area to an L2 page table, and leave a guard page
- * at the beginning and end.  The vmalloc code also puts in an internal
+ * Align the vmalloc area to an L2 page table.  Omit guard pages at
+ * the beginning and end for simplicity (particularly in the per-cpu
+ * memory allocation code).  The vmalloc code puts in an internal
 * guard page between each allocation.
 */
 #define _VMALLOC_END	HUGE_VMAP_BASE
-#define VMALLOC_END	(_VMALLOC_END - PAGE_SIZE)
-#define VMALLOC_START	(_VMALLOC_START + PAGE_SIZE)
+#define VMALLOC_END	_VMALLOC_END
+#define VMALLOC_START	_VMALLOC_START

 #define HUGE_VMAP_END	(HUGE_VMAP_BASE + PGDIR_SIZE)

@@ -98,7 +102,7 @@ static inline int pud_bad(pud_t pud)
 * A pud_t points to a pmd_t array.  Since we can have multiple per
 * page, we don't have a one-to-one mapping of pud_t's to pages.
 */
-#define pud_page(pud) pfn_to_page(HV_PTFN_TO_PFN(pud_ptfn(pud)))
+#define pud_page(pud) pfn_to_page(PFN_DOWN(HV_PTFN_TO_CPA(pud_ptfn(pud))))

 static inline unsigned long pud_index(unsigned long address)
 {
@@ -108,28 +112,6 @@ static inline unsigned long pud_index(unsigned long address)
 #define pmd_offset(pud, address) \
 	((pmd_t *)pud_page_vaddr(*(pud)) + pmd_index(address))

-static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	set_pte(pmdp, pmdval);
-}
-
-/* Create a pmd from a PTFN and pgprot. */
-static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
-{
-	return hv_pte_set_ptfn(prot, ptfn);
-}
-
-/* Return the page-table frame number (ptfn) that a pmd_t points at. */
-static inline unsigned long pmd_ptfn(pmd_t pmd)
-{
-	return hv_pte_get_ptfn(pmd);
-}
-
-static inline void pmd_clear(pmd_t *pmdp)
-{
-	__pte_clear(pmdp);
-}
-
 /* Normalize an address to having the correct high bits set. */
 #define pgd_addr_normalize pgd_addr_normalize
 static inline unsigned long pgd_addr_normalize(unsigned long addr)
@@ -170,6 +152,13 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return hv_pte(__insn_exch(&ptep->val, 0UL));
 }

+/*
+ * pmds are the same as pgds and ptes, so converting is a no-op.
+ */
+#define pmd_pte(pmd) (pmd)
+#define pmdp_ptep(pmdp) (pmdp)
+#define pte_pmd(pte) (pte)
+
 #endif /* __ASSEMBLY__ */

 #endif /* _ASM_TILE_PGTABLE_64_H */
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -76,6 +76,17 @@ struct async_tlb {

 #ifdef CONFIG_HARDWALL
 struct hardwall_info;
+struct hardwall_task {
+	/* Which hardwall is this task tied to? (or NULL if none) */
+	struct hardwall_info *info;
+	/* Chains this task into the list at info->task_head. */
+	struct list_head list;
+};
+#ifdef __tilepro__
+#define HARDWALL_TYPES 1   /* udn */
+#else
+#define HARDWALL_TYPES 3   /* udn, idn, and ipi */
+#endif
 #endif

 struct thread_struct {
@@ -116,10 +127,8 @@ struct thread_struct {
 	unsigned long dstream_pf;
 #endif
 #ifdef CONFIG_HARDWALL
-	/* Is this task tied to an activated hardwall? */
-	struct hardwall_info *hardwall;
-	/* Chains this task into the list at hardwall->list. */
-	struct list_head hardwall_list;
+	/* Hardwall information for various resources. */
+	struct hardwall_task hardwall[HARDWALL_TYPES];
 #endif
 #if CHIP_HAS_TILE_DMA()
 	/* Async DMA TLB fault information */

--- a/arch/tile/include/asm/setup.h
+++ b/arch/tile/include/asm/setup.h
@@ -41,15 +41,15 @@ void restrict_dma_mpls(void);
 #ifdef CONFIG_HARDWALL
 /* User-level network management functions */
 void reset_network_state(void);
-void grant_network_mpls(void);
-void restrict_network_mpls(void);
 struct task_struct;
-int hardwall_deactivate(struct task_struct *task);
+void hardwall_switch_tasks(struct task_struct *prev, struct task_struct *next);
+void hardwall_deactivate_all(struct task_struct *task);
+int hardwall_ipi_valid(int cpu);

 /* Hook hardwall code into changes in affinity. */
 #define arch_set_cpus_allowed(p, new_mask) do { \
-	if (p->thread.hardwall && !cpumask_equal(&p->cpus_allowed, new_mask)) \
-		hardwall_deactivate(p); \
+	if (!cpumask_equal(&p->cpus_allowed, new_mask)) \
+		hardwall_deactivate_all(p); \
 } while (0)
 #endif


--- a/arch/tile/include/asm/syscalls.h
+++ b/arch/tile/include/asm/syscalls.h
@@ -43,7 +43,8 @@ long sys32_fadvise64(int fd, u32 offset_lo, u32 offset_hi,
 		     u32 len, int advice);
 int sys32_fadvise64_64(int fd, u32 offset_lo, u32 offset_hi,
 		       u32 len_lo, u32 len_hi, int advice);
-long sys_flush_cache(void);
+long sys_cacheflush(unsigned long addr, unsigned long len,
+		    unsigned long flags);
 #ifndef __tilegx__  /* No mmap() in the 32-bit kernel. */
 #define sys_mmap sys_mmap
 #endif

--- a/arch/tile/include/asm/tlbflush.h
+++ b/arch/tile/include/asm/tlbflush.h
@@ -38,16 +38,11 @@ DECLARE_PER_CPU(int, current_asid);
 /* The hypervisor tells us what ASIDs are available to us. */
 extern int min_asid, max_asid;

-static inline unsigned long hv_page_size(const struct vm_area_struct *vma)
-{
-	return (vma->vm_flags & VM_HUGETLB) ? HPAGE_SIZE : PAGE_SIZE;
-}
-
 /* Pass as vma pointer for non-executable mapping, if no vma available. */
-#define FLUSH_NONEXEC ((const struct vm_area_struct *)-1UL)
+#define FLUSH_NONEXEC ((struct vm_area_struct *)-1UL)

 /* Flush a single user page on this cpu. */
-static inline void local_flush_tlb_page(const struct vm_area_struct *vma,
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
 					unsigned long addr,
 					unsigned long page_size)
 {
@@ -60,7 +55,7 @@ static inline void local_flush_tlb_page(const struct vm_area_struct *vma,
 }

 /* Flush range of user pages on this cpu. */
-static inline void local_flush_tlb_pages(const struct vm_area_struct *vma,
+static inline void local_flush_tlb_pages(struct vm_area_struct *vma,
 					 unsigned long addr,
 					 unsigned long page_size,
 					 unsigned long len)
@@ -117,10 +112,10 @@ extern void flush_tlb_all(void);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 extern void flush_tlb_current_task(void);
 extern void flush_tlb_mm(struct mm_struct *);
-extern void flush_tlb_page(const struct vm_area_struct *, unsigned long);
-extern void flush_tlb_page_mm(const struct vm_area_struct *,
+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+extern void flush_tlb_page_mm(struct vm_area_struct *,
 			      struct mm_struct *, unsigned long);
-extern void flush_tlb_range(const struct vm_area_struct *,
+extern void flush_tlb_range(struct vm_area_struct *,
 			    unsigned long start, unsigned long end);

 #define flush_tlb()     flush_tlb_current_task()

--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -114,45 +114,75 @@ struct exception_table_entry {
 extern int fixup_exception(struct pt_regs *regs);

 /*
- * We return the __get_user_N function results in a structure,
- * thus in r0 and r1.  If "err" is zero, "val" is the result
- * of the read; otherwise, "err" is -EFAULT.
- *
- * We rarely need 8-byte values on a 32-bit architecture, but
- * we size the structure to accommodate.  In practice, for the
- * the smaller reads, we can zero the high word for free, and
- * the caller will ignore it by virtue of casting anyway.
+ * Support macros for __get_user().
+ *
+ * Implementation note: The "case 8" logic of casting to the type of
+ * the result of subtracting the value from itself is basically a way
+ * of keeping all integer types the same, but casting any pointers to
+ * ptrdiff_t, i.e. also an integer type.  This way there are no
+ * questionable casts seen by the compiler on an ILP32 platform.
+ *
+ * Note that __get_user() and __put_user() assume proper alignment.
 */
-struct __get_user {
-	unsigned long long val;
-	int err;
-};

-/*
- * FIXME: we should express these as inline extended assembler, since
- * they're fundamentally just a variable dereference and some
- * supporting exception_table gunk.  Note that (a la i386) we can
- * extend the copy_to_user and copy_from_user routines to call into
- * such extended assembler routines, though we will have to use a
- * different return code in that case (1, 2, or 4, rather than -EFAULT).
- */
-extern struct __get_user __get_user_1(const void __user *);
-extern struct __get_user __get_user_2(const void __user *);
-extern struct __get_user __get_user_4(const void __user *);
-extern struct __get_user __get_user_8(const void __user *);
-extern int __put_user_1(long, void __user *);
-extern int __put_user_2(long, void __user *);
-extern int __put_user_4(long, void __user *);
-extern int __put_user_8(long long, void __user *);
-
-/* Unimplemented routines to cause linker failures */
-extern struct __get_user __get_user_bad(void);
-extern int __put_user_bad(void);
+#ifdef __LP64__
+#define _ASM_PTR	".quad"
+#else
+#define _ASM_PTR	".long"
+#endif
+
+#define __get_user_asm(OP, x, ptr, ret)					\
+	asm volatile("1: {" #OP " %1, %2; movei %0, 0 }\n"		\
+		     ".pushsection .fixup,\"ax\"\n"			\
+		     "0: { movei %1, 0; movei %0, %3 }\n"		\
+		     "j 9f\n"						\
+		     ".section __ex_table,\"a\"\n"			\
+		     _ASM_PTR " 1b, 0b\n"				\
+		     ".popsection\n"					\
+		     "9:"						\
+		     : "=r" (ret), "=r" (x)				\
+		     : "r" (ptr), "i" (-EFAULT))
+
+#ifdef __tilegx__
+#define __get_user_1(x, ptr, ret) __get_user_asm(ld1u, x, ptr, ret)
+#define __get_user_2(x, ptr, ret) __get_user_asm(ld2u, x, ptr, ret)
+#define __get_user_4(x, ptr, ret) __get_user_asm(ld4u, x, ptr, ret)
+#define __get_user_8(x, ptr, ret) __get_user_asm(ld, x, ptr, ret)
+#else
+#define __get_user_1(x, ptr, ret) __get_user_asm(lb_u, x, ptr, ret)
+#define __get_user_2(x, ptr, ret) __get_user_asm(lh_u, x, ptr, ret)
+#define __get_user_4(x, ptr, ret) __get_user_asm(lw, x, ptr, ret)
+#ifdef __LITTLE_ENDIAN
+#define __lo32(a, b) a
+#define __hi32(a, b) b
+#else
+#define __lo32(a, b) b
+#define __hi32(a, b) a
+#endif
+#define __get_user_8(x, ptr, ret)					\
+	({								\
+		unsigned int __a, __b;					\
+		asm volatile("1: { lw %1, %3; addi %2, %3, 4 }\n"	\
+			     "2: { lw %2, %2; movei %0, 0 }\n"		\
+			     ".pushsection .fixup,\"ax\"\n"		\
+			     "0: { movei %1, 0; movei %2, 0 }\n"	\
+			     "{ movei %0, %4; j 9f }\n"			\
+			     ".section __ex_table,\"a\"\n"		\
+			     ".word 1b, 0b\n"				\
+			     ".word 2b, 0b\n"				\
+			     ".popsection\n"				\
+			     "9:"					\
+			     : "=r" (ret), "=r" (__a), "=&r" (__b)	\
+			     : "r" (ptr), "i" (-EFAULT));		\
+		(x) = (__typeof(x))(__typeof((x)-(x)))			\
+			(((u64)__hi32(__a, __b) << 32) |		\
+			 __lo32(__a, __b));				\
+	})
+#endif
+
+extern int __get_user_bad(void)
+  __attribute__((warning("sizeof __get_user argument not 1, 2, 4 or 8")));

-/*
- * Careful: we have to cast the result to the type of the pointer
- * for sign reasons.
- */
 /**
 * __get_user: - Get a simple variable from user space, with less checking.
 * @x:   Variable to store result.
@@ -174,30 +204,62 @@ extern int __put_user_bad(void);
 * function.
 */
 #define __get_user(x, ptr)						\
-({	struct __get_user __ret;					\
-	__typeof__(*(ptr)) const __user *__gu_addr = (ptr);		\
-	__chk_user_ptr(__gu_addr);					\
-	switch (sizeof(*(__gu_addr))) {					\
-	case 1:								\
-		__ret = __get_user_1(__gu_addr);			\
-		break;							\
-	case 2:								\
-		__ret = __get_user_2(__gu_addr);			\
-		break;							\
-	case 4:								\
-		__ret = __get_user_4(__gu_addr);			\
-		break;							\
-	case 8:								\
-		__ret = __get_user_8(__gu_addr);			\
-		break;							\
-	default:							\
-		__ret = __get_user_bad();				\
-		break;							\
-	}								\
-	(x) = (__typeof__(*__gu_addr)) (__typeof__(*__gu_addr - *__gu_addr)) \
-	  __ret.val;			                                \
-	__ret.err;							\
-})
+	({								\
+		int __ret;						\
+		__chk_user_ptr(ptr);					\
+		switch (sizeof(*(ptr))) {				\
+		case 1: __get_user_1(x, ptr, __ret); break;		\
+		case 2: __get_user_2(x, ptr, __ret); break;		\
+		case 4: __get_user_4(x, ptr, __ret); break;		\
+		case 8: __get_user_8(x, ptr, __ret); break;		\
+		default: __ret = __get_user_bad(); break;		\
+		}							\
+		__ret;							\
+	})
+
+/* Support macros for __put_user(). */
+
+#define __put_user_asm(OP, x, ptr, ret)			\
+	asm volatile("1: {" #OP " %1, %2; movei %0, 0 }\n"		\
+		     ".pushsection .fixup,\"ax\"\n"			\
+		     "0: { movei %0, %3; j 9f }\n"			\
+		     ".section __ex_table,\"a\"\n"			\
+		     _ASM_PTR " 1b, 0b\n"				\
+		     ".popsection\n"					\
+		     "9:"						\
+		     : "=r" (ret)					\
+		     : "r" (ptr), "r" (x), "i" (-EFAULT))
+
+#ifdef __tilegx__
+#define __put_user_1(x, ptr, ret) __put_user_asm(st1, x, ptr, ret)
+#define __put_user_2(x, ptr, ret) __put_user_asm(st2, x, ptr, ret)
+#define __put_user_4(x, ptr, ret) __put_user_asm(st4, x, ptr, ret)
+#define __put_user_8(x, ptr, ret) __put_user_asm(st, x, ptr, ret)
+#else
+#define __put_user_1(x, ptr, ret) __put_user_asm(sb, x, ptr, ret)
+#define __put_user_2(x, ptr, ret) __put_user_asm(sh, x, ptr, ret)
+#define __put_user_4(x, ptr, ret) __put_user_asm(sw, x, ptr, ret)
+#define __put_user_8(x, ptr, ret)					\
+	({								\
+		u64 __x = (__typeof((x)-(x)))(x);			\
+		int __lo = (int) __x, __hi = (int) (__x >> 32);		\
+		asm volatile("1: { sw %1, %2; addi %0, %1, 4 }\n"	\
+			     "2: { sw %0, %3; movei %0, 0 }\n"		\
+			     ".pushsection .fixup,\"ax\"\n"		\
+			     "0: { movei %0, %4; j 9f }\n"		\
+			     ".section __ex_table,\"a\"\n"		\
+			     ".word 1b, 0b\n"				\
+			     ".word 2b, 0b\n"				\
+			     ".popsection\n"				\
+			     "9:"					\
+			     : "=&r" (ret)				\
+			     : "r" (ptr), "r" (__lo32(__lo, __hi)),	\
+			     "r" (__hi32(__lo, __hi)), "i" (-EFAULT));	\
+	})
+#endif
+
+extern int __put_user_bad(void)
+  __attribute__((warning("sizeof __put_user argument not 1, 2, 4 or 8")));

 /**
 * __put_user: - Write a simple value into user space, with less checking.
@@ -217,39 +279,19 @@ extern int __put_user_bad(void);
 * function.
 *
 * Returns zero on success, or -EFAULT on error.
- *
- * Implementation note: The "case 8" logic of casting to the type of
- * the result of subtracting the value from itself is basically a way
- * of keeping all integer types the same, but casting any pointers to
- * ptrdiff_t, i.e. also an integer type.  This way there are no
- * questionable casts seen by the compiler on an ILP32 platform.
 */
 #define __put_user(x, ptr)						\
 ({									\
-	int __pu_err = 0;						\
-	__typeof__(*(ptr)) __user *__pu_addr = (ptr);			\
-	typeof(*__pu_addr) __pu_val = (x);				\
-	__chk_user_ptr(__pu_addr);					\
-	switch (sizeof(__pu_val)) {					\
-	case 1:								\
-		__pu_err = __put_user_1((long)__pu_val, __pu_addr);	\
-		break;							\
-	case 2:								\
-		__pu_err = __put_user_2((long)__pu_val, __pu_addr);	\
-		break;							\
-	case 4:								\
-		__pu_err = __put_user_4((long)__pu_val, __pu_addr);	\
-		break;							\
-	case 8:								\
-		__pu_err =						\
-		  __put_user_8((__typeof__(__pu_val - __pu_val))__pu_val,\
-			__pu_addr);					\
-		break;							\
-	default:							\
-		__pu_err = __put_user_bad();				\
-		break;							\
+	int __ret;							\
+	__chk_user_ptr(ptr);						\
+	switch (sizeof(*(ptr))) {					\
+	case 1: __put_user_1(x, ptr, __ret); break;			\
+	case 2: __put_user_2(x, ptr, __ret); break;			\
+	case 4: __put_user_4(x, ptr, __ret); break;			\
+	case 8: __put_user_8(x, ptr, __ret); break;			\
+	default: __ret = __put_user_bad(); break;			\
 	}								\
-	__pu_err;							\
+	__ret;								\
 })

 /*
@@ -378,7 +420,7 @@ static inline unsigned long __must_check copy_from_user(void *to,
 /**
 * __copy_in_user() - copy data within user space, with less checking.
 * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
+ * @from: Source address, in user space.
 * @n:    Number of bytes to copy.
 *
 * Context: User context only.  This function may sleep.

--- a/arch/tile/include/asm/unistd.h
+++ b/arch/tile/include/asm/unistd.h
@@ -24,8 +24,8 @@
 #include <asm-generic/unistd.h>

 /* Additional Tilera-specific syscalls. */
-#define __NR_flush_cache	(__NR_arch_specific_syscall + 1)
-__SYSCALL(__NR_flush_cache, sys_flush_cache)
+#define __NR_cacheflush	(__NR_arch_specific_syscall + 1)
+__SYSCALL(__NR_cacheflush, sys_cacheflush)

 #ifndef __tilegx__
 /* "Fast" syscalls provide atomic support for 32-bit chips. */

--- a/arch/tile/include/hv/drv_xgbe_intf.h
+++ b/arch/tile/include/hv/drv_xgbe_intf.h
@@ -460,7 +460,7 @@ typedef void* lepp_comp_t;
 *  linux's "MAX_SKB_FRAGS", and presumably over-estimates by one, for
 *  our page size of exactly 65536.  We add one for a "body" fragment.
 */
-#define LEPP_MAX_FRAGS (65536 / HV_PAGE_SIZE_SMALL + 2 + 1)
+#define LEPP_MAX_FRAGS (65536 / HV_DEFAULT_PAGE_SIZE_SMALL + 2 + 1)

 /** Total number of bytes needed for an lepp_tso_cmd_t. */
 #define LEPP_TSO_CMD_SIZE(num_frags, header_size) \

--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -9,10 +9,9 @@ obj-y := backtrace.o entry.o irq.o messaging.o \
 	intvec_$(BITS).o regs_$(BITS).o tile-desc_$(BITS).o

 obj-$(CONFIG_HARDWALL)		+= hardwall.o
-obj-$(CONFIG_TILEGX)		+= futex_64.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_signal.o
 obj-$(CONFIG_SMP)		+= smpboot.o smp.o tlb.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel_$(BITS).o
 obj-$(CONFIG_PCI)		+= pci.o
--- a/arch/tile/kernel/entry.S
+++ b/arch/tile/kernel/entry.S
@@ -100,8 +100,9 @@ STD_ENTRY(smp_nap)
 */
 STD_ENTRY(_cpu_idle)
 	movei r1, 1
+	IRQ_ENABLE_LOAD(r2, r3)
 	mtspr INTERRUPT_CRITICAL_SECTION, r1
-	IRQ_ENABLE(r2, r3)             /* unmask, but still with ICS set */
+	IRQ_ENABLE_APPLY(r2, r3)       /* unmask, but still with ICS set */
 	mtspr INTERRUPT_CRITICAL_SECTION, zero
 	.global _cpu_idle_nap
 _cpu_idle_nap:

--- a/arch/tile/kernel/hardwall.c
+++ b/arch/tile/kernel/hardwall.c
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -69,7 +69,7 @@ ENTRY(_start)
 	}
 	{
 	  moveli lr, lo16(1f)
-	  move r5, zero
+	  moveli r5, CTX_PAGE_FLAG
 	}
 	{
 	  auli lr, lr, ha16(1f)
@@ -141,11 +141,11 @@ ENTRY(empty_zero_page)

 	.macro PTE va, cpa, bits1, no_org=0
 	.ifeq \no_org
-	.org swapper_pg_dir + HV_L1_INDEX(\va) * HV_PTE_SIZE
+	.org swapper_pg_dir + PGD_INDEX(\va) * HV_PTE_SIZE
 	.endif
 	.word HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED | \
 	      (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE)
-	.word (\bits1) | (HV_CPA_TO_PFN(\cpa) << (HV_PTE_INDEX_PFN - 32))
+	.word (\bits1) | (HV_CPA_TO_PTFN(\cpa) << (HV_PTE_INDEX_PTFN - 32))
 	.endm

 __PAGE_ALIGNED_DATA
@@ -166,7 +166,7 @@ ENTRY(swapper_pg_dir)
 	/* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
 	PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
 			      (1 << (HV_PTE_INDEX_EXECUTABLE - 32))
-	.org swapper_pg_dir + HV_L1_SIZE
+	.org swapper_pg_dir + PGDIR_SIZE
 	END(swapper_pg_dir)

 	/*

--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -114,7 +114,7 @@ ENTRY(_start)
 	  shl16insli r0, r0, hw0(swapper_pg_dir - PAGE_OFFSET)
 	}
 	{
-	  move r3, zero
+	  moveli r3, CTX_PAGE_FLAG
 	  j hv_install_context
 	}
 1:
@@ -210,19 +210,19 @@ ENTRY(empty_zero_page)
 	.macro PTE cpa, bits1
 	.quad HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED |\
 	      HV_PTE_GLOBAL | (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE) |\
-	      (\bits1) | (HV_CPA_TO_PFN(\cpa) << HV_PTE_INDEX_PFN)
+	      (\bits1) | (HV_CPA_TO_PTFN(\cpa) << HV_PTE_INDEX_PTFN)
 	.endm

 __PAGE_ALIGNED_DATA
 	.align PAGE_SIZE
 ENTRY(swapper_pg_dir)
-	.org swapper_pg_dir + HV_L0_INDEX(PAGE_OFFSET) * HV_PTE_SIZE
+	.org swapper_pg_dir + PGD_INDEX(PAGE_OFFSET) * HV_PTE_SIZE
 .Lsv_data_pmd:
 	.quad 0  /* PTE temp_data_pmd - PAGE_OFFSET, 0 */
-	.org swapper_pg_dir + HV_L0_INDEX(MEM_SV_START) * HV_PTE_SIZE
+	.org swapper_pg_dir + PGD_INDEX(MEM_SV_START) * HV_PTE_SIZE
 .Lsv_code_pmd:
 	.quad 0  /* PTE temp_code_pmd - PAGE_OFFSET, 0 */
-	.org swapper_pg_dir + HV_L0_SIZE
+	.org swapper_pg_dir + SIZEOF_PGD
 	END(swapper_pg_dir)

 	.align HV_PAGE_TABLE_ALIGN
@@ -233,11 +233,11 @@ ENTRY(temp_data_pmd)
 	 * permissions later.
 	 */
 	.set addr, 0
-	.rept HV_L1_ENTRIES
+	.rept PTRS_PER_PMD
 	PTE addr, HV_PTE_READABLE | HV_PTE_WRITABLE
-	.set addr, addr + HV_PAGE_SIZE_LARGE
+	.set addr, addr + HPAGE_SIZE
 	.endr
-	.org temp_data_pmd + HV_L1_SIZE
+	.org temp_data_pmd + SIZEOF_PMD
 	END(temp_data_pmd)

 	.align HV_PAGE_TABLE_ALIGN
@@ -248,11 +248,11 @@ ENTRY(temp_code_pmd)
 	 * permissions later.
 	 */
 	.set addr, 0
-	.rept HV_L1_ENTRIES
+	.rept PTRS_PER_PMD
 	PTE addr, HV_PTE_READABLE | HV_PTE_EXECUTABLE
-	.set addr, addr + HV_PAGE_SIZE_LARGE
+	.set addr, addr + HPAGE_SIZE
 	.endr
-	.org temp_code_pmd + HV_L1_SIZE
+	.org temp_code_pmd + SIZEOF_PMD
 	END(temp_code_pmd)

 	/*

--- a/arch/tile/kernel/hvglue.lds
+++ b/arch/tile/kernel/hvglue.lds
@@ -55,4 +55,5 @@ hv_store_mapping = TEXT_OFFSET + 0x106a0;
 hv_inquire_realpa = TEXT_OFFSET + 0x106c0;
 hv_flush_all = TEXT_OFFSET + 0x106e0;
 hv_get_ipi_pte = TEXT_OFFSET + 0x10700;
-hv_glue_internals = TEXT_OFFSET + 0x10720;
+hv_set_pte_super_shift = TEXT_OFFSET + 0x10720;
+hv_glue_internals = TEXT_OFFSET + 0x10740;
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -220,7 +220,9 @@ intvec_\vecname:
 	 * This routine saves just the first four registers, plus the
 	 * stack context so we can do proper backtracing right away,
 	 * and defers to handle_interrupt to save the rest.
-	 * The backtracer needs pc, ex1, lr, sp, r52, and faultnum.
+	 * The backtracer needs pc, ex1, lr, sp, r52, and faultnum,
+	 * and needs sp set to its final location at the bottom of
+	 * the stack frame.
 	 */
 	addli   r0, r0, PTREGS_OFFSET_LR - (PTREGS_SIZE + KSTK_PTREGS_GAP)
 	wh64    r0   /* cache line 7 */
@@ -450,23 +452,6 @@ intvec_\vecname:
 	push_reg r5, r52
 	st      r52, r4

-	/* Load tp with our per-cpu offset. */
-#ifdef CONFIG_SMP
-	{
-	 mfspr  r20, SPR_SYSTEM_SAVE_K_0
-	 moveli r21, hw2_last(__per_cpu_offset)
-	}
-	{
-	 shl16insli r21, r21, hw1(__per_cpu_offset)
-	 bfextu r20, r20, 0, LOG2_THREAD_SIZE-1
-	}
-	shl16insli r21, r21, hw0(__per_cpu_offset)
-	shl3add r20, r20, r21
-	ld      tp, r20
-#else
-	move    tp, zero
-#endif
-
 	/*
 	 * If we will be returning to the kernel, we will need to
 	 * reset the interrupt masks to the state they had before.
@@ -489,6 +474,44 @@ intvec_\vecname:
 	.endif
 	st      r21, r32

+	/*
+	 * we've captured enough state to the stack (including in
+	 * particular our EX_CONTEXT state) that we can now release
+	 * the interrupt critical section and replace it with our
+	 * standard "interrupts disabled" mask value.  This allows
+	 * synchronous interrupts (and profile interrupts) to punch
+	 * through from this point onwards.
+	 *
+	 * It's important that no code before this point touch memory
+	 * other than our own stack (to keep the invariant that this
+	 * is all that gets touched under ICS), and that no code after
+	 * this point reference any interrupt-specific SPR, in particular
+	 * the EX_CONTEXT_K_ values.
+	 */
+	.ifc \function,handle_nmi
+	IRQ_DISABLE_ALL(r20)
+	.else
+	IRQ_DISABLE(r20, r21)
+	.endif
+	mtspr   INTERRUPT_CRITICAL_SECTION, zero
+
+	/* Load tp with our per-cpu offset. */
+#ifdef CONFIG_SMP
+	{
+	 mfspr  r20, SPR_SYSTEM_SAVE_K_0
+	 moveli r21, hw2_last(__per_cpu_offset)
+	}
+	{
+	 shl16insli r21, r21, hw1(__per_cpu_offset)
+	 bfextu r20, r20, 0, LOG2_THREAD_SIZE-1
+	}
+	shl16insli r21, r21, hw0(__per_cpu_offset)
+	shl3add r20, r20, r21
+	ld      tp, r20
+#else
+	move    tp, zero
+#endif
+
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	/*
 	 * Notify the feedback routines that we were in the
@@ -512,21 +535,6 @@ intvec_\vecname:
 	FEEDBACK_ENTER(\function)
 #endif

-	/*
-	 * we've captured enough state to the stack (including in
-	 * particular our EX_CONTEXT state) that we can now release
-	 * the interrupt critical section and replace it with our
-	 * standard "interrupts disabled" mask value.  This allows
-	 * synchronous interrupts (and profile interrupts) to punch
-	 * through from this point onwards.
-	 */
-	.ifc \function,handle_nmi
-	IRQ_DISABLE_ALL(r20)
-	.else
-	IRQ_DISABLE(r20, r21)
-	.endif
-	mtspr   INTERRUPT_CRITICAL_SECTION, zero
-
 	/*
 	 * Prepare the first 256 stack bytes to be rapidly accessible
 	 * without having to fetch the background data.
@@ -736,9 +744,10 @@ STD_ENTRY(interrupt_return)
 	beqzt   r30, .Lrestore_regs
 	j       3f
 2:	TRACE_IRQS_ON
+	IRQ_ENABLE_LOAD(r20, r21)
 	movei   r0, 1
 	mtspr   INTERRUPT_CRITICAL_SECTION, r0
-	IRQ_ENABLE(r20, r21)
+	IRQ_ENABLE_APPLY(r20, r21)
 	beqzt   r30, .Lrestore_regs
 3:

@@ -755,7 +764,6 @@ STD_ENTRY(interrupt_return)
 	 * that will save some cycles if this turns out to be a syscall.
 	 */
 .Lrestore_regs:
-	FEEDBACK_REENTER(interrupt_return)   /* called from elsewhere */

 	/*
 	 * Rotate so we have one high bit and one low bit to test.
@@ -1249,7 +1257,7 @@ STD_ENTRY(fill_ra_stack)
 	int_hand     INT_UNALIGN_DATA, UNALIGN_DATA, int_unalign
 	int_hand     INT_DTLB_MISS, DTLB_MISS, do_page_fault
 	int_hand     INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
-	int_hand     INT_IDN_FIREWALL, IDN_FIREWALL, bad_intr
+	int_hand     INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
 	int_hand     INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap
 	int_hand     INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt
 	int_hand     INT_IDN_TIMER, IDN_TIMER, bad_intr

--- a/arch/tile/kernel/machine_kexec.c
+++ b/arch/tile/kernel/machine_kexec.c
@@ -31,6 +31,8 @@
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 #include <asm/checksum.h>
+#include <asm/tlbflush.h>
+#include <asm/homecache.h>
 #include <hv/hypervisor.h>


@@ -222,11 +224,22 @@ struct page *kimage_alloc_pages_arch(gfp_t gfp_mask, unsigned int order)
 	return alloc_pages_node(0, gfp_mask, order);
 }

+/*
+ * Address range in which pa=va mapping is set in setup_quasi_va_is_pa().
+ * For tilepro, PAGE_OFFSET is used since this is the largest possbile value
+ * for tilepro, while for tilegx, we limit it to entire middle level page
+ * table which we assume has been allocated and is undoubtedly large enough.
+ */
+#ifndef __tilegx__
+#define	QUASI_VA_IS_PA_ADDR_RANGE PAGE_OFFSET
+#else
+#define	QUASI_VA_IS_PA_ADDR_RANGE PGDIR_SIZE
+#endif
+
 static void setup_quasi_va_is_pa(void)
 {
-	HV_PTE *pgtable;
 	HV_PTE pte;
-	int i;
+	unsigned long i;

 	/*
 	 * Flush our TLB to prevent conflicts between the previous contents
@@ -234,16 +247,22 @@ static void setup_quasi_va_is_pa(void)
 	 */
 	local_flush_tlb_all();

-	/* setup VA is PA, at least up to PAGE_OFFSET */
-
-	pgtable = (HV_PTE *)current->mm->pgd;
+	/*
+	 * setup VA is PA, at least up to QUASI_VA_IS_PA_ADDR_RANGE.
+	 * Note here we assume that level-1 page table is defined by
+	 * HPAGE_SIZE.
+	 */
 	pte = hv_pte(_PAGE_KERNEL | _PAGE_HUGE_PAGE);
 	pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
-
-	for (i = 0; i < pgd_index(PAGE_OFFSET); i++) {
+	for (i = 0; i < (QUASI_VA_IS_PA_ADDR_RANGE >> HPAGE_SHIFT); i++) {
+		unsigned long vaddr = i << HPAGE_SHIFT;
+		pgd_t *pgd = pgd_offset(current->mm, vaddr);
+		pud_t *pud = pud_offset(pgd, vaddr);
+		pte_t *ptep = (pte_t *) pmd_offset(pud, vaddr);
 		unsigned long pfn = i << (HPAGE_SHIFT - PAGE_SHIFT);
+
 		if (pfn_valid(pfn))
-			__set_pte(&pgtable[i], pfn_pte(pfn, pte));
+			__set_pte(ptep, pfn_pte(pfn, pte));
 	}
 }

@@ -251,6 +270,7 @@ static void setup_quasi_va_is_pa(void)
 void machine_kexec(struct kimage *image)
 {
 	void *reboot_code_buffer;
+	pte_t *ptep;
 	void (*rnk)(unsigned long, void *, unsigned long)
 		__noreturn;

@@ -266,8 +286,10 @@ void machine_kexec(struct kimage *image)
 	 */
 	homecache_change_page_home(image->control_code_page, 0,
 				   smp_processor_id());
-	reboot_code_buffer = vmap(&image->control_code_page, 1, 0,
-				  __pgprot(_PAGE_KERNEL | _PAGE_EXECUTABLE));
+	reboot_code_buffer = page_address(image->control_code_page);
+	BUG_ON(reboot_code_buffer == NULL);
+	ptep = virt_to_pte(NULL, (unsigned long)reboot_code_buffer);
+	__set_pte(ptep, pte_mkexec(*ptep));
 	memcpy(reboot_code_buffer, relocate_new_kernel,
 	       relocate_new_kernel_size);
 	__flush_icache_range(

--- a/arch/tile/kernel/module.c
+++ b/arch/tile/kernel/module.c
@@ -159,7 +159,17 @@ int apply_relocate_add(Elf_Shdr *sechdrs,

 		switch (ELF_R_TYPE(rel[i].r_info)) {

-#define MUNGE(func) (*location = ((*location & ~func(-1)) | func(value)))
+#ifdef __LITTLE_ENDIAN
+# define MUNGE(func) \
+	(*location = ((*location & ~func(-1)) | func(value)))
+#else
+/*
+ * Instructions are always little-endian, so when we read them as data,
+ * we have to swap them around before and after modifying them.
+ */
+# define MUNGE(func) \
+	(*location = swab64((swab64(*location) & ~func(-1)) | func(value)))
+#endif

 #ifndef __tilegx__
 		case R_TILE_32:

--- a/arch/tile/kernel/proc.c
+++ b/arch/tile/kernel/proc.c
@@ -22,6 +22,7 @@
 #include <linux/proc_fs.h>
 #include <linux/sysctl.h>
 #include <linux/hardirq.h>
+#include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <asm/unaligned.h>
 #include <asm/pgtable.h>

--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -128,10 +128,10 @@ void arch_release_thread_info(struct thread_info *info)
 	 * Calling deactivate here just frees up the data structures.
 	 * If the task we're freeing held the last reference to a
 	 * hardwall fd, it would have been released prior to this point
-	 * anyway via exit_files(), and "hardwall" would be NULL by now.
+	 * anyway via exit_files(), and the hardwall_task.info pointers
+	 * would be NULL by now.
 	 */
-	if (info->task->thread.hardwall)
-		hardwall_deactivate(info->task);
+	hardwall_deactivate_all(info->task);
 #endif

 	if (step_state) {
@@ -245,7 +245,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,

 #ifdef CONFIG_HARDWALL
 	/* New thread does not own any networks. */
-	p->thread.hardwall = NULL;
+	memset(&p->thread.hardwall[0], 0,
+	       sizeof(struct hardwall_task) * HARDWALL_TYPES);
 #endif


@@ -515,12 +516,7 @@ struct task_struct *__sched _switch_to(struct task_struct *prev,

 #ifdef CONFIG_HARDWALL
 	/* Enable or disable access to the network registers appropriately. */
-	if (prev->thread.hardwall != NULL) {
-		if (next->thread.hardwall == NULL)
-			restrict_network_mpls();
-	} else if (next->thread.hardwall != NULL) {
-		grant_network_mpls();
-	}
+	hardwall_switch_tasks(prev, next);
 #endif

 	/*

--- a/arch/tile/kernel/relocate_kernel.S
+++ b/arch/tile/kernel/relocate_kernel.S
--- a/arch/tile/kernel/relocate_kernel_64.S
+++ b/arch/tile/kernel/relocate_kernel_64.S
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * copy new kernel into place and then call hv_reexec
+ *
+ */
+
+#include <linux/linkage.h>
+#include <arch/chip.h>
+#include <asm/page.h>
+#include <hv/hypervisor.h>
+
+#undef RELOCATE_NEW_KERNEL_VERBOSE
+
+STD_ENTRY(relocate_new_kernel)
+
+	move	r30, r0		/* page list */
+	move	r31, r1		/* address of page we are on */
+	move	r32, r2		/* start address of new kernel */
+
+	shrui	r1, r1, PAGE_SHIFT
+	addi	r1, r1, 1
+	shli	sp, r1, PAGE_SHIFT
+	addi	sp, sp, -8
+	/* we now have a stack (whether we need one or not) */
+
+	moveli	r40, hw2_last(hv_console_putc)
+	shl16insli r40, r40, hw1(hv_console_putc)
+	shl16insli r40, r40, hw0(hv_console_putc)
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'r'
+	jalr	r40
+
+	moveli	r0, '_'
+	jalr	r40
+
+	moveli	r0, 'n'
+	jalr	r40
+
+	moveli	r0, '_'
+	jalr	r40
+
+	moveli	r0, 'k'
+	jalr	r40
+
+	moveli	r0, '\n'
+	jalr	r40
+#endif
+
+	/*
+	 * Throughout this code r30 is pointer to the element of page
+	 * list we are working on.
+	 *
+	 * Normally we get to the next element of the page list by
+	 * incrementing r30 by eight.  The exception is if the element
+	 * on the page list is an IND_INDIRECTION in which case we use
+	 * the element with the low bits masked off as the new value
+	 * of r30.
+	 *
+	 * To get this started, we need the value passed to us (which
+	 * will always be an IND_INDIRECTION) in memory somewhere with
+	 * r30 pointing at it.  To do that, we push the value passed
+	 * to us on the stack and make r30 point to it.
+	 */
+
+	st	sp, r30
+	move	r30, sp
+	addi	sp, sp, -16
+
+#if CHIP_HAS_CBOX_HOME_MAP()
+	/*
+	 * On TILE-GX, we need to flush all tiles' caches, since we may
+	 * have been doing hash-for-home caching there.  Note that we
+	 * must do this _after_ we're completely done modifying any memory
+	 * other than our output buffer (which we know is locally cached).
+	 * We want the caches to be fully clean when we do the reexec,
+	 * because the hypervisor is going to do this flush again at that
+	 * point, and we don't want that second flush to overwrite any memory.
+	 */
+	{
+	 move	r0, zero	 /* cache_pa */
+	 moveli	r1, hw2_last(HV_FLUSH_EVICT_L2)
+	}
+	{
+	 shl16insli	r1, r1, hw1(HV_FLUSH_EVICT_L2)
+	 movei	r2, -1		 /* cache_cpumask; -1 means all client tiles */
+	}
+	{
+	 shl16insli	r1, r1, hw0(HV_FLUSH_EVICT_L2)  /* cache_control */
+	 move	r3, zero	 /* tlb_va */
+	}
+	{
+	 move	r4, zero	 /* tlb_length */
+	 move	r5, zero	 /* tlb_pgsize */
+	}
+	{
+	 move	r6, zero	 /* tlb_cpumask */
+	 move	r7, zero	 /* asids */
+	}
+	{
+	 moveli	r20, hw2_last(hv_flush_remote)
+	 move	r8, zero	 /* asidcount */
+	}
+	shl16insli	r20, r20, hw1(hv_flush_remote)
+	shl16insli	r20, r20, hw0(hv_flush_remote)
+
+	jalr	r20
+#endif
+
+	/* r33 is destination pointer, default to zero */
+
+	moveli	r33, 0
+
+.Lloop:	ld	r10, r30
+
+	andi	r9, r10, 0xf	/* low 4 bits tell us what type it is */
+	xor	r10, r10, r9	/* r10 is now value with low 4 bits stripped */
+
+	cmpeqi	r0, r9, 0x1	/* IND_DESTINATION */
+	beqzt	r0, .Ltry2
+
+	move	r33, r10
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'd'
+	jalr	r40
+#endif
+
+	addi	r30, r30, 8
+	j	.Lloop
+
+.Ltry2:
+	cmpeqi	r0, r9, 0x2	/* IND_INDIRECTION */
+	beqzt	r0, .Ltry4
+
+	move	r30, r10
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'i'
+	jalr	r40
+#endif
+
+	j	.Lloop
+
+.Ltry4:
+	cmpeqi	r0, r9, 0x4	/* IND_DONE */
+	beqzt	r0, .Ltry8
+
+	mf
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'D'
+	jalr	r40
+	moveli	r0, '\n'
+	jalr	r40
+#endif
+
+	move	r0, r32
+
+	moveli	r41, hw2_last(hv_reexec)
+	shl16insli	r41, r41, hw1(hv_reexec)
+	shl16insli	r41, r41, hw0(hv_reexec)
+
+	jalr	r41
+
+	/* we should not get here */
+
+	moveli	r0, '?'
+	jalr	r40
+	moveli	r0, '\n'
+	jalr	r40
+
+	j	.Lhalt
+
+.Ltry8:	cmpeqi	r0, r9, 0x8	/* IND_SOURCE */
+	beqz	r0, .Lerr	/* unknown type */
+
+	/* copy page at r10 to page at r33 */
+
+	move	r11, r33
+
+	moveli	r0, hw2_last(PAGE_SIZE)
+	shl16insli	r0, r0, hw1(PAGE_SIZE)
+	shl16insli	r0, r0, hw0(PAGE_SIZE)
+	add	r33, r33, r0
+
+	/* copy word at r10 to word at r11 until r11 equals r33 */
+
+	/* We know page size must be multiple of 8, so we can unroll
+	 * 8 times safely without any edge case checking.
+	 *
+	 * Issue a flush of the destination every 8 words to avoid
+	 * incoherence when starting the new kernel.  (Now this is
+	 * just good paranoia because the hv_reexec call will also
+	 * take care of this.)
+	 */
+
+1:
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0 }
+	{ flush r11    ; addi	r11, r11, 8 }
+
+	cmpeq	r0, r33, r11
+	beqzt	r0, 1b
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 's'
+	jalr	r40
+#endif
+
+	addi	r30, r30, 8
+	j	.Lloop
+
+
+.Lerr:	moveli	r0, 'e'
+	jalr	r40
+	moveli	r0, 'r'
+	jalr	r40
+	moveli	r0, 'r'
+	jalr	r40
+	moveli	r0, '\n'
+	jalr	r40
+.Lhalt:
+	moveli r41, hw2_last(hv_halt)
+	shl16insli r41, r41, hw1(hv_halt)
+	shl16insli r41, r41, hw0(hv_halt)
+
+	jalr	r41
+	STD_ENDPROC(relocate_new_kernel)
+
+	.section .rodata,"a"
+
+	.globl relocate_new_kernel_size
+relocate_new_kernel_size:
+	.long .Lend_relocate_new_kernel - relocate_new_kernel
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -28,6 +28,7 @@
 #include <linux/highmem.h>
 #include <linux/smp.h>
 #include <linux/timex.h>
+#include <linux/hugetlb.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
@@ -49,9 +50,6 @@ char chip_model[64] __write_once;
 struct pglist_data node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);

-/* We only create bootmem data on node 0. */
-static bootmem_data_t __initdata node0_bdata;
-
 /* Information on the NUMA nodes that we compute early */
 unsigned long __cpuinitdata node_start_pfn[MAX_NUMNODES];
 unsigned long __cpuinitdata node_end_pfn[MAX_NUMNODES];
@@ -534,37 +532,96 @@ static void __init setup_memory(void)
 #endif
 }

-static void __init setup_bootmem_allocator(void)
+/*
+ * On 32-bit machines, we only put bootmem on the low controller,
+ * since PAs > 4GB can't be used in bootmem.  In principle one could
+ * imagine, e.g., multiple 1 GB controllers all of which could support
+ * bootmem, but in practice using controllers this small isn't a
+ * particularly interesting scenario, so we just keep it simple and
+ * use only the first controller for bootmem on 32-bit machines.
+ */
+static inline int node_has_bootmem(int nid)
 {
-	unsigned long bootmap_size, first_alloc_pfn, last_alloc_pfn;
+#ifdef CONFIG_64BIT
+	return 1;
+#else
+	return nid == 0;
+#endif
+}

-	/* Provide a node 0 bdata. */
-	NODE_DATA(0)->bdata = &node0_bdata;
+static inline unsigned long alloc_bootmem_pfn(int nid,
+					      unsigned long size,
+					      unsigned long goal)
+{
+	void *kva = __alloc_bootmem_node(NODE_DATA(nid), size,
+					 PAGE_SIZE, goal);
+	unsigned long pfn = kaddr_to_pfn(kva);
+	BUG_ON(goal && PFN_PHYS(pfn) != goal);
+	return pfn;
+}

-#ifdef CONFIG_PCI
-	/* Don't let boot memory alias the PCI region. */
-	last_alloc_pfn = min(max_low_pfn, pci_reserve_start_pfn);
+static void __init setup_bootmem_allocator_node(int i)
+{
+	unsigned long start, end, mapsize, mapstart;
+
+	if (node_has_bootmem(i)) {
+		NODE_DATA(i)->bdata = &bootmem_node_data[i];
+	} else {
+		/* Share controller zero's bdata for now. */
+		NODE_DATA(i)->bdata = &bootmem_node_data[0];
+		return;
+	}
+
+	/* Skip up to after the bss in node 0. */
+	start = (i == 0) ? min_low_pfn : node_start_pfn[i];
+
+	/* Only lowmem, if we're a HIGHMEM build. */
+#ifdef CONFIG_HIGHMEM
+	end = node_lowmem_end_pfn[i];
 #else
-	last_alloc_pfn = max_low_pfn;
+	end = node_end_pfn[i];
 #endif

-	/*
-	 * Initialize the boot-time allocator (with low memory only):
-	 * The first argument says where to put the bitmap, and the
-	 * second says where the end of allocatable memory is.
-	 */
-	bootmap_size = init_bootmem(min_low_pfn, last_alloc_pfn);
+	/* No memory here. */
+	if (end == start)
+		return;
+
+	/* Figure out where the bootmem bitmap is located. */
+	mapsize = bootmem_bootmap_pages(end - start);
+	if (i == 0) {
+		/* Use some space right before the heap on node 0. */
+		mapstart = start;
+		start += mapsize;
+	} else {
+		/* Allocate bitmap on node 0 to avoid page table issues. */
+		mapstart = alloc_bootmem_pfn(0, PFN_PHYS(mapsize), 0);
+	}

+	/* Initialize a node. */
+	init_bootmem_node(NODE_DATA(i), mapstart, start, end);
+
+	/* Free all the space back into the allocator. */
+	free_bootmem(PFN_PHYS(start), PFN_PHYS(end - start));
+
+#if defined(CONFIG_PCI)
 	/*
-	 * Let the bootmem allocator use all the space we've given it
-	 * except for its own bitmap.
+	 * Throw away any memory aliased by the PCI region.  FIXME: this
+	 * is a temporary hack to work around bug 10502, and needs to be
+	 * fixed properly.
 	 */
-	first_alloc_pfn = min_low_pfn + PFN_UP(bootmap_size);
-	if (first_alloc_pfn >= last_alloc_pfn)
-		early_panic("Not enough memory on controller 0 for bootmem\n");
+	if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start)
+		reserve_bootmem(PFN_PHYS(pci_reserve_start_pfn),
+				PFN_PHYS(pci_reserve_end_pfn -
+					 pci_reserve_start_pfn),
+				BOOTMEM_EXCLUSIVE);
+#endif
+}

-	free_bootmem(PFN_PHYS(first_alloc_pfn),
-		     PFN_PHYS(last_alloc_pfn - first_alloc_pfn));
+static void __init setup_bootmem_allocator(void)
+{
+	int i;
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		setup_bootmem_allocator_node(i);

 #ifdef CONFIG_KEXEC
 	if (crashk_res.start != crashk_res.end)
@@ -595,14 +652,6 @@ static int __init percpu_size(void)
 	return size;
 }

-static inline unsigned long alloc_bootmem_pfn(int size, unsigned long goal)
-{
-	void *kva = __alloc_bootmem(size, PAGE_SIZE, goal);
-	unsigned long pfn = kaddr_to_pfn(kva);
-	BUG_ON(goal && PFN_PHYS(pfn) != goal);
-	return pfn;
-}
-
 static void __init zone_sizes_init(void)
 {
 	unsigned long zones_size[MAX_NR_ZONES] = { 0 };
@@ -640,21 +689,22 @@ static void __init zone_sizes_init(void)
 		 * though, there'll be no lowmem, so we just alloc_bootmem
 		 * the memmap.  There will be no percpu memory either.
 		 */
-		if (__pfn_to_highbits(start) == 0) {
-			/* In low PAs, allocate via bootmem. */
+		if (i != 0 && cpu_isset(i, isolnodes)) {
+			node_memmap_pfn[i] =
+				alloc_bootmem_pfn(0, memmap_size, 0);
+			BUG_ON(node_percpu[i] != 0);
+		} else if (node_has_bootmem(start)) {
 			unsigned long goal = 0;
 			node_memmap_pfn[i] =
-				alloc_bootmem_pfn(memmap_size, goal);
+				alloc_bootmem_pfn(i, memmap_size, 0);
 			if (kdata_huge)
 				goal = PFN_PHYS(lowmem_end) - node_percpu[i];
 			if (node_percpu[i])
 				node_percpu_pfn[i] =
-				    alloc_bootmem_pfn(node_percpu[i], goal);
-		} else if (cpu_isset(i, isolnodes)) {
-			node_memmap_pfn[i] = alloc_bootmem_pfn(memmap_size, 0);
-			BUG_ON(node_percpu[i] != 0);
+					alloc_bootmem_pfn(i, node_percpu[i],
+							  goal);
 		} else {
-			/* In high PAs, just reserve some pages. */
+			/* In non-bootmem zones, just reserve some pages. */
 			node_memmap_pfn[i] = node_free_pfn[i];
 			node_free_pfn[i] += PFN_UP(memmap_size);
 			if (!kdata_huge) {
@@ -678,16 +728,9 @@ static void __init zone_sizes_init(void)
 		zones_size[ZONE_NORMAL] = end - start;
 #endif

-		/*
-		 * Everyone shares node 0's bootmem allocator, but
-		 * we use alloc_remap(), above, to put the actual
-		 * struct page array on the individual controllers,
-		 * which is most of the data that we actually care about.
-		 * We can't place bootmem allocators on the other
-		 * controllers since the bootmem allocator can only
-		 * operate on 32-bit physical addresses.
-		 */
-		NODE_DATA(i)->bdata = NODE_DATA(0)->bdata;
+		/* Take zone metadata from controller 0 if we're isolnode. */
+		if (node_isset(i, isolnodes))
+			NODE_DATA(i)->bdata = &bootmem_node_data[0];

 		free_area_init_node(i, zones_size, start, NULL);
 		printk(KERN_DEBUG "  Normal zone: %ld per-cpu pages\n",
@@ -870,6 +913,22 @@ subsys_initcall(topology_init);

 #endif /* CONFIG_NUMA */

+/*
+ * Initialize hugepage support on this cpu.  We do this on all cores
+ * early in boot: before argument parsing for the boot cpu, and after
+ * argument parsing but before the init functions run on the secondaries.
+ * So the values we set up here in the hypervisor may be overridden on
+ * the boot cpu as arguments are parsed.
+ */
+static __cpuinit void init_super_pages(void)
+{
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	int i;
+	for (i = 0; i < HUGE_SHIFT_ENTRIES; ++i)
+		hv_set_pte_super_shift(i, huge_shift[i]);
+#endif
+}
+
 /**
 * setup_cpu() - Do all necessary per-cpu, tile-specific initialization.
 * @boot: Is this the boot cpu?
@@ -924,6 +983,8 @@ void __cpuinit setup_cpu(int boot)
 	/* Reset the network state on this cpu. */
 	reset_network_state();
 #endif
+
+	init_super_pages();
 }

 #ifdef CONFIG_BLK_DEV_INITRD
@@ -1412,13 +1473,13 @@ void __init setup_per_cpu_areas(void)
 		for (i = 0; i < size; i += PAGE_SIZE, ++pfn, ++pg) {

 			/* Update the vmalloc mapping and page home. */
-			pte_t *ptep =
-				virt_to_pte(NULL, (unsigned long)ptr + i);
+			unsigned long addr = (unsigned long)ptr + i;
+			pte_t *ptep = virt_to_pte(NULL, addr);
 			pte_t pte = *ptep;
 			BUG_ON(pfn != pte_pfn(pte));
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
 			pte = set_remote_cache_cpu(pte, cpu);
-			set_pte(ptep, pte);
+			set_pte_at(&init_mm, addr, ptep, pte);

 			/* Update the lowmem mapping for consistency. */
 			lowmem_va = (unsigned long)pfn_to_kaddr(pfn);
@@ -1431,7 +1492,7 @@ void __init setup_per_cpu_areas(void)
 				BUG_ON(pte_huge(*ptep));
 			}
 			BUG_ON(pfn != pte_pfn(*ptep));
-			set_pte(ptep, pte);
+			set_pte_at(&init_mm, lowmem_va, ptep, pte);
 		}
 	}


--- a/arch/tile/kernel/single_step.c
+++ b/arch/tile/kernel/single_step.c
@@ -172,9 +172,6 @@ static tile_bundle_bits rewrite_load_store_unaligned(
 		return (tilepro_bundle_bits) 0;
 	}

-#ifndef __LITTLE_ENDIAN
-# error We assume little-endian representation with copy_xx_user size 2 here
-#endif
 	/* Handle unaligned load/store */
 	if (mem_op == MEMOP_LOAD || mem_op == MEMOP_LOAD_POSTINCR) {
 		unsigned short val_16;
@@ -195,8 +192,19 @@ static tile_bundle_bits rewrite_load_store_unaligned(
 			state->update = 1;
 		}
 	} else {
+		unsigned short val_16;
 		val = (val_reg == TREG_ZERO) ? 0 : regs->regs[val_reg];
-		err = copy_to_user(addr, &val, size);
+		switch (size) {
+		case 2:
+			val_16 = val;
+			err = copy_to_user(addr, &val_16, sizeof(val_16));
+			break;
+		case 4:
+			err = copy_to_user(addr, &val, sizeof(val));
+			break;
+		default:
+			BUG();
+		}
 	}

 	if (err) {

--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -203,7 +203,7 @@ void __init ipi_init(void)
 		if (hv_get_ipi_pte(tile, KERNEL_PL, &pte) != 0)
 			panic("Failed to initialize IPI for cpu %d\n", cpu);

-		offset = hv_pte_get_pfn(pte) << PAGE_SHIFT;
+		offset = PFN_PHYS(pte_pfn(pte));
 		ipi_mappings[cpu] = ioremap_prot(offset, PAGE_SIZE, pte);
 	}
 #endif

--- a/arch/tile/kernel/sys.c
+++ b/arch/tile/kernel/sys.c
@@ -32,11 +32,17 @@
 #include <asm/syscalls.h>
 #include <asm/pgtable.h>
 #include <asm/homecache.h>
+#include <asm/cachectl.h>
 #include <arch/chip.h>

-SYSCALL_DEFINE0(flush_cache)
+SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, len,
+		unsigned long, flags)
 {
-	homecache_evict(cpumask_of(smp_processor_id()));
+	if (flags & DCACHE)
+		homecache_evict(cpumask_of(smp_processor_id()));
+	if (flags & ICACHE)
+		flush_remote(0, HV_FLUSH_EVICT_L1I, mm_cpumask(current->mm),
+			     0, 0, 0, NULL, NULL, 0);
 	return 0;
 }


--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -93,6 +93,10 @@ HV_CONF_ATTR(mezz_part,		HV_CONFSTR_MEZZ_PART_NUM)
 HV_CONF_ATTR(mezz_serial,	HV_CONFSTR_MEZZ_SERIAL_NUM)
 HV_CONF_ATTR(mezz_revision,	HV_CONFSTR_MEZZ_REV)
 HV_CONF_ATTR(mezz_description,	HV_CONFSTR_MEZZ_DESC)
+HV_CONF_ATTR(cpumod_part,	HV_CONFSTR_CPUMOD_PART_NUM)
+HV_CONF_ATTR(cpumod_serial,	HV_CONFSTR_CPUMOD_SERIAL_NUM)
+HV_CONF_ATTR(cpumod_revision,	HV_CONFSTR_CPUMOD_REV)
+HV_CONF_ATTR(cpumod_description,HV_CONFSTR_CPUMOD_DESC)
 HV_CONF_ATTR(switch_control,	HV_CONFSTR_SWITCH_CONTROL)

 static struct attribute *board_attrs[] = {
@@ -104,6 +108,10 @@ static struct attribute *board_attrs[] = {
 	&dev_attr_mezz_serial.attr,
 	&dev_attr_mezz_revision.attr,
 	&dev_attr_mezz_description.attr,
+	&dev_attr_cpumod_part.attr,
+	&dev_attr_cpumod_serial.attr,
+	&dev_attr_cpumod_revision.attr,
+	&dev_attr_cpumod_description.attr,
 	&dev_attr_switch_control.attr,
 	NULL
 };

--- a/arch/tile/kernel/tlb.c
+++ b/arch/tile/kernel/tlb.c
@@ -15,6 +15,7 @@

 #include <linux/cpumask.h>
 #include <linux/module.h>
+#include <linux/hugetlb.h>
 #include <asm/tlbflush.h>
 #include <asm/homecache.h>
 #include <hv/hypervisor.h>
@@ -49,25 +50,25 @@ void flush_tlb_current_task(void)
 	flush_tlb_mm(current->mm);
 }

-void flush_tlb_page_mm(const struct vm_area_struct *vma, struct mm_struct *mm,
+void flush_tlb_page_mm(struct vm_area_struct *vma, struct mm_struct *mm,
 		       unsigned long va)
 {
-	unsigned long size = hv_page_size(vma);
+	unsigned long size = vma_kernel_pagesize(vma);
 	int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0;
 	flush_remote(0, cache, mm_cpumask(mm),
 		     va, size, size, mm_cpumask(mm), NULL, 0);
 }

-void flush_tlb_page(const struct vm_area_struct *vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 {
 	flush_tlb_page_mm(vma, vma->vm_mm, va);
 }
 EXPORT_SYMBOL(flush_tlb_page);

-void flush_tlb_range(const struct vm_area_struct *vma,
+void flush_tlb_range(struct vm_area_struct *vma,
 		     unsigned long start, unsigned long end)
 {
-	unsigned long size = hv_page_size(vma);
+	unsigned long size = vma_kernel_pagesize(vma);
 	struct mm_struct *mm = vma->vm_mm;
 	int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0;
 	flush_remote(0, cache, mm_cpumask(mm), start, end - start, size,

--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -195,6 +195,25 @@ static int special_ill(bundle_bits bundle, int *sigp, int *codep)
 	return 1;
 }

+static const char *const int_name[] = {
+	[INT_MEM_ERROR] = "Memory error",
+	[INT_ILL] = "Illegal instruction",
+	[INT_GPV] = "General protection violation",
+	[INT_UDN_ACCESS] = "UDN access",
+	[INT_IDN_ACCESS] = "IDN access",
+#if CHIP_HAS_SN()
+	[INT_SN_ACCESS] = "SN access",
+#endif
+	[INT_SWINT_3] = "Software interrupt 3",
+	[INT_SWINT_2] = "Software interrupt 2",
+	[INT_SWINT_0] = "Software interrupt 0",
+	[INT_UNALIGN_DATA] = "Unaligned data",
+	[INT_DOUBLE_FAULT] = "Double fault",
+#ifdef __tilegx__
+	[INT_ILL_TRANS] = "Illegal virtual address",
+#endif
+};
+
 void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 		       unsigned long reason)
 {
@@ -211,10 +230,17 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 	 * current process and hope for the best.
 	 */
 	if (!user_mode(regs)) {
+		const char *name;
 		if (fixup_exception(regs))  /* only UNALIGN_DATA in practice */
 			return;
-		pr_alert("Kernel took bad trap %d at PC %#lx\n",
-		       fault_num, regs->pc);
+		if (fault_num >= 0 &&
+		    fault_num < sizeof(int_name)/sizeof(int_name[0]) &&
+		    int_name[fault_num] != NULL)
+			name = int_name[fault_num];
+		else
+			name = "Unknown interrupt";
+		pr_alert("Kernel took bad trap %d (%s) at PC %#lx\n",
+			 fault_num, name, regs->pc);
 		if (fault_num == INT_GPV)
 			pr_alert("GPV_REASON is %#lx\n", reason);
 		show_regs(regs);

--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -18,7 +18,6 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/atomic.h>
-#include <asm/futex.h>
 #include <arch/chip.h>

 /* See <asm/atomic_32.h> */
@@ -50,7 +49,7 @@ int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;

 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */

-static inline int *__atomic_hashed_lock(volatile void *v)
+int *__atomic_hashed_lock(volatile void *v)
 {
 	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
 #if ATOMIC_LOCKS_FOUND_VIA_TABLE()
@@ -191,47 +190,6 @@ u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
 EXPORT_SYMBOL(_atomic64_cmpxchg);


-static inline int *__futex_setup(int __user *v)
-{
-	/*
-	 * Issue a prefetch to the counter to bring it into cache.
-	 * As for __atomic_setup, but we can't do a read into the L1
-	 * since it might fault; instead we do a prefetch into the L2.
-	 */
-	__insn_prefetch(v);
-	return __atomic_hashed_lock((int __force *)v);
-}
-
-struct __get_user futex_set(u32 __user *v, int i)
-{
-	return __atomic_xchg((int __force *)v, __futex_setup(v), i);
-}
-
-struct __get_user futex_add(u32 __user *v, int n)
-{
-	return __atomic_xchg_add((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_or(u32 __user *v, int n)
-{
-	return __atomic_or((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_andn(u32 __user *v, int n)
-{
-	return __atomic_andn((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_xor(u32 __user *v, int n)
-{
-	return __atomic_xor((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_cmpxchg(u32 __user *v, int o, int n)
-{
-	return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n);
-}
-
 /*
 * If any of the atomic or futex routines hit a bad address (not in
 * the page tables at kernel PL) this routine is called.  The futex
@@ -323,7 +281,4 @@ void __init __init_atomic_per_cpu(void)
 	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);

 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
-	/* The futex code makes this assumption, so we validate it here. */
-	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 }
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -18,14 +18,6 @@

 /* arch/tile/lib/usercopy.S */
 #include <linux/uaccess.h>
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-EXPORT_SYMBOL(__get_user_8);
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
 EXPORT_SYMBOL(strnlen_user_asm);
 EXPORT_SYMBOL(strncpy_from_user_asm);
 EXPORT_SYMBOL(clear_user_asm);

--- a/arch/tile/lib/memchr_64.c
+++ b/arch/tile/lib/memchr_64.c
@@ -15,6 +15,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
+#include "string-endian.h"

 void *memchr(const void *s, int c, size_t n)
 {
@@ -39,11 +40,8 @@ void *memchr(const void *s, int c, size_t n)

 	/* Read the first word, but munge it so that bytes before the array
 	 * will not match goal.
-	 *
-	 * Note that this shift count expression works because we know
-	 * shift counts are taken mod 64.
 	 */
-	before_mask = (1ULL << (s_int << 3)) - 1;
+	before_mask = MASK(s_int);
 	v = (*p | before_mask) ^ (goal & before_mask);

 	/* Compute the address of the last byte. */
@@ -65,7 +63,7 @@ void *memchr(const void *s, int c, size_t n)
 	/* We found a match, but it might be in a byte past the end
 	 * of the array.
 	 */
-	ret = ((char *)p) + (__insn_ctz(bits) >> 3);
+	ret = ((char *)p) + (CFZ(bits) >> 3);
 	return (ret <= last_byte_ptr) ? ret : NULL;
 }
 EXPORT_SYMBOL(memchr);
--- a/arch/tile/lib/memcpy_64.c
+++ b/arch/tile/lib/memcpy_64.c
@@ -15,7 +15,6 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-#define __memcpy memcpy
 /* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */

 /* Must be 8 bytes in size. */
@@ -188,6 +187,7 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)

 	/* n != 0 if we get here.  Write out any trailing bytes. */
 	dst1 = (char *)dst8;
+#ifndef __BIG_ENDIAN__
 	if (n & 4) {
 		ST4((uint32_t *)dst1, final);
 		dst1 += 4;
@@ -202,11 +202,30 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 	}
 	if (n)
 		ST1((uint8_t *)dst1, final);
+#else
+	if (n & 4) {
+		ST4((uint32_t *)dst1, final >> 32);
+		dst1 += 4;
+        }
+        else
+        {
+		final >>= 32;
+        }
+	if (n & 2) {
+		ST2((uint16_t *)dst1, final >> 16);
+		dst1 += 2;
+        }
+        else
+        {
+		final >>= 16;
+        }
+	if (n & 1)
+		ST1((uint8_t *)dst1, final >> 8);
+#endif

 	return RETVAL;
 }

-
 #ifdef USERCOPY_FUNC
 #undef ST1
 #undef ST2

--- a/arch/tile/lib/memcpy_tile64.c
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -160,7 +160,7 @@ static unsigned long fast_copy(void *dest, const void *source, int len,
 			break;
 		if (get_remote_cache_cpu(src_pte) == smp_processor_id())
 			break;
-		src_page = pfn_to_page(hv_pte_get_pfn(src_pte));
+		src_page = pfn_to_page(pte_pfn(src_pte));
 		get_page(src_page);
 		if (pte_val(src_pte) != pte_val(*src_ptep)) {
 			put_page(src_page);
@@ -168,7 +168,7 @@ static unsigned long fast_copy(void *dest, const void *source, int len,
 		}
 		if (pte_huge(src_pte)) {
 			/* Adjust the PTE to correspond to a small page */
-			int pfn = hv_pte_get_pfn(src_pte);
+			int pfn = pte_pfn(src_pte);
 			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
 				>> PAGE_SHIFT);
 			src_pte = pfn_pte(pfn, src_pte);
@@ -188,7 +188,7 @@ static unsigned long fast_copy(void *dest, const void *source, int len,
 			put_page(src_page);
 			break;
 		}
-		dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte));
+		dst_page = pfn_to_page(pte_pfn(dst_pte));
 		if (dst_page == src_page) {
 			/*
 			 * Source and dest are on the same page; this
@@ -206,7 +206,7 @@ static unsigned long fast_copy(void *dest, const void *source, int len,
 		}
 		if (pte_huge(dst_pte)) {
 			/* Adjust the PTE to correspond to a small page */
-			int pfn = hv_pte_get_pfn(dst_pte);
+			int pfn = pte_pfn(dst_pte);
 			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
 				>> PAGE_SHIFT);
 			dst_pte = pfn_pte(pfn, dst_pte);

--- a/arch/tile/lib/strchr_64.c
+++ b/arch/tile/lib/strchr_64.c
@@ -15,8 +15,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef strchr
+#include "string-endian.h"

 char *strchr(const char *s, int c)
 {
@@ -33,13 +32,9 @@ char *strchr(const char *s, int c)
 	 * match neither zero nor goal (we make sure the high bit of each
 	 * byte is 1, and the low 7 bits are all the opposite of the goal
 	 * byte).
-	 *
-	 * Note that this shift count expression works because we know shift
-	 * counts are taken mod 64.
 	 */
-	const uint64_t before_mask = (1ULL << (s_int << 3)) - 1;
-	uint64_t v = (*p | before_mask) ^
-		(goal & __insn_v1shrsi(before_mask, 1));
+	const uint64_t before_mask = MASK(s_int);
+	uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui(before_mask, 1));

 	uint64_t zero_matches, goal_matches;
 	while (1) {
@@ -55,8 +50,8 @@ char *strchr(const char *s, int c)
 		v = *++p;
 	}

-	z = __insn_ctz(zero_matches);
-	g = __insn_ctz(goal_matches);
+	z = CFZ(zero_matches);
+	g = CFZ(goal_matches);

 	/* If we found c before '\0' we got a match. Note that if c == '\0'
 	 * then g == z, and we correctly return the address of the '\0'

--- a/arch/tile/lib/string-endian.h
+++ b/arch/tile/lib/string-endian.h
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Provide a mask based on the pointer alignment that
+ * sets up non-zero bytes before the beginning of the string.
+ * The MASK expression works because shift counts are taken mod 64.
+ * Also, specify how to count "first" and "last" bits
+ * when the bits have been read as a word.
+ */
+
+#include <asm/byteorder.h>
+
+#ifdef __LITTLE_ENDIAN
+#define MASK(x) (__insn_shl(1ULL, (x << 3)) - 1)
+#define NULMASK(x) ((2ULL << x) - 1)
+#define CFZ(x) __insn_ctz(x)
+#define REVCZ(x) __insn_clz(x)
+#else
+#define MASK(x) (__insn_shl(-2LL, ((-x << 3) - 1)))
+#define NULMASK(x) (-2LL << (63 - x))
+#define CFZ(x) __insn_clz(x)
+#define REVCZ(x) __insn_ctz(x)
+#endif
--- a/arch/tile/lib/strlen_64.c
+++ b/arch/tile/lib/strlen_64.c
@@ -15,8 +15,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef strlen
+#include "string-endian.h"

 size_t strlen(const char *s)
 {
@@ -24,15 +23,13 @@ size_t strlen(const char *s)
 	const uintptr_t s_int = (uintptr_t) s;
 	const uint64_t *p = (const uint64_t *)(s_int & -8);

-	/* Read the first word, but force bytes before the string to be nonzero.
-	 * This expression works because we know shift counts are taken mod 64.
-	 */
-	uint64_t v = *p | ((1ULL << (s_int << 3)) - 1);
+	/* Read and MASK the first word. */
+	uint64_t v = *p | MASK(s_int);

 	uint64_t bits;
 	while ((bits = __insn_v1cmpeqi(v, 0)) == 0)
 		v = *++p;

-	return ((const char *)p) + (__insn_ctz(bits) >> 3) - s;
+	return ((const char *)p) + (CFZ(bits) >> 3) - s;
 }
 EXPORT_SYMBOL(strlen);
--- a/arch/tile/lib/usercopy_32.S
+++ b/arch/tile/lib/usercopy_32.S
@@ -19,82 +19,6 @@

 /* Access user memory, but use MMU to avoid propagating kernel exceptions. */

-	.pushsection .fixup,"ax"
-
-get_user_fault:
-	{ move r0, zero; move r1, zero }
-	{ movei r2, -EFAULT; jrp lr }
-	ENDPROC(get_user_fault)
-
-put_user_fault:
-	{ movei r0, -EFAULT; jrp lr }
-	ENDPROC(put_user_fault)
-
-	.popsection
-
-/*
- * __get_user_N functions take a pointer in r0, and return 0 in r2
- * on success, with the value in r0; or else -EFAULT in r2.
- */
-#define __get_user_N(bytes, LOAD) \
-	STD_ENTRY(__get_user_##bytes); \
-1:	{ LOAD r0, r0; move r1, zero; move r2, zero }; \
-	jrp lr; \
-	STD_ENDPROC(__get_user_##bytes); \
-	.pushsection __ex_table,"a"; \
-	.word 1b, get_user_fault; \
-	.popsection
-
-__get_user_N(1, lb_u)
-__get_user_N(2, lh_u)
-__get_user_N(4, lw)
-
-/*
- * __get_user_8 takes a pointer in r0, and returns 0 in r2
- * on success, with the value in r0/r1; or else -EFAULT in r2.
- */
-	STD_ENTRY(__get_user_8);
-1:	{ lw r0, r0; addi r1, r0, 4 };
-2:	{ lw r1, r1; move r2, zero };
-	jrp lr;
-	STD_ENDPROC(__get_user_8);
-	.pushsection __ex_table,"a";
-	.word 1b, get_user_fault;
-	.word 2b, get_user_fault;
-	.popsection
-
-/*
- * __put_user_N functions take a value in r0 and a pointer in r1,
- * and return 0 in r0 on success or -EFAULT on failure.
- */
-#define __put_user_N(bytes, STORE) \
-	STD_ENTRY(__put_user_##bytes); \
-1:	{ STORE r1, r0; move r0, zero }; \
-	jrp lr; \
-	STD_ENDPROC(__put_user_##bytes); \
-	.pushsection __ex_table,"a"; \
-	.word 1b, put_user_fault; \
-	.popsection
-
-__put_user_N(1, sb)
-__put_user_N(2, sh)
-__put_user_N(4, sw)
-
-/*
- * __put_user_8 takes a value in r0/r1 and a pointer in r2,
- * and returns 0 in r0 on success or -EFAULT on failure.
- */
-STD_ENTRY(__put_user_8)
-1:      { sw r2, r0; addi r2, r2, 4 }
-2:      { sw r2, r1; move r0, zero }
-	jrp lr
-	STD_ENDPROC(__put_user_8)
-	.pushsection __ex_table,"a"
-	.word 1b, put_user_fault
-	.word 2b, put_user_fault
-	.popsection
-
-
 /*
 * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
 * It returns the length, including the terminating NUL, or zero on exception.

--- a/arch/tile/lib/usercopy_64.S
+++ b/arch/tile/lib/usercopy_64.S
@@ -19,55 +19,6 @@

 /* Access user memory, but use MMU to avoid propagating kernel exceptions. */

-	.pushsection .fixup,"ax"
-
-get_user_fault:
-	{ movei r1, -EFAULT; move r0, zero }
-	jrp lr
-	ENDPROC(get_user_fault)
-
-put_user_fault:
-	{ movei r0, -EFAULT; jrp lr }
-	ENDPROC(put_user_fault)
-
-	.popsection
-
-/*
- * __get_user_N functions take a pointer in r0, and return 0 in r1
- * on success, with the value in r0; or else -EFAULT in r1.
- */
-#define __get_user_N(bytes, LOAD) \
-	STD_ENTRY(__get_user_##bytes); \
-1:	{ LOAD r0, r0; move r1, zero }; \
-	jrp lr; \
-	STD_ENDPROC(__get_user_##bytes); \
-	.pushsection __ex_table,"a"; \
-	.quad 1b, get_user_fault; \
-	.popsection
-
-__get_user_N(1, ld1u)
-__get_user_N(2, ld2u)
-__get_user_N(4, ld4u)
-__get_user_N(8, ld)
-
-/*
- * __put_user_N functions take a value in r0 and a pointer in r1,
- * and return 0 in r0 on success or -EFAULT on failure.
- */
-#define __put_user_N(bytes, STORE) \
-	STD_ENTRY(__put_user_##bytes); \
-1:	{ STORE r1, r0; move r0, zero }; \
-	jrp lr; \
-	STD_ENDPROC(__put_user_##bytes); \
-	.pushsection __ex_table,"a"; \
-	.quad 1b, put_user_fault; \
-	.popsection
-
-__put_user_N(1, st1)
-__put_user_N(2, st2)
-__put_user_N(4, st4)
-__put_user_N(8, st)
-
 /*
 * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
 * It returns the length, including the terminating NUL, or zero on exception.

--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -187,7 +187,7 @@ static pgd_t *get_current_pgd(void)
 	HV_Context ctx = hv_inquire_context();
 	unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;
 	struct page *pgd_page = pfn_to_page(pgd_pfn);
-	BUG_ON(PageHighMem(pgd_page));   /* oops, HIGHPTE? */
+	BUG_ON(PageHighMem(pgd_page));
 	return (pgd_t *) __va(ctx.page_table);
 }

@@ -273,11 +273,15 @@ static int handle_page_fault(struct pt_regs *regs,
 	int si_code;
 	int is_kernel_mode;
 	pgd_t *pgd;
+	unsigned int flags;

 	/* on TILE, protection faults are always writes */
 	if (!is_page_fault)
 		write = 1;

+	flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
+		 (write ? FAULT_FLAG_WRITE : 0));
+
 	is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);

 	tsk = validate_current();
@@ -382,6 +386,8 @@ static int handle_page_fault(struct pt_regs *regs,
 			vma = NULL;  /* happy compiler */
 			goto bad_area_nosemaphore;
 		}
+
+retry:
 		down_read(&mm->mmap_sem);
 	}

@@ -429,7 +435,11 @@ static int handle_page_fault(struct pt_regs *regs,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(mm, vma, address, write);
+	fault = handle_mm_fault(mm, vma, address, flags);
+
+	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
+		return 0;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
@@ -437,10 +447,22 @@ static int handle_page_fault(struct pt_regs *regs,
 			goto do_sigbus;
 		BUG();
 	}
-	if (fault & VM_FAULT_MAJOR)
-		tsk->maj_flt++;
-	else
-		tsk->min_flt++;
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR)
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
+		if (fault & VM_FAULT_RETRY) {
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+
+			 /*
+			  * No need to up_read(&mm->mmap_sem) as we would
+			  * have already released it in __lock_page_or_retry
+			  * in mm/filemap.c.
+			  */
+			goto retry;
+		}
+	}

 #if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
 	/*

--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -30,6 +30,7 @@
 #include <linux/cache.h>
 #include <linux/smp.h>
 #include <linux/module.h>
+#include <linux/hugetlb.h>

 #include <asm/page.h>
 #include <asm/sections.h>

--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
--- a/arch/tile/mm/migrate.h
+++ b/arch/tile/mm/migrate.h
@@ -24,6 +24,9 @@
 /*
 * This function is used as a helper when setting up the initial
 * page table (swapper_pg_dir).
+ *
+ * You must mask ALL interrupts prior to invoking this code, since
+ * you can't legally touch the stack during the cache flush.
 */
 extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,
 				     HV_ASID asid,
@@ -39,6 +42,9 @@ extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,
 *
 * Note that any non-NULL pointers must not point to the page that
 * is handled by the stack_pte itself.
+ *
+ * You must mask ALL interrupts prior to invoking this code, since
+ * you can't legally touch the stack during the cache flush.
 */
 extern int homecache_migrate_stack_and_flush(pte_t stack_pte, unsigned long va,
 				     size_t length, pte_t *stack_ptep,

--- a/arch/tile/mm/migrate_32.S
+++ b/arch/tile/mm/migrate_32.S
--- a/arch/tile/mm/migrate_64.S
+++ b/arch/tile/mm/migrate_64.S
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -158,9 +158,8 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 #endif

 #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern pmd_t pmdp_splitting_flush(struct vm_area_struct *vma,
-				  unsigned long address,
-				  pmd_t *pmdp);
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+				 unsigned long address, pmd_t *pmdp);
 #endif

 #ifndef __HAVE_ARCH_PTE_SAME

--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c