diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c
index c481673d249cb485dc1ceca3c02a073ebfc2aacc..ba43d85e8dded537cfeb4a23c510456e4f2c956f 100644
--- a/arch/sparc64/kernel/irq.c
+++ b/arch/sparc64/kernel/irq.c
@@ -915,12 +915,18 @@ static void __init sun4v_init_mondo_queues(void)
 		alloc_one_mondo(&tb->nonresum_mondo_pa, tb->nonresum_qmask);
 		alloc_one_kbuf(&tb->nonresum_kernel_buf_pa,
 			       tb->nonresum_qmask);
+	}
+}
+
+static void __init init_send_mondo_info(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct trap_per_cpu *tb = &trap_block[cpu];
 
 		init_cpu_send_mondo_info(tb);
 	}
-
-	/* Load up the boot cpu's entries.  */
-	sun4v_register_mondo_queues(hard_smp_processor_id());
 }
 
 static struct irqaction timer_irq_action = {
@@ -949,6 +955,13 @@ void __init init_IRQ(void)
 	if (tlb_type == hypervisor)
 		sun4v_init_mondo_queues();
 
+	init_send_mondo_info();
+
+	if (tlb_type == hypervisor) {
+		/* Load up the boot cpu's entries.  */
+		sun4v_register_mondo_queues(hard_smp_processor_id());
+	}
+
 	/* We need to clear any IRQ's pending in the soft interrupt
 	 * registers, a spurious one could be left around from the
 	 * PROM timer which we just disabled.
diff --git a/arch/sparc64/kernel/signal.c b/arch/sparc64/kernel/signal.c
index ca5a6ae3a6e2b37c77bcedb7bb18523cd95d615b..ec82d76dc6f2cc245ca54203dd38c35fee192ed2 100644
--- a/arch/sparc64/kernel/signal.c
+++ b/arch/sparc64/kernel/signal.c
@@ -23,7 +23,6 @@
 #include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/bitops.h>
-#include <linux/tracehook.h>
 
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 340842e51ce131d5c3797e89eb376335e92e3374..27b81775a4decd16c43f32df32c8347407d82bb4 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -459,27 +459,35 @@ static void spitfire_xcall_helper(u64 data0, u64 data1, u64 data2, u64 pstate, u
 	}
 }
 
-static inline void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
+static void spitfire_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 {
+	u64 *mondo, data0, data1, data2;
+	u16 *cpu_list;
 	u64 pstate;
 	int i;
 
 	__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
-	for_each_cpu_mask(i, mask)
-		spitfire_xcall_helper(data0, data1, data2, pstate, i);
+	cpu_list = __va(tb->cpu_list_pa);
+	mondo = __va(tb->cpu_mondo_block_pa);
+	data0 = mondo[0];
+	data1 = mondo[1];
+	data2 = mondo[2];
+	for (i = 0; i < cnt; i++)
+		spitfire_xcall_helper(data0, data1, data2, pstate, cpu_list[i]);
 }
 
 /* Cheetah now allows to send the whole 64-bytes of data in the interrupt
  * packet, but we have no use for that.  However we do take advantage of
  * the new pipelining feature (ie. dispatch to multiple cpus simultaneously).
  */
-static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
+static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 {
-	u64 pstate, ver, busy_mask;
 	int nack_busy_id, is_jbus, need_more;
+	u64 *mondo, pstate, ver, busy_mask;
+	u16 *cpu_list;
 
-	if (cpus_empty(mask))
-		return;
+	cpu_list = __va(tb->cpu_list_pa);
+	mondo = __va(tb->cpu_mondo_block_pa);
 
 	/* Unfortunately, someone at Sun had the brilliant idea to make the
 	 * busy/nack fields hard-coded by ITID number for this Ultra-III
@@ -502,7 +510,7 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mas
 			     "stxa	%2, [%5] %6\n\t"
 			     "membar	#Sync\n\t"
 			     : /* no outputs */
-			     : "r" (data0), "r" (data1), "r" (data2),
+			     : "r" (mondo[0]), "r" (mondo[1]), "r" (mondo[2]),
 			       "r" (0x40), "r" (0x50), "r" (0x60),
 			       "i" (ASI_INTR_W));
 
@@ -511,11 +519,16 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mas
 	{
 		int i;
 
-		for_each_cpu_mask(i, mask) {
-			u64 target = (i << 14) | 0x70;
+		for (i = 0; i < cnt; i++) {
+			u64 target, nr;
+
+			nr = cpu_list[i];
+			if (nr == 0xffff)
+				continue;
 
+			target = (nr << 14) | 0x70;
 			if (is_jbus) {
-				busy_mask |= (0x1UL << (i * 2));
+				busy_mask |= (0x1UL << (nr * 2));
 			} else {
 				target |= (nack_busy_id << 24);
 				busy_mask |= (0x1UL <<
@@ -549,11 +562,13 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mas
 				__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
 						     : : "r" (pstate));
 				if (unlikely(need_more)) {
-					int i, cnt = 0;
-					for_each_cpu_mask(i, mask) {
-						cpu_clear(i, mask);
-						cnt++;
-						if (cnt == 32)
+					int i, this_cnt = 0;
+					for (i = 0; i < cnt; i++) {
+						if (cpu_list[i] == 0xffff)
+							continue;
+						cpu_list[i] = 0xffff;
+						this_cnt++;
+						if (this_cnt == 32)
 							break;
 					}
 					goto retry;
@@ -584,16 +599,20 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mas
 			/* Clear out the mask bits for cpus which did not
 			 * NACK us.
 			 */
-			for_each_cpu_mask(i, mask) {
-				u64 check_mask;
+			for (i = 0; i < cnt; i++) {
+				u64 check_mask, nr;
+
+				nr = cpu_list[i];
+				if (nr == 0xffff)
+					continue;
 
 				if (is_jbus)
-					check_mask = (0x2UL << (2*i));
+					check_mask = (0x2UL << (2*nr));
 				else
 					check_mask = (0x2UL <<
 						      this_busy_nack);
 				if ((dispatch_stat & check_mask) == 0)
-					cpu_clear(i, mask);
+					cpu_list[i] = 0xffff;
 				this_busy_nack += 2;
 				if (this_busy_nack == 64)
 					break;
@@ -605,47 +624,17 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mas
 }
 
 /* Multi-cpu list version.  */
-static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
+static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 {
-	struct trap_per_cpu *tb;
+	int retries, this_cpu, prev_sent, i, saw_cpu_error;
+	unsigned long status;
 	u16 *cpu_list;
-	u64 *mondo;
-	cpumask_t error_mask;
-	unsigned long flags, status;
-	int cnt, retries, this_cpu, prev_sent, i;
-
-	if (cpus_empty(mask))
-		return;
-
-	/* We have to do this whole thing with interrupts fully disabled.
-	 * Otherwise if we send an xcall from interrupt context it will
-	 * corrupt both our mondo block and cpu list state.
-	 *
-	 * One consequence of this is that we cannot use timeout mechanisms
-	 * that depend upon interrupts being delivered locally.  So, for
-	 * example, we cannot sample jiffies and expect it to advance.
-	 *
-	 * Fortunately, udelay() uses %stick/%tick so we can use that.
-	 */
-	local_irq_save(flags);
 
 	this_cpu = smp_processor_id();
-	tb = &trap_block[this_cpu];
-
-	mondo = __va(tb->cpu_mondo_block_pa);
-	mondo[0] = data0;
-	mondo[1] = data1;
-	mondo[2] = data2;
-	wmb();
 
 	cpu_list = __va(tb->cpu_list_pa);
 
-	/* Setup the initial cpu list.  */
-	cnt = 0;
-	for_each_cpu_mask(i, mask)
-		cpu_list[cnt++] = i;
-
-	cpus_clear(error_mask);
+	saw_cpu_error = 0;
 	retries = 0;
 	prev_sent = 0;
 	do {
@@ -690,10 +679,9 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 					continue;
 
 				err = sun4v_cpu_state(cpu);
-				if (err >= 0 &&
-				    err == HV_CPU_STATE_ERROR) {
+				if (err == HV_CPU_STATE_ERROR) {
+					saw_cpu_error = (cpu + 1);
 					cpu_list[i] = 0xffff;
-					cpu_set(cpu, error_mask);
 				}
 			}
 		} else if (unlikely(status != HV_EWOULDBLOCK))
@@ -717,32 +705,24 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 		}
 	} while (1);
 
-	local_irq_restore(flags);
-
-	if (unlikely(!cpus_empty(error_mask)))
+	if (unlikely(saw_cpu_error))
 		goto fatal_mondo_cpu_error;
 
 	return;
 
 fatal_mondo_cpu_error:
 	printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
-	       "were in error state\n",
-	       this_cpu);
-	printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu);
-	for_each_cpu_mask(i, error_mask)
-		printk("%d ", i);
-	printk("]\n");
+	       "(including %d) were in error state\n",
+	       this_cpu, saw_cpu_error - 1);
 	return;
 
 fatal_mondo_timeout:
-	local_irq_restore(flags);
 	printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
 	       " progress after %d retries.\n",
 	       this_cpu, retries);
 	goto dump_cpu_list_and_out;
 
 fatal_mondo_error:
-	local_irq_restore(flags);
 	printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
 	       this_cpu, status);
 	printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
@@ -756,58 +736,93 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 	printk("]\n");
 }
 
-/* Send cross call to all processors mentioned in MASK
- * except self.
+static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
+
+static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask)
+{
+	struct trap_per_cpu *tb;
+	int this_cpu, i, cnt;
+	unsigned long flags;
+	u16 *cpu_list;
+	u64 *mondo;
+
+	/* We have to do this whole thing with interrupts fully disabled.
+	 * Otherwise if we send an xcall from interrupt context it will
+	 * corrupt both our mondo block and cpu list state.
+	 *
+	 * One consequence of this is that we cannot use timeout mechanisms
+	 * that depend upon interrupts being delivered locally.  So, for
+	 * example, we cannot sample jiffies and expect it to advance.
+	 *
+	 * Fortunately, udelay() uses %stick/%tick so we can use that.
+	 */
+	local_irq_save(flags);
+
+	this_cpu = smp_processor_id();
+	tb = &trap_block[this_cpu];
+
+	mondo = __va(tb->cpu_mondo_block_pa);
+	mondo[0] = data0;
+	mondo[1] = data1;
+	mondo[2] = data2;
+	wmb();
+
+	cpu_list = __va(tb->cpu_list_pa);
+
+	/* Setup the initial cpu list.  */
+	cnt = 0;
+	for_each_cpu_mask_nr(i, *mask) {
+		if (i == this_cpu || !cpu_online(i))
+			continue;
+		cpu_list[cnt++] = i;
+	}
+
+	if (cnt)
+		xcall_deliver_impl(tb, cnt);
+
+	local_irq_restore(flags);
+}
+
+/* Send cross call to all processors mentioned in MASK_P
+ * except self.  Really, there are only two cases currently,
+ * "&cpu_online_map" and "&mm->cpu_vm_mask".
  */
-static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, cpumask_t mask)
+static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, const cpumask_t *mask)
 {
 	u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff));
-	int this_cpu = get_cpu();
 
-	cpus_and(mask, mask, cpu_online_map);
-	cpu_clear(this_cpu, mask);
-
-	if (tlb_type == spitfire)
-		spitfire_xcall_deliver(data0, data1, data2, mask);
-	else if (tlb_type == cheetah || tlb_type == cheetah_plus)
-		cheetah_xcall_deliver(data0, data1, data2, mask);
-	else
-		hypervisor_xcall_deliver(data0, data1, data2, mask);
-	/* NOTE: Caller runs local copy on master. */
+	xcall_deliver(data0, data1, data2, mask);
+}
 
-	put_cpu();
+/* Send cross call to all processors except self. */
+static void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2)
+{
+	smp_cross_call_masked(func, ctx, data1, data2, &cpu_online_map);
 }
 
 extern unsigned long xcall_sync_tick;
 
 static void smp_start_sync_tick_client(int cpu)
 {
-	cpumask_t mask = cpumask_of_cpu(cpu);
-
-	smp_cross_call_masked(&xcall_sync_tick,
-			      0, 0, 0, mask);
+	xcall_deliver((u64) &xcall_sync_tick, 0, 0,
+		      &cpumask_of_cpu(cpu));
 }
 
 extern unsigned long xcall_call_function;
 
 void arch_send_call_function_ipi(cpumask_t mask)
 {
-	smp_cross_call_masked(&xcall_call_function, 0, 0, 0, mask);
+	xcall_deliver((u64) &xcall_call_function, 0, 0, &mask);
 }
 
 extern unsigned long xcall_call_function_single;
 
 void arch_send_call_function_single_ipi(int cpu)
 {
-	cpumask_t mask = cpumask_of_cpu(cpu);
-
-	smp_cross_call_masked(&xcall_call_function_single, 0, 0, 0, mask);
+	xcall_deliver((u64) &xcall_call_function_single, 0, 0,
+		      &cpumask_of_cpu(cpu));
 }
 
-/* Send cross call to all processors except self. */
-#define smp_cross_call(func, ctx, data1, data2) \
-	smp_cross_call_masked(func, ctx, data1, data2, cpu_online_map)
-
 void smp_call_function_client(int irq, struct pt_regs *regs)
 {
 	clear_softint(1 << irq);
@@ -877,7 +892,6 @@ static inline void __local_flush_dcache_page(struct page *page)
 
 void smp_flush_dcache_page_impl(struct page *page, int cpu)
 {
-	cpumask_t mask = cpumask_of_cpu(cpu);
 	int this_cpu;
 
 	if (tlb_type == hypervisor)
@@ -893,29 +907,24 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
 		__local_flush_dcache_page(page);
 	} else if (cpu_online(cpu)) {
 		void *pg_addr = page_address(page);
-		u64 data0;
+		u64 data0 = 0;
 
 		if (tlb_type == spitfire) {
-			data0 =
-				((u64)&xcall_flush_dcache_page_spitfire);
+			data0 = ((u64)&xcall_flush_dcache_page_spitfire);
 			if (page_mapping(page) != NULL)
 				data0 |= ((u64)1 << 32);
-			spitfire_xcall_deliver(data0,
-					       __pa(pg_addr),
-					       (u64) pg_addr,
-					       mask);
 		} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 #ifdef DCACHE_ALIASING_POSSIBLE
-			data0 =
-				((u64)&xcall_flush_dcache_page_cheetah);
-			cheetah_xcall_deliver(data0,
-					      __pa(pg_addr),
-					      0, mask);
+			data0 =	((u64)&xcall_flush_dcache_page_cheetah);
 #endif
 		}
+		if (data0) {
+			xcall_deliver(data0, __pa(pg_addr),
+				      (u64) pg_addr, &cpumask_of_cpu(cpu));
 #ifdef CONFIG_DEBUG_DCFLUSH
-		atomic_inc(&dcpage_flushes_xcall);
+			atomic_inc(&dcpage_flushes_xcall);
 #endif
+		}
 	}
 
 	put_cpu();
@@ -923,66 +932,41 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
 
 void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 {
-	void *pg_addr = page_address(page);
-	cpumask_t mask = cpu_online_map;
-	u64 data0;
+	void *pg_addr;
 	int this_cpu;
+	u64 data0;
 
 	if (tlb_type == hypervisor)
 		return;
 
 	this_cpu = get_cpu();
 
-	cpu_clear(this_cpu, mask);
-
 #ifdef CONFIG_DEBUG_DCFLUSH
 	atomic_inc(&dcpage_flushes);
 #endif
-	if (cpus_empty(mask))
-		goto flush_self;
+	data0 = 0;
+	pg_addr = page_address(page);
 	if (tlb_type == spitfire) {
 		data0 = ((u64)&xcall_flush_dcache_page_spitfire);
 		if (page_mapping(page) != NULL)
 			data0 |= ((u64)1 << 32);
-		spitfire_xcall_deliver(data0,
-				       __pa(pg_addr),
-				       (u64) pg_addr,
-				       mask);
 	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 #ifdef DCACHE_ALIASING_POSSIBLE
 		data0 = ((u64)&xcall_flush_dcache_page_cheetah);
-		cheetah_xcall_deliver(data0,
-				      __pa(pg_addr),
-				      0, mask);
 #endif
 	}
+	if (data0) {
+		xcall_deliver(data0, __pa(pg_addr),
+			      (u64) pg_addr, &cpu_online_map);
 #ifdef CONFIG_DEBUG_DCFLUSH
-	atomic_inc(&dcpage_flushes_xcall);
+		atomic_inc(&dcpage_flushes_xcall);
 #endif
- flush_self:
+	}
 	__local_flush_dcache_page(page);
 
 	put_cpu();
 }
 
-static void __smp_receive_signal_mask(cpumask_t mask)
-{
-	smp_cross_call_masked(&xcall_receive_signal, 0, 0, 0, mask);
-}
-
-void smp_receive_signal(int cpu)
-{
-	cpumask_t mask = cpumask_of_cpu(cpu);
-
-	if (cpu_online(cpu))
-		__smp_receive_signal_mask(mask);
-}
-
-void smp_receive_signal_client(int irq, struct pt_regs *regs)
-{
-	clear_softint(1 << irq);
-}
-
 void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
 {
 	struct mm_struct *mm;
@@ -1083,7 +1067,7 @@ void smp_flush_tlb_mm(struct mm_struct *mm)
 
 	smp_cross_call_masked(&xcall_flush_tlb_mm,
 			      ctx, 0, 0,
-			      mm->cpu_vm_mask);
+			      &mm->cpu_vm_mask);
 
 local_flush_and_out:
 	__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
@@ -1101,7 +1085,7 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
 	else
 		smp_cross_call_masked(&xcall_flush_tlb_pending,
 				      ctx, nr, (unsigned long) vaddrs,
-				      mm->cpu_vm_mask);
+				      &mm->cpu_vm_mask);
 
 	__flush_tlb_pending(ctx, nr, vaddrs);
 
@@ -1202,6 +1186,16 @@ void __devinit smp_prepare_boot_cpu(void)
 {
 }
 
+void __init smp_setup_processor_id(void)
+{
+	if (tlb_type == spitfire)
+		xcall_deliver_impl = spitfire_xcall_deliver;
+	else if (tlb_type == cheetah || tlb_type == cheetah_plus)
+		xcall_deliver_impl = cheetah_xcall_deliver;
+	else
+		xcall_deliver_impl = hypervisor_xcall_deliver;
+}
+
 void __devinit smp_fill_in_sib_core_maps(void)
 {
 	unsigned int i;
@@ -1370,7 +1364,13 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
 void smp_send_reschedule(int cpu)
 {
-	smp_receive_signal(cpu);
+	xcall_deliver((u64) &xcall_receive_signal, 0, 0,
+		      &cpumask_of_cpu(cpu));
+}
+
+void smp_receive_signal_client(int irq, struct pt_regs *regs)
+{
+	clear_softint(1 << irq);
 }
 
 /* This is a nop because we capture all other cpus
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
index 3547937b17a2f2d43c5d9f667dcfca7e8ce0c1eb..587f8efb2e05efb0681fec3a0c0ed0d59b87357b 100644
--- a/arch/sparc64/mm/tsb.c
+++ b/arch/sparc64/mm/tsb.c
@@ -1,9 +1,10 @@
 /* arch/sparc64/mm/tsb.c
  *
- * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ * Copyright (C) 2006, 2008 David S. Miller <davem@davemloft.net>
  */
 
 #include <linux/kernel.h>
+#include <linux/preempt.h>
 #include <asm/system.h>
 #include <asm/page.h>
 #include <asm/tlbflush.h>
@@ -415,7 +416,9 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss)
 		tsb_context_switch(mm);
 
 		/* Now force other processors to do the same.  */
+		preempt_disable();
 		smp_tsb_sync(mm);
+		preempt_enable();
 
 		/* Now it is safe to free the old tsb.  */
 		kmem_cache_free(tsb_caches[old_cache_index], old_tsb);