Merge branch 'release' of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6

6d23c8bc · Linus Torvalds · 985834a1 · 780d09e8 · 6d23c8bc · 6d23c8bc
19 changed files
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -987,7 +987,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 				break;
 		}

-		if ((res = kcalloc(1, sizeof(struct resource), GFP_KERNEL)) == NULL) {
+		if ((res = kzalloc(sizeof(struct resource), GFP_KERNEL)) == NULL) {
 			printk(KERN_ERR "failed to alocate resource for iomem\n");
 			return;
 		}

--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -347,7 +347,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 		((struct fnptr *)kretprobe_trampoline)->ip;

 	spin_lock_irqsave(&kretprobe_lock, flags);
-        head = kretprobe_inst_table_head(current);
+	head = kretprobe_inst_table_head(current);

 	/*
 	 * It is possible to have multiple instances associated with a given
@@ -363,9 +363,9 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	 *       kretprobe_trampoline
 	 */
 	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
-                if (ri->task != current)
+		if (ri->task != current)
 			/* another task is sharing our hash bucket */
-                        continue;
+			continue;

 		if (ri->rp && ri->rp->handler)
 			ri->rp->handler(ri, regs);
@@ -394,7 +394,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	 * kprobe_handler() that we don't want the post_handler
 	 * to run (and have re-enabled preemption)
 	 */
-        return 1;
+	return 1;
 }

 /* Called with kretprobe_lock held */
@@ -739,12 +739,16 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,

 	switch(val) {
 	case DIE_BREAK:
-		if (pre_kprobes_handler(args))
-			ret = NOTIFY_STOP;
+		/* err is break number from ia64_bad_break() */
+		if (args->err == 0x80200 || args->err == 0x80300)
+			if (pre_kprobes_handler(args))
+				ret = NOTIFY_STOP;
 		break;
-	case DIE_SS:
-		if (post_kprobes_handler(args->regs))
-			ret = NOTIFY_STOP;
+	case DIE_FAULT:
+		/* err is vector number from ia64_fault() */
+		if (args->err == 36)
+			if (post_kprobes_handler(args->regs))
+				ret = NOTIFY_STOP;
 		break;
 	case DIE_PAGE_FAULT:
 		/* kprobe_running() needs smp_processor_id() */

--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -51,6 +51,9 @@
 *
 * 2005-08-12 Keith Owens <kaos@sgi.com>
 *	      Convert MCA/INIT handlers to use per event stacks and SAL/OS state.
+ *
+ * 2005-10-07 Keith Owens <kaos@sgi.com>
+ *	      Add notify_die() hooks.
 */
 #include <linux/config.h>
 #include <linux/types.h>
@@ -58,7 +61,6 @@
 #include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
-#include <linux/kallsyms.h>
 #include <linux/smp_lock.h>
 #include <linux/bootmem.h>
 #include <linux/acpi.h>
@@ -69,6 +71,7 @@
 #include <linux/workqueue.h>

 #include <asm/delay.h>
+#include <asm/kdebug.h>
 #include <asm/machvec.h>
 #include <asm/meminit.h>
 #include <asm/page.h>
@@ -132,6 +135,14 @@ extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);

 static int mca_init;

+
+static void inline
+ia64_mca_spin(const char *func)
+{
+	printk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func);
+	while (1)
+		cpu_relax();
+}
 /*
 * IA64_MCA log support
 */
@@ -526,13 +537,16 @@ ia64_mca_wakeup_all(void)
 *  Outputs :   None
 */
 static irqreturn_t
-ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs)
+ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *regs)
 {
 	unsigned long flags;
 	int cpu = smp_processor_id();

 	/* Mask all interrupts */
 	local_irq_save(flags);
+	if (notify_die(DIE_MCA_RENDZVOUS_ENTER, "MCA", regs, 0, 0, 0)
+			== NOTIFY_STOP)
+		ia64_mca_spin(__FUNCTION__);

 	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE;
 	/* Register with the SAL monarch that the slave has
@@ -540,10 +554,18 @@ ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs)
 	 */
 	ia64_sal_mc_rendez();

+	if (notify_die(DIE_MCA_RENDZVOUS_PROCESS, "MCA", regs, 0, 0, 0)
+			== NOTIFY_STOP)
+		ia64_mca_spin(__FUNCTION__);
+
 	/* Wait for the monarch cpu to exit. */
 	while (monarch_cpu != -1)
 	       cpu_relax();	/* spin until monarch leaves */

+	if (notify_die(DIE_MCA_RENDZVOUS_LEAVE, "MCA", regs, 0, 0, 0)
+			== NOTIFY_STOP)
+		ia64_mca_spin(__FUNCTION__);
+
 	/* Enable all interrupts */
 	local_irq_restore(flags);
 	return IRQ_HANDLED;
@@ -933,6 +955,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
 	oops_in_progress = 1;	/* FIXME: make printk NMI/MCA/INIT safe */
 	previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
 	monarch_cpu = cpu;
+	if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0)
+			== NOTIFY_STOP)
+		ia64_mca_spin(__FUNCTION__);
 	ia64_wait_for_slaves(cpu);

 	/* Wakeup all the processors which are spinning in the rendezvous loop.
@@ -942,6 +967,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
 	 * spinning in SAL does not work.
 	 */
 	ia64_mca_wakeup_all();
+	if (notify_die(DIE_MCA_MONARCH_PROCESS, "MCA", regs, 0, 0, 0)
+			== NOTIFY_STOP)
+		ia64_mca_spin(__FUNCTION__);

 	/* Get the MCA error record and log it */
 	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
@@ -960,6 +988,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
 		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
 		sos->os_status = IA64_MCA_CORRECTED;
 	}
+	if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, 0, 0, recover)
+			== NOTIFY_STOP)
+		ia64_mca_spin(__FUNCTION__);

 	set_curr_task(cpu, previous_current);
 	monarch_cpu = -1;
@@ -1188,6 +1219,37 @@ ia64_mca_cpe_poll (unsigned long dummy)

 #endif /* CONFIG_ACPI */

+static int
+default_monarch_init_process(struct notifier_block *self, unsigned long val, void *data)
+{
+	int c;
+	struct task_struct *g, *t;
+	if (val != DIE_INIT_MONARCH_PROCESS)
+		return NOTIFY_DONE;
+	printk(KERN_ERR "Processes interrupted by INIT -");
+	for_each_online_cpu(c) {
+		struct ia64_sal_os_state *s;
+		t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET);
+		s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET);
+		g = s->prev_task;
+		if (g) {
+			if (g->pid)
+				printk(" %d", g->pid);
+			else
+				printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g);
+		}
+	}
+	printk("\n\n");
+	if (read_trylock(&tasklist_lock)) {
+		do_each_thread (g, t) {
+			printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
+			show_stack(t, NULL);
+		} while_each_thread (g, t);
+		read_unlock(&tasklist_lock);
+	}
+	return NOTIFY_DONE;
+}
+
 /*
 * C portion of the OS INIT handler
 *
@@ -1212,8 +1274,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 	static atomic_t slaves;
 	static atomic_t monarchs;
 	task_t *previous_current;
-	int cpu = smp_processor_id(), c;
-	struct task_struct *g, *t;
+	int cpu = smp_processor_id();

 	oops_in_progress = 1;	/* FIXME: make printk NMI/MCA/INIT safe */
 	console_loglevel = 15;	/* make sure printks make it to console */
@@ -1253,8 +1314,17 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_INIT;
 		while (monarch_cpu == -1)
 		       cpu_relax();	/* spin until monarch enters */
+		if (notify_die(DIE_INIT_SLAVE_ENTER, "INIT", regs, 0, 0, 0)
+				== NOTIFY_STOP)
+			ia64_mca_spin(__FUNCTION__);
+		if (notify_die(DIE_INIT_SLAVE_PROCESS, "INIT", regs, 0, 0, 0)
+				== NOTIFY_STOP)
+			ia64_mca_spin(__FUNCTION__);
 		while (monarch_cpu != -1)
 		       cpu_relax();	/* spin until monarch leaves */
+		if (notify_die(DIE_INIT_SLAVE_LEAVE, "INIT", regs, 0, 0, 0)
+				== NOTIFY_STOP)
+			ia64_mca_spin(__FUNCTION__);
 		printk("Slave on cpu %d returning to normal service.\n", cpu);
 		set_curr_task(cpu, previous_current);
 		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
@@ -1263,6 +1333,9 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 	}

 	monarch_cpu = cpu;
+	if (notify_die(DIE_INIT_MONARCH_ENTER, "INIT", regs, 0, 0, 0)
+			== NOTIFY_STOP)
+		ia64_mca_spin(__FUNCTION__);

 	/*
 	 * Wait for a bit.  On some machines (e.g., HP's zx2000 and zx6000, INIT can be
@@ -1273,27 +1346,16 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 	printk("Delaying for 5 seconds...\n");
 	udelay(5*1000000);
 	ia64_wait_for_slaves(cpu);
-	printk(KERN_ERR "Processes interrupted by INIT -");
-	for_each_online_cpu(c) {
-		struct ia64_sal_os_state *s;
-		t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET);
-		s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET);
-		g = s->prev_task;
-		if (g) {
-			if (g->pid)
-				printk(" %d", g->pid);
-			else
-				printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g);
-		}
-	}
-	printk("\n\n");
-	if (read_trylock(&tasklist_lock)) {
-		do_each_thread (g, t) {
-			printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
-			show_stack(t, NULL);
-		} while_each_thread (g, t);
-		read_unlock(&tasklist_lock);
-	}
+	/* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through
+	 * to default_monarch_init_process() above and just print all the
+	 * tasks.
+	 */
+	if (notify_die(DIE_INIT_MONARCH_PROCESS, "INIT", regs, 0, 0, 0)
+			== NOTIFY_STOP)
+		ia64_mca_spin(__FUNCTION__);
+	if (notify_die(DIE_INIT_MONARCH_LEAVE, "INIT", regs, 0, 0, 0)
+			== NOTIFY_STOP)
+		ia64_mca_spin(__FUNCTION__);
 	printk("\nINIT dump complete.  Monarch on cpu %d returning to normal service.\n", cpu);
 	atomic_dec(&monarchs);
 	set_curr_task(cpu, previous_current);
@@ -1462,6 +1524,10 @@ ia64_mca_init(void)
 	s64 rc;
 	struct ia64_sal_retval isrv;
 	u64 timeout = IA64_MCA_RENDEZ_TIMEOUT;	/* platform specific */
+	static struct notifier_block default_init_monarch_nb = {
+		.notifier_call = default_monarch_init_process,
+		.priority = 0/* we need to be notified last */
+	};

 	IA64_MCA_DEBUG("%s: begin\n", __FUNCTION__);

@@ -1555,6 +1621,10 @@ ia64_mca_init(void)
 		       "(status %ld)\n", rc);
 		return;
 	}
+	if (register_die_notifier(&default_init_monarch_nb)) {
+		printk(KERN_ERR "Failed to register default monarch INIT process\n");
+		return;
+	}

 	IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __FUNCTION__);


--- a/arch/ia64/kernel/mca_drv.c
+++ b/arch/ia64/kernel/mca_drv.c
@@ -547,9 +547,20 @@ recover_from_processor_error(int platform, slidx_table_t *slidx,
 		(pal_processor_state_info_t*)peidx_psp(peidx);

 	/*
-	 * We cannot recover errors with other than bus_check.
+	 * Processor recovery status must key off of the PAL recovery
+	 * status in the Processor State Parameter.
 	 */
-	if (psp->cc || psp->rc || psp->uc)
+
+	/*
+	 * The machine check is corrected.
+	 */
+	if (psp->cm == 1)
+		return 1;
+
+	/*
+	 * The error was not contained.  Software must be reset.
+	 */
+	if (psp->us || psp->ci == 0)
 		return 0;

 	/*
@@ -570,8 +581,6 @@ recover_from_processor_error(int platform, slidx_table_t *slidx,
 		return 0;
 	if (pbci->eb && pbci->bsi > 0)
 		return 0;
-	if (psp->ci == 0)
-		return 0;

 	/*
 	 * This is a local MCA and estimated as recoverble external bus error.

--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -4,6 +4,9 @@
 * Copyright (C) 1998-2003 Hewlett-Packard Co
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 * 04/11/17 Ashok Raj	<ashok.raj@intel.com> Added CPU Hotplug Support
+ *
+ * 2005-10-07 Keith Owens <kaos@sgi.com>
+ *	      Add notify_die() hooks.
 */
 #define __KERNEL_SYSCALLS__	/* see <asm/unistd.h> */
 #include <linux/config.h>
@@ -34,6 +37,7 @@
 #include <asm/elf.h>
 #include <asm/ia32.h>
 #include <asm/irq.h>
+#include <asm/kdebug.h>
 #include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/sal.h>
@@ -808,12 +812,14 @@ cpu_halt (void)
 void
 machine_restart (char *restart_cmd)
 {
+	(void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0);
 	(*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL);
 }

 void
 machine_halt (void)
 {
+	(void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0);
 	cpu_halt();
 }


--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -461,6 +461,7 @@ setup_arch (char **cmdline_p)
 #endif

 	cpu_init();	/* initialize the bootstrap CPU */
+	mmu_context_init();	/* initialize context_id bitmap */

 #ifdef CONFIG_ACPI
 	acpi_boot_init();

--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -387,15 +387,14 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set,
 	     struct sigscratch *scr)
 {
 	extern char __kernel_sigtramp[];
-	unsigned long tramp_addr, new_rbs = 0;
+	unsigned long tramp_addr, new_rbs = 0, new_sp;
 	struct sigframe __user *frame;
 	long err;

-	frame = (void __user *) scr->pt.r12;
+	new_sp = scr->pt.r12;
 	tramp_addr = (unsigned long) __kernel_sigtramp;
-	if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) {
-		frame = (void __user *) ((current->sas_ss_sp + current->sas_ss_size)
-					 & ~(STACK_ALIGN - 1));
+	if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags(new_sp) == 0) {
+		new_sp = current->sas_ss_sp + current->sas_ss_size;
 		/*
 		 * We need to check for the register stack being on the signal stack
 		 * separately, because it's switched separately (memory stack is switched
@@ -404,7 +403,7 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set,
 		if (!rbs_on_sig_stack(scr->pt.ar_bspstore))
 			new_rbs = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1);
 	}
-	frame = (void __user *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1));
+	frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN);

 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		return force_sigsegv_info(sig, frame);

--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -30,17 +30,20 @@ fpswa_interface_t *fpswa_interface;
 EXPORT_SYMBOL(fpswa_interface);

 struct notifier_block *ia64die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);

-int register_die_notifier(struct notifier_block *nb)
+int
+register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&ia64die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+	return notifier_chain_register(&ia64die_chain, nb);
 }
+EXPORT_SYMBOL_GPL(register_die_notifier);
+
+int
+unregister_die_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_unregister(&ia64die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_die_notifier);

 void __init
 trap_init (void)
@@ -105,6 +108,7 @@ die (const char *str, struct pt_regs *regs, long err)
 	if (++die.lock_owner_depth < 3) {
 		printk("%s[%d]: %s %ld [%d]\n",
 			current->comm, current->pid, str, err, ++die_counter);
+		(void) notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
 		show_regs(regs);
  	} else
 		printk(KERN_ERR "Recursive die() failure, output suppressed\n");
@@ -155,9 +159,8 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 	switch (break_num) {
 	      case 0: /* unknown error (used by GCC for __builtin_abort()) */
 		if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP)
-			       	== NOTIFY_STOP) {
+			       	== NOTIFY_STOP)
 			return;
-		}
 		die_if_kernel("bugcheck!", regs, break_num);
 		sig = SIGILL; code = ILL_ILLOPC;
 		break;
@@ -210,15 +213,6 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 		sig = SIGILL; code = __ILL_BNDMOD;
 		break;

-	      case 0x80200:
-	      case 0x80300:
-		if (notify_die(DIE_BREAK, "kprobe", regs, break_num, TRAP_BRKPT, SIGTRAP)
-			       	== NOTIFY_STOP) {
-			return;
-		}
-		sig = SIGTRAP; code = TRAP_BRKPT;
-		break;
-
 	      default:
 		if (break_num < 0x40000 || break_num > 0x100000)
 			die_if_kernel("Bad break", regs, break_num);
@@ -226,6 +220,9 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 		if (break_num < 0x80000) {
 			sig = SIGILL; code = __ILL_BREAK;
 		} else {
+			if (notify_die(DIE_BREAK, "bad break", regs, break_num, TRAP_BRKPT, SIGTRAP)
+					== NOTIFY_STOP)
+				return;
 			sig = SIGTRAP; code = TRAP_BRKPT;
 		}
 	}
@@ -578,12 +575,11 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 #endif
 			break;
 		      case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break;
-		      case 36:
-			      if (notify_die(DIE_SS, "ss", &regs, vector,
-					     vector, SIGTRAP) == NOTIFY_STOP)
-				      return;
-			      siginfo.si_code = TRAP_TRACE; ifa = 0; break;
+		      case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break;
 		}
+		if (notify_die(DIE_FAULT, "ia64_fault", &regs, vector, siginfo.si_code, SIGTRAP)
+			       	== NOTIFY_STOP)
+			return;
 		siginfo.si_signo = SIGTRAP;
 		siginfo.si_errno = 0;
 		siginfo.si_addr  = (void __user *) ifa;

--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -350,14 +350,12 @@ static void __init initialize_pernode_data(void)
 *	for best.
 * @nid: node id
 * @pernodesize: size of this node's pernode data
- * @align: alignment to use for this node's pernode data
 */
-static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize,
-	unsigned long align)
+static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
 {
 	void *ptr = NULL;
 	u8 best = 0xff;
-	int bestnode = -1, node;
+	int bestnode = -1, node, anynode = 0;

 	for_each_online_node(node) {
 		if (node_isset(node, memory_less_mask))
@@ -366,13 +364,15 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize,
 			best = node_distance(nid, node);
 			bestnode = node;
 		}
+		anynode = node;
 	}

-	ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat,
-		pernodesize, align, __pa(MAX_DMA_ADDRESS));
+	if (bestnode == -1)
+		bestnode = anynode;
+
+	ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, pernodesize,
+		PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));

-	if (!ptr)
-		panic("NO memory for memory less node\n");
 	return ptr;
 }

@@ -413,8 +413,7 @@ static void __init memory_less_nodes(void)

 	for_each_node_mask(node, memory_less_mask) {
 		pernodesize = compute_pernodesize(node);
-		pernode = memory_less_node_alloc(node, pernodesize,
-			(node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024));
+		pernode = memory_less_node_alloc(node, pernodesize);
 		fill_pernode(node, __pa(pernode), pernodesize);
 	}


--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -8,6 +8,8 @@
 *		Modified RID allocation for SMP
 *          Goutham Rao <goutham.rao@intel.com>
 *              IPI based ptc implementation and A-step IPI implementation.
+ * Rohit Seth <rohit.seth@intel.com>
+ * Ken Chen <kenneth.w.chen@intel.com>
 */
 #include <linux/config.h>
 #include <linux/module.h>
@@ -16,78 +18,75 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/bootmem.h>

 #include <asm/delay.h>
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
 #include <asm/pal.h>
 #include <asm/tlbflush.h>
+#include <asm/dma.h>

 static struct {
 	unsigned long mask;	/* mask of supported purge page-sizes */
-	unsigned long max_bits;	/* log2() of largest supported purge page-size */
+	unsigned long max_bits;	/* log2 of largest supported purge page-size */
 } purge;

 struct ia64_ctx ia64_ctx = {
 	.lock =		SPIN_LOCK_UNLOCKED,
 	.next =		1,
-	.limit =	(1 << 15) - 1,		/* start out with the safe (architected) limit */
 	.max_ctx =	~0U
 };

 DEFINE_PER_CPU(u8, ia64_need_tlb_flush);

+/*
+ * Allocates the ia64_ctx.bitmap and ia64_ctx.flushmap arrays, sized from
+ * max_ctx+1.  Called after cpu_init() has set up ia64_ctx.max_ctx based on
+ * the maximum RID that is supported by the boot CPU.
+ */
+void __init
+mmu_context_init (void)
+{
+	ia64_ctx.bitmap = alloc_bootmem((ia64_ctx.max_ctx+1)>>3);
+	ia64_ctx.flushmap = alloc_bootmem((ia64_ctx.max_ctx+1)>>3);
+}
+
 /*
 * Acquire the ia64_ctx.lock before calling this function!
 */
 void
 wrap_mmu_context (struct mm_struct *mm)
 {
-	unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx;
-	struct task_struct *tsk;
-	int i;
+	int i, cpu;
+	unsigned long flush_bit;

-	if (ia64_ctx.next > max_ctx)
-		ia64_ctx.next = 300;	/* skip daemons */
-	ia64_ctx.limit = max_ctx + 1;
+	for (i=0; i <= ia64_ctx.max_ctx / BITS_PER_LONG; i++) {
+		flush_bit = xchg(&ia64_ctx.flushmap[i], 0);
+		ia64_ctx.bitmap[i] ^= flush_bit;
+	}
+ 
+	/* use offset at 300 to skip daemons */
+	ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap,
+				ia64_ctx.max_ctx, 300);
+	ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap,
+				ia64_ctx.max_ctx, ia64_ctx.next);

 	/*
-	 * Scan all the task's mm->context and set proper safe range
+	 * can't call flush_tlb_all() here because of race condition
+	 * with O(1) scheduler [EF]
 	 */
-
-	read_lock(&tasklist_lock);
-  repeat:
-	for_each_process(tsk) {
-		if (!tsk->mm)
-			continue;
-		tsk_context = tsk->mm->context;
-		if (tsk_context == ia64_ctx.next) {
-			if (++ia64_ctx.next >= ia64_ctx.limit) {
-				/* empty range: reset the range limit and start over */
-				if (ia64_ctx.next > max_ctx)
-					ia64_ctx.next = 300;
-				ia64_ctx.limit = max_ctx + 1;
-				goto repeat;
-			}
-		}
-		if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit))
-			ia64_ctx.limit = tsk_context;
-	}
-	read_unlock(&tasklist_lock);
-	/* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */
-	{
-		int cpu = get_cpu(); /* prevent preemption/migration */
-		for_each_online_cpu(i) {
-			if (i != cpu)
-				per_cpu(ia64_need_tlb_flush, i) = 1;
-		}
-		put_cpu();
-	}
+	cpu = get_cpu(); /* prevent preemption/migration */
+	for_each_online_cpu(i)
+		if (i != cpu)
+			per_cpu(ia64_need_tlb_flush, i) = 1;
+	put_cpu();
 	local_flush_tlb_all();
 }

 void
-ia64_global_tlb_purge (struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long nbits)
+ia64_global_tlb_purge (struct mm_struct *mm, unsigned long start,
+		       unsigned long end, unsigned long nbits)
 {
 	static DEFINE_SPINLOCK(ptcg_lock);

@@ -135,7 +134,8 @@ local_flush_tlb_all (void)
 }

 void
-flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end)
+flush_tlb_range (struct vm_area_struct *vma, unsigned long start,
+		 unsigned long end)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long size = end - start;
@@ -149,7 +149,8 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long
 #endif

 	nbits = ia64_fls(size + 0xfff);
-	while (unlikely (((1UL << nbits) & purge.mask) == 0) && (nbits < purge.max_bits))
+	while (unlikely (((1UL << nbits) & purge.mask) == 0) &&
+			(nbits < purge.max_bits))
 		++nbits;
 	if (nbits > purge.max_bits)
 		nbits = purge.max_bits;
@@ -191,5 +192,5 @@ ia64_tlb_init (void)
 	local_cpu_data->ptce_stride[0] = ptce_info.stride[0];
 	local_cpu_data->ptce_stride[1] = ptce_info.stride[1];

-	local_flush_tlb_all();		/* nuke left overs from bootstrapping... */
+	local_flush_tlb_all();	/* nuke left overs from bootstrapping... */
 }
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -95,7 +95,7 @@ pci_sal_write (unsigned int seg, unsigned int bus, unsigned int devfn,
 }

 static struct pci_raw_ops pci_sal_ops = {
-	.read = 	pci_sal_read,
+	.read =		pci_sal_read,
 	.write =	pci_sal_write
 };

@@ -137,35 +137,98 @@ alloc_pci_controller (int seg)
 	return controller;
 }

-static u64 __devinit
-add_io_space (struct acpi_resource_address64 *addr)
+struct pci_root_info {
+	struct pci_controller *controller;
+	char *name;
+};
+
+static unsigned int
+new_space (u64 phys_base, int sparse)
 {
-	u64 offset;
-	int sparse = 0;
+	u64 mmio_base;
 	int i;

-	if (addr->address_translation_offset == 0)
-		return IO_SPACE_BASE(0);	/* part of legacy IO space */
-
-	if (addr->attribute.io.translation_attribute == ACPI_SPARSE_TRANSLATION)
-		sparse = 1;
+	if (phys_base == 0)
+		return 0;	/* legacy I/O port space */

-	offset = (u64) ioremap(addr->address_translation_offset, 0);
+	mmio_base = (u64) ioremap(phys_base, 0);
 	for (i = 0; i < num_io_spaces; i++)
-		if (io_space[i].mmio_base == offset &&
+		if (io_space[i].mmio_base == mmio_base &&
 		    io_space[i].sparse == sparse)
-			return IO_SPACE_BASE(i);
+			return i;

 	if (num_io_spaces == MAX_IO_SPACES) {
-		printk("Too many IO port spaces\n");
+		printk(KERN_ERR "PCI: Too many IO port spaces "
+			"(MAX_IO_SPACES=%lu)\n", MAX_IO_SPACES);
 		return ~0;
 	}

 	i = num_io_spaces++;
-	io_space[i].mmio_base = offset;
+	io_space[i].mmio_base = mmio_base;
 	io_space[i].sparse = sparse;

-	return IO_SPACE_BASE(i);
+	return i;
+}
+
+static u64 __devinit
+add_io_space (struct pci_root_info *info, struct acpi_resource_address64 *addr)
+{
+	struct resource *resource;
+	char *name;
+	u64 base, min, max, base_port;
+	unsigned int sparse = 0, space_nr, len;
+
+	resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource) {
+		printk(KERN_ERR "PCI: No memory for %s I/O port space\n",
+			info->name);
+		goto out;
+	}
+
+	len = strlen(info->name) + 32;
+	name = kzalloc(len, GFP_KERNEL);
+	if (!name) {
+		printk(KERN_ERR "PCI: No memory for %s I/O port space name\n",
+			info->name);
+		goto free_resource;
+	}
+
+	min = addr->min_address_range;
+	max = min + addr->address_length - 1;
+	if (addr->attribute.io.translation_attribute == ACPI_SPARSE_TRANSLATION)
+		sparse = 1;
+
+	space_nr = new_space(addr->address_translation_offset, sparse);
+	if (space_nr == ~0)
+		goto free_name;
+
+	base = __pa(io_space[space_nr].mmio_base);
+	base_port = IO_SPACE_BASE(space_nr);
+	snprintf(name, len, "%s I/O Ports %08lx-%08lx", info->name,
+		base_port + min, base_port + max);
+
+	/*
+	 * The SDM guarantees the legacy 0-64K space is sparse, but if the
+	 * mapping is done by the processor (not the bridge), ACPI may not
+	 * mark it as sparse.
+	 */
+	if (space_nr == 0)
+		sparse = 1;
+
+	resource->name  = name;
+	resource->flags = IORESOURCE_MEM;
+	resource->start = base + (sparse ? IO_SPACE_SPARSE_ENCODING(min) : min);
+	resource->end   = base + (sparse ? IO_SPACE_SPARSE_ENCODING(max) : max);
+	insert_resource(&iomem_resource, resource);
+
+	return base_port;
+
+free_name:
+	kfree(name);
+free_resource:
+	kfree(resource);
+out:
+	return ~0;
 }

 static acpi_status __devinit resource_to_window(struct acpi_resource *resource,
@@ -205,11 +268,6 @@ count_window (struct acpi_resource *resource, void *data)
 	return AE_OK;
 }

-struct pci_root_info {
-	struct pci_controller *controller;
-	char *name;
-};
-
 static __devinit acpi_status add_window(struct acpi_resource *res, void *data)
 {
 	struct pci_root_info *info = data;
@@ -231,7 +289,7 @@ static __devinit acpi_status add_window(struct acpi_resource *res, void *data)
 	} else if (addr.resource_type == ACPI_IO_RANGE) {
 		flags = IORESOURCE_IO;
 		root = &ioport_resource;
-		offset = add_io_space(&addr);
+		offset = add_io_space(info, &addr);
 		if (offset == ~0)
 			return AE_OK;
 	} else
@@ -241,7 +299,7 @@ static __devinit acpi_status add_window(struct acpi_resource *res, void *data)
 	window->resource.name = info->name;
 	window->resource.flags = flags;
 	window->resource.start = addr.min_address_range + offset;
-	window->resource.end = addr.max_address_range + offset;
+	window->resource.end = window->resource.start + addr.address_length - 1;
 	window->resource.child = NULL;
 	window->offset = offset;

@@ -739,7 +797,7 @@ int pci_vector_resources(int last, int nr_released)
 {
 	int count = nr_released;

- 	count += (IA64_LAST_DEVICE_VECTOR - last);
+	count += (IA64_LAST_DEVICE_VECTOR - last);

 	return count;
 }
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -349,7 +349,7 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
 		return;		/*bus # does not exist */
 	prom_bussoft_ptr = __va(prom_bussoft_ptr);

- 	controller = kcalloc(1,sizeof(struct pci_controller), GFP_KERNEL);
+ 	controller = kzalloc(sizeof(struct pci_controller), GFP_KERNEL);
 	controller->segment = segment;
 	if (!controller)
 		BUG();

--- a/arch/ia64/sn/kernel/xpc.h
+++ b/arch/ia64/sn/kernel/xpc.h
@@ -163,7 +163,7 @@ struct xpc_vars {
 	u8 version;
 	u64 heartbeat;
 	u64 heartbeating_to_mask;
-	u64 kdb_status;		/* 0 = machine running */
+	u64 heartbeat_offline;	/* if 0, heartbeat should be changing */
 	int act_nasid;
 	int act_phys_cpuid;
 	u64 vars_part_pa;

--- a/arch/ia64/sn/kernel/xpc_main.c
+++ b/arch/ia64/sn/kernel/xpc_main.c
@@ -57,6 +57,7 @@
 #include <linux/reboot.h>
 #include <asm/sn/intr.h>
 #include <asm/sn/sn_sal.h>
+#include <asm/kdebug.h>
 #include <asm/uaccess.h>
 #include "xpc.h"

@@ -188,6 +189,11 @@ static struct notifier_block xpc_reboot_notifier = {
 	.notifier_call = xpc_system_reboot,
 };

+static int xpc_system_die(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xpc_die_notifier = {
+	.notifier_call = xpc_system_die,
+};
+

 /*
 * Timer function to enforce the timelimit on the partition disengage request.
@@ -997,6 +1003,9 @@ xpc_do_exit(enum xpc_retval reason)
 	/* take ourselves off of the reboot_notifier_list */
 	(void) unregister_reboot_notifier(&xpc_reboot_notifier);

+	/* take ourselves off of the die_notifier list */
+	(void) unregister_die_notifier(&xpc_die_notifier);
+
 	/* close down protections for IPI operations */
 	xpc_restrict_IPI_ops();

@@ -1010,6 +1019,63 @@ xpc_do_exit(enum xpc_retval reason)
 }


+/*
+ * Called when the system is about to be either restarted or halted.
+ */
+static void
+xpc_die_disengage(void)
+{
+	struct xpc_partition *part;
+	partid_t partid;
+	unsigned long engaged;
+	long time, print_time, disengage_request_timeout;
+
+
+	/* keep xpc_hb_checker thread from doing anything (just in case) */
+	xpc_exiting = 1;
+
+	xpc_vars->heartbeating_to_mask = 0;  /* indicate we're deactivated */
+
+	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (!XPC_SUPPORTS_DISENGAGE_REQUEST(part->
+							remote_vars_version)) {
+
+			/* just in case it was left set by an earlier XPC */
+			xpc_clear_partition_engaged(1UL << partid);
+			continue;
+		}
+
+		if (xpc_partition_engaged(1UL << partid) ||
+					part->act_state != XPC_P_INACTIVE) {
+			xpc_request_partition_disengage(part);
+			xpc_mark_partition_disengaged(part);
+			xpc_IPI_send_disengage(part);
+		}
+	}
+
+	print_time = rtc_time();
+	disengage_request_timeout = print_time +
+		(xpc_disengage_request_timelimit * sn_rtc_cycles_per_second);
+
+	/* wait for all other partitions to disengage from us */
+
+	while ((engaged = xpc_partition_engaged(-1UL)) &&
+			(time = rtc_time()) < disengage_request_timeout) {
+
+		if (time >= print_time) {
+			dev_info(xpc_part, "waiting for remote partitions to "
+				"disengage, engaged=0x%lx\n", engaged);
+			print_time = time + (XPC_DISENGAGE_PRINTMSG_INTERVAL *
+						sn_rtc_cycles_per_second);
+		}
+	}
+	dev_info(xpc_part, "finished waiting for remote partitions to "
+				"disengage, engaged=0x%lx\n", engaged);
+}
+
+
 /*
 * This function is called when the system is being rebooted.
 */
@@ -1038,6 +1104,33 @@ xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
 }


+/*
+ * Die-chain callback: handles machine restart/halt and MCA/INIT monarch events.
+ */
+static int
+xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
+{
+	switch (event) {
+	case DIE_MACHINE_RESTART:
+	case DIE_MACHINE_HALT:
+		xpc_die_disengage();
+		break;
+	case DIE_MCA_MONARCH_ENTER:
+	case DIE_INIT_MONARCH_ENTER:
+		xpc_vars->heartbeat++;
+		xpc_vars->heartbeat_offline = 1;
+		break;
+	case DIE_MCA_MONARCH_LEAVE:
+	case DIE_INIT_MONARCH_LEAVE:
+		xpc_vars->heartbeat++;
+		xpc_vars->heartbeat_offline = 0;
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+
 int __init
 xpc_init(void)
 {
@@ -1154,6 +1247,12 @@ xpc_init(void)
 		dev_warn(xpc_part, "can't register reboot notifier\n");
 	}

+	/* add ourselves to the die_notifier list (i.e., ia64die_chain) */
+	ret = register_die_notifier(&xpc_die_notifier);
+	if (ret != 0) {
+		dev_warn(xpc_part, "can't register die notifier\n");
+	}
+

 	/*
 	 * Set the beating to other partitions into motion.  This is
@@ -1179,6 +1278,9 @@ xpc_init(void)
 		/* take ourselves off of the reboot_notifier_list */
 		(void) unregister_reboot_notifier(&xpc_reboot_notifier);

+		/* take ourselves off of the die_notifier list */
+		(void) unregister_die_notifier(&xpc_die_notifier);
+
 		del_timer_sync(&xpc_hb_timer);
 		free_irq(SGI_XPC_ACTIVATE, NULL);
 		xpc_restrict_IPI_ops();

--- a/arch/ia64/sn/kernel/xpc_partition.c
+++ b/arch/ia64/sn/kernel/xpc_partition.c
@@ -436,13 +436,13 @@ xpc_check_remote_hb(void)
 		}

 		dev_dbg(xpc_part, "partid = %d, heartbeat = %ld, last_heartbeat"
-			" = %ld, kdb_status = %ld, HB_mask = 0x%lx\n", partid,
-			remote_vars->heartbeat, part->last_heartbeat,
-			remote_vars->kdb_status,
+			" = %ld, heartbeat_offline = %ld, HB_mask = 0x%lx\n",
+			partid, remote_vars->heartbeat, part->last_heartbeat,
+			remote_vars->heartbeat_offline,
 			remote_vars->heartbeating_to_mask);

 		if (((remote_vars->heartbeat == part->last_heartbeat) &&
-			(remote_vars->kdb_status == 0)) ||
+			(remote_vars->heartbeat_offline == 0)) ||
 			     !xpc_hb_allowed(sn_partition_id, remote_vars)) {

 			XPC_DEACTIVATE_PARTITION(part, xpcNoHeartbeat);

--- a/arch/ia64/sn/pci/tioce_provider.c
+++ b/arch/ia64/sn/pci/tioce_provider.c
@@ -218,7 +218,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
 	if (i > last)
 		return 0;

-	map = kcalloc(1, sizeof(struct tioce_dmamap), GFP_ATOMIC);
+	map = kzalloc(sizeof(struct tioce_dmamap), GFP_ATOMIC);
 	if (!map)
 		return 0;

@@ -555,7 +555,7 @@ tioce_kern_init(struct tioce_common *tioce_common)
 	struct tioce *tioce_mmr;
 	struct tioce_kernel *tioce_kern;

-	tioce_kern = kcalloc(1, sizeof(struct tioce_kernel), GFP_KERNEL);
+	tioce_kern = kzalloc(sizeof(struct tioce_kernel), GFP_KERNEL);
 	if (!tioce_kern) {
 		return NULL;
 	}
@@ -727,7 +727,7 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 	 * Allocate kernel bus soft and copy from prom.
 	 */

-	tioce_common = kcalloc(1, sizeof(struct tioce_common), GFP_KERNEL);
+	tioce_common = kzalloc(sizeof(struct tioce_common), GFP_KERNEL);
 	if (!tioce_common)
 		return NULL;


--- a/include/asm-ia64/kdebug.h
+++ b/include/asm-ia64/kdebug.h
@@ -22,6 +22,9 @@
 * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
 *              <anil.s.keshavamurthy@intel.com> adopted from
 *              include/asm-x86_64/kdebug.h
+ *
+ * 2005-Oct	Keith Owens <kaos@sgi.com>.  Expand notify_die to cover more
+ *		events.
 */
 #include <linux/notifier.h>

@@ -35,13 +38,36 @@ struct die_args {
 	int signr;
 };

-int register_die_notifier(struct notifier_block *nb);
+extern int register_die_notifier(struct notifier_block *);
+extern int unregister_die_notifier(struct notifier_block *);
 extern struct notifier_block *ia64die_chain;

 enum die_val {
 	DIE_BREAK = 1,
-	DIE_SS,
+	DIE_FAULT,
+	DIE_OOPS,
 	DIE_PAGE_FAULT,
+	DIE_MACHINE_HALT,
+	DIE_MACHINE_RESTART,
+	DIE_MCA_MONARCH_ENTER,
+	DIE_MCA_MONARCH_PROCESS,
+	DIE_MCA_MONARCH_LEAVE,
+	DIE_MCA_SLAVE_ENTER,
+	DIE_MCA_SLAVE_PROCESS,
+	DIE_MCA_SLAVE_LEAVE,
+	DIE_MCA_RENDZVOUS_ENTER,
+	DIE_MCA_RENDZVOUS_PROCESS,
+	DIE_MCA_RENDZVOUS_LEAVE,
+	DIE_INIT_MONARCH_ENTER,
+	DIE_INIT_MONARCH_PROCESS,
+	DIE_INIT_MONARCH_LEAVE,
+	DIE_INIT_SLAVE_ENTER,
+	DIE_INIT_SLAVE_PROCESS,
+	DIE_INIT_SLAVE_LEAVE,
+	DIE_KDEBUG_ENTER,
+	DIE_KDEBUG_LEAVE,
+	DIE_KDUMP_ENTER,
+	DIE_KDUMP_LEAVE,
 };

 static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs,

--- a/include/asm-ia64/mmu_context.h
+++ b/include/asm-ia64/mmu_context.h
@@ -7,12 +7,13 @@
 */

 /*
- * Routines to manage the allocation of task context numbers.  Task context numbers are
- * used to reduce or eliminate the need to perform TLB flushes due to context switches.
- * Context numbers are implemented using ia-64 region ids.  Since the IA-64 TLB does not
- * consider the region number when performing a TLB lookup, we need to assign a unique
- * region id to each region in a process.  We use the least significant three bits in a
- * region id for this purpose.
+ * Routines to manage the allocation of task context numbers.  Task context
+ * numbers are used to reduce or eliminate the need to perform TLB flushes
+ * due to context switches.  Context numbers are implemented using ia-64
+ * region ids.  Since the IA-64 TLB does not consider the region number when
+ * performing a TLB lookup, we need to assign a unique region id to each
+ * region in a process.  We use the least significant three bits in a region
+ * id for this purpose.
 */

 #define IA64_REGION_ID_KERNEL	0 /* the kernel's region id (tlb.c depends on this being 0) */
@@ -32,13 +33,17 @@
 struct ia64_ctx {
 	spinlock_t lock;
 	unsigned int next;	/* next context number to use */
-	unsigned int limit;	/* next >= limit => must call wrap_mmu_context() */
-	unsigned int max_ctx;	/* max. context value supported by all CPUs */
+	unsigned int limit;     /* available free range */
+	unsigned int max_ctx;   /* max. context value supported by all CPUs */
+				/* call wrap_mmu_context when next >= max_ctx */
+	unsigned long *bitmap;  /* bitmap size is max_ctx+1 */
+	unsigned long *flushmap;/* pending rid to be flushed */
 };

 extern struct ia64_ctx ia64_ctx;
 DECLARE_PER_CPU(u8, ia64_need_tlb_flush);

+extern void mmu_context_init (void);
 extern void wrap_mmu_context (struct mm_struct *mm);

 static inline void
@@ -47,10 +52,10 @@ enter_lazy_tlb (struct mm_struct *mm, struct task_struct *tsk)
 }

 /*
- * When the context counter wraps around all TLBs need to be flushed because an old
- * context number might have been reused. This is signalled by the ia64_need_tlb_flush
- * per-CPU variable, which is checked in the routine below. Called by activate_mm().
- * <efocht@ess.nec.de>
+ * When the context counter wraps around all TLBs need to be flushed because
+ * an old context number might have been reused. This is signalled by the
+ * ia64_need_tlb_flush per-CPU variable, which is checked in the routine
+ * below. Called by activate_mm(). <efocht@ess.nec.de>
 */
 static inline void
 delayed_tlb_flush (void)
@@ -60,11 +65,9 @@ delayed_tlb_flush (void)

 	if (unlikely(__ia64_per_cpu_var(ia64_need_tlb_flush))) {
 		spin_lock_irqsave(&ia64_ctx.lock, flags);
-		{
-			if (__ia64_per_cpu_var(ia64_need_tlb_flush)) {
-				local_flush_tlb_all();
-				__ia64_per_cpu_var(ia64_need_tlb_flush) = 0;
-			}
+		if (__ia64_per_cpu_var(ia64_need_tlb_flush)) {
+			local_flush_tlb_all();
+			__ia64_per_cpu_var(ia64_need_tlb_flush) = 0;
 		}
 		spin_unlock_irqrestore(&ia64_ctx.lock, flags);
 	}
@@ -76,20 +79,27 @@ get_mmu_context (struct mm_struct *mm)
 	unsigned long flags;
 	nv_mm_context_t context = mm->context;

-	if (unlikely(!context)) {
-		spin_lock_irqsave(&ia64_ctx.lock, flags);
-		{
-			/* re-check, now that we've got the lock: */
-			context = mm->context;
-			if (context == 0) {
-				cpus_clear(mm->cpu_vm_mask);
-				if (ia64_ctx.next >= ia64_ctx.limit)
-					wrap_mmu_context(mm);
-				mm->context = context = ia64_ctx.next++;
-			}
+	if (likely(context))
+		goto out;
+
+	spin_lock_irqsave(&ia64_ctx.lock, flags);
+	/* re-check, now that we've got the lock: */
+	context = mm->context;
+	if (context == 0) {
+		cpus_clear(mm->cpu_vm_mask);
+		if (ia64_ctx.next >= ia64_ctx.limit) {
+			ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap,
+					ia64_ctx.max_ctx, ia64_ctx.next);
+			ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap,
+					ia64_ctx.max_ctx, ia64_ctx.next);
+			if (ia64_ctx.next >= ia64_ctx.max_ctx)
+				wrap_mmu_context(mm);
 		}
-		spin_unlock_irqrestore(&ia64_ctx.lock, flags);
+		mm->context = context = ia64_ctx.next++;
+		__set_bit(context, ia64_ctx.bitmap);
 	}
+	spin_unlock_irqrestore(&ia64_ctx.lock, flags);
+out:
 	/*
 	 * Ensure we're not starting to use "context" before any old
 	 * uses of it are gone from our TLB.
@@ -100,8 +110,8 @@ get_mmu_context (struct mm_struct *mm)
 }

 /*
- * Initialize context number to some sane value.  MM is guaranteed to be a brand-new
- * address-space, so no TLB flushing is needed, ever.
+ * Initialize context number to some sane value.  MM is guaranteed to be a
+ * brand-new address-space, so no TLB flushing is needed, ever.
 */
 static inline int
 init_new_context (struct task_struct *p, struct mm_struct *mm)
@@ -162,7 +172,10 @@ activate_context (struct mm_struct *mm)
 		if (!cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
 			cpu_set(smp_processor_id(), mm->cpu_vm_mask);
 		reload_context(context);
-		/* in the unlikely event of a TLB-flush by another thread, redo the load: */
+		/*
+		 * in the unlikely event of a TLB-flush by another thread,
+		 * redo the load.
+		 */
 	} while (unlikely(context != mm->context));
 }

@@ -175,8 +188,8 @@ static inline void
 activate_mm (struct mm_struct *prev, struct mm_struct *next)
 {
 	/*
-	 * We may get interrupts here, but that's OK because interrupt handlers cannot
-	 * touch user-space.
+	 * We may get interrupts here, but that's OK because interrupt
+	 * handlers cannot touch user-space.
 	 */
 	ia64_set_kr(IA64_KR_PT_BASE, __pa(next->pgd));
 	activate_context(next);

--- a/include/asm-ia64/tlbflush.h
+++ b/include/asm-ia64/tlbflush.h
@@ -51,6 +51,7 @@ flush_tlb_mm (struct mm_struct *mm)
 	if (!mm)
 		return;

+	set_bit(mm->context, ia64_ctx.flushmap);
 	mm->context = 0;

 	if (atomic_read(&mm->mm_users) == 0)