diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 6d458b35643c7b2379e0b9d77eb12e7dac36764f..2387a9b81be7afd7351dbfacfb6deafd525aa323 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -459,30 +459,35 @@ static void spitfire_xcall_helper(u64 data0, u64 data1, u64 data2, u64 pstate, u } } -static inline void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) +static void spitfire_xcall_deliver(struct trap_per_cpu *tb, int cnt) { + u64 *mondo, data0, data1, data2; + u16 *cpu_list; u64 pstate; int i; __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate)); - for_each_cpu_mask_nr(i, *mask) - spitfire_xcall_helper(data0, data1, data2, pstate, i); + cpu_list = __va(tb->cpu_list_pa); + mondo = __va(tb->cpu_mondo_block_pa); + data0 = mondo[0]; + data1 = mondo[1]; + data2 = mondo[2]; + for (i = 0; i < cnt; i++) + spitfire_xcall_helper(data0, data1, data2, pstate, cpu_list[i]); } /* Cheetah now allows to send the whole 64-bytes of data in the interrupt * packet, but we have no use for that. However we do take advantage of * the new pipelining feature (ie. dispatch to multiple cpus simultaneously). 
*/ -static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask_p) +static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt) { - u64 pstate, ver, busy_mask; int nack_busy_id, is_jbus, need_more; - cpumask_t mask; - - if (cpus_empty(*mask_p)) - return; + u64 *mondo, pstate, ver, busy_mask; + u16 *cpu_list; - mask = *mask_p; + cpu_list = __va(tb->cpu_list_pa); + mondo = __va(tb->cpu_mondo_block_pa); /* Unfortunately, someone at Sun had the brilliant idea to make the * busy/nack fields hard-coded by ITID number for this Ultra-III @@ -505,7 +510,7 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask "stxa %2, [%5] %6\n\t" "membar #Sync\n\t" : /* no outputs */ - : "r" (data0), "r" (data1), "r" (data2), + : "r" (mondo[0]), "r" (mondo[1]), "r" (mondo[2]), "r" (0x40), "r" (0x50), "r" (0x60), "i" (ASI_INTR_W)); @@ -514,11 +519,16 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask { int i; - for_each_cpu_mask_nr(i, mask) { - u64 target = (i << 14) | 0x70; + for (i = 0; i < cnt; i++) { + u64 target, nr; + + nr = cpu_list[i]; + if (nr == 0xffff) + continue; + target = (nr << 14) | 0x70; if (is_jbus) { - busy_mask |= (0x1UL << (i * 2)); + busy_mask |= (0x1UL << (nr * 2)); } else { target |= (nack_busy_id << 24); busy_mask |= (0x1UL << @@ -552,11 +562,13 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask __asm__ __volatile__("wrpr %0, 0x0, %%pstate" : : "r" (pstate)); if (unlikely(need_more)) { - int i, cnt = 0; - for_each_cpu_mask_nr(i, mask) { - cpu_clear(i, mask); - cnt++; - if (cnt == 32) + int i, this_cnt = 0; + for (i = 0; i < cnt; i++) { + if (cpu_list[i] == 0xffff) + continue; + cpu_list[i] = 0xffff; + this_cnt++; + if (this_cnt == 32) break; } goto retry; @@ -587,16 +599,20 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask /* Clear out the mask bits for cpus which did not * NACK us. 
*/ - for_each_cpu_mask_nr(i, mask) { - u64 check_mask; + for (i = 0; i < cnt; i++) { + u64 check_mask, nr; + + nr = cpu_list[i]; + if (nr == 0xffff) + continue; if (is_jbus) - check_mask = (0x2UL << (2*i)); + check_mask = (0x2UL << (2*nr)); else check_mask = (0x2UL << this_busy_nack); if ((dispatch_stat & check_mask) == 0) - cpu_clear(i, mask); + cpu_list[i] = 0xffff; this_busy_nack += 2; if (this_busy_nack == 64) break; @@ -608,34 +624,17 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask } /* Multi-cpu list version. */ -static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) +static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) { - int cnt, retries, this_cpu, prev_sent, i; + int retries, this_cpu, prev_sent, i; unsigned long status; cpumask_t error_mask; - struct trap_per_cpu *tb; u16 *cpu_list; - u64 *mondo; - - if (cpus_empty(*mask)) - return; this_cpu = smp_processor_id(); - tb = &trap_block[this_cpu]; - - mondo = __va(tb->cpu_mondo_block_pa); - mondo[0] = data0; - mondo[1] = data1; - mondo[2] = data2; - wmb(); cpu_list = __va(tb->cpu_list_pa); - /* Setup the initial cpu list. */ - cnt = 0; - for_each_cpu_mask_nr(i, *mask) - cpu_list[cnt++] = i; - cpus_clear(error_mask); retries = 0; prev_sent = 0; @@ -743,11 +742,15 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpum printk("]\n"); } -static void (*xcall_deliver_impl)(u64, u64, u64, const cpumask_t *); +static void (*xcall_deliver_impl)(struct trap_per_cpu *, int); static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) { + struct trap_per_cpu *tb; + int this_cpu, i, cnt; unsigned long flags; + u16 *cpu_list; + u64 *mondo; /* We have to do this whole thing with interrupts fully disabled. 
* Otherwise if we send an xcall from interrupt context it will @@ -760,7 +763,29 @@ static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask * Fortunately, udelay() uses %stick/%tick so we can use that. */ local_irq_save(flags); - xcall_deliver_impl(data0, data1, data2, mask); + + this_cpu = smp_processor_id(); + tb = &trap_block[this_cpu]; + + mondo = __va(tb->cpu_mondo_block_pa); + mondo[0] = data0; + mondo[1] = data1; + mondo[2] = data2; + wmb(); + + cpu_list = __va(tb->cpu_list_pa); + + /* Set up the initial cpu list. */ + cnt = 0; + for_each_cpu_mask_nr(i, *mask) { + if (i == this_cpu || !cpu_online(i)) + continue; + cpu_list[cnt++] = i; + } + + if (cnt) + xcall_deliver_impl(tb, cnt); + local_irq_restore(flags); }