tlb_uv.c 55.5 KB
Newer Older
1 2 3
/*
 *	SGI UltraViolet TLB flush routines.
 *
4
 *	(c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
5 6 7 8
 *
 *	This code is released under the GNU General Public License version 2 or
 *	later.
 */
9
#include <linux/seq_file.h>
10
#include <linux/proc_fs.h>
11
#include <linux/debugfs.h>
12
#include <linux/kernel.h>
13
#include <linux/slab.h>
14
#include <linux/delay.h>
15 16

#include <asm/mmu_context.h>
T
Tejun Heo 已提交
17
#include <asm/uv/uv.h>
18
#include <asm/uv/uv_mmrs.h>
19
#include <asm/uv/uv_hub.h>
20
#include <asm/uv/uv_bau.h>
I
Ingo Molnar 已提交
21
#include <asm/apic.h>
22
#include <asm/idle.h>
23
#include <asm/tsc.h>
24
#include <asm/irq_vectors.h>
25
#include <asm/timer.h>
26

27 28 29 30 31 32 33 34 35 36 37
/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
static int timeout_base_ns[] = {
		20,
		160,
		1280,
		10240,
		81920,
		655360,
		5242880,
		167772160
};
C
Cliff Wickman 已提交
38

39
static int timeout_us;
40
static int nobau;
41
static int nobau_perm;
42
static cycles_t congested_cycles;
43

44
/* tunables: */
C
Cliff Wickman 已提交
45 46 47 48
static int max_concurr		= MAX_BAU_CONCURRENT;
static int max_concurr_const	= MAX_BAU_CONCURRENT;
static int plugged_delay	= PLUGGED_DELAY;
static int plugsb4reset		= PLUGSB4RESET;
49
static int giveup_limit		= GIVEUP_LIMIT;
C
Cliff Wickman 已提交
50 51 52 53 54
static int timeoutsb4reset	= TIMEOUTSB4RESET;
static int ipi_reset_limit	= IPI_RESET_LIMIT;
static int complete_threshold	= COMPLETE_THRESHOLD;
static int congested_respns_us	= CONGESTED_RESPONSE_US;
static int congested_reps	= CONGESTED_REPS;
55
static int disabled_period	= DISABLED_PERIOD;
C
Cliff Wickman 已提交
56 57 58 59 60 61 62 63 64 65

static struct tunables tunables[] = {
	{&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
	{&plugged_delay, PLUGGED_DELAY},
	{&plugsb4reset, PLUGSB4RESET},
	{&timeoutsb4reset, TIMEOUTSB4RESET},
	{&ipi_reset_limit, IPI_RESET_LIMIT},
	{&complete_threshold, COMPLETE_THRESHOLD},
	{&congested_respns_us, CONGESTED_RESPONSE_US},
	{&congested_reps, CONGESTED_REPS},
66 67
	{&disabled_period, DISABLED_PERIOD},
	{&giveup_limit, GIVEUP_LIMIT}
C
Cliff Wickman 已提交
68 69
};

70 71
static struct dentry *tunables_dir;
static struct dentry *tunables_file;
72

C
Cliff Wickman 已提交
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
/* these correspond to the statistics printed by ptc_seq_show() */
static char *stat_description[] = {
	"sent:     number of shootdown messages sent",
	"stime:    time spent sending messages",
	"numuvhubs: number of hubs targeted with shootdown",
	"numuvhubs16: number times 16 or more hubs targeted",
	"numuvhubs8: number times 8 or more hubs targeted",
	"numuvhubs4: number times 4 or more hubs targeted",
	"numuvhubs2: number times 2 or more hubs targeted",
	"numuvhubs1: number times 1 hub targeted",
	"numcpus:  number of cpus targeted with shootdown",
	"dto:      number of destination timeouts",
	"retries:  destination timeout retries sent",
	"rok:   :  destination timeouts successfully retried",
	"resetp:   ipi-style resource resets for plugs",
	"resett:   ipi-style resource resets for timeouts",
	"giveup:   fall-backs to ipi-style shootdowns",
	"sto:      number of source timeouts",
	"bz:       number of stay-busy's",
	"throt:    number times spun in throttle",
	"swack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
	"recv:     shootdown messages received",
	"rtime:    time spent processing messages",
	"all:      shootdown all-tlb messages",
	"one:      shootdown one-tlb messages",
	"mult:     interrupts that found multiple messages",
	"none:     interrupts that found no messages",
	"retry:    number of retry messages processed",
	"canc:     number messages canceled by retries",
	"nocan:    number retries that found nothing to cancel",
	"reset:    number of ipi-style reset requests processed",
	"rcan:     number messages canceled by reset requests",
	"disable:  number times use of the BAU was disabled",
	"enable:   number times use of the BAU was re-enabled"
};

static int __init
setup_nobau(char *arg)
111 112 113 114 115
{
	nobau = 1;
	return 0;
}
early_param("nobau", setup_nobau);
116

117
/* base pnode in this partition */
C
Cliff Wickman 已提交
118
static int uv_base_pnode __read_mostly;
119

120 121
static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
122 123
static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);

124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
static void
set_bau_on(void)
{
	int cpu;
	struct bau_control *bcp;

	if (nobau_perm) {
		pr_info("BAU not initialized; cannot be turned on\n");
		return;
	}
	nobau = 0;
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->nobau = 0;
	}
	pr_info("BAU turned on\n");
	return;
}

static void
set_bau_off(void)
{
	int cpu;
	struct bau_control *bcp;

	nobau = 1;
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->nobau = 1;
	}
	pr_info("BAU turned off\n");
	return;
}

158
/*
159 160
 * Determine the first node on a uvhub. 'Nodes' are used for kernel
 * memory allocation.
161
 */
162
static int __init uvhub_to_first_node(int uvhub)
163 164 165 166 167
{
	int node, b;

	for_each_online_node(node) {
		b = uv_node_to_blade_id(node);
168
		if (uvhub == b)
169 170
			return node;
	}
171
	return -1;
172 173 174
}

/*
175
 * Determine the apicid of the first cpu on a uvhub.
176
 */
177
static int __init uvhub_to_first_apicid(int uvhub)
178 179 180 181
{
	int cpu;

	for_each_present_cpu(cpu)
182
		if (uvhub == uv_cpu_to_blade_id(cpu))
183 184 185 186
			return per_cpu(x86_cpu_to_apicid, cpu);
	return -1;
}

187 188 189 190 191 192 193 194
/*
 * Free a software acknowledge hardware resource by clearing its Pending
 * bit. This will return a reply to the sender.
 * If the message has timed out, a reply has already been sent by the
 * hardware but the resource has not been released. In that case our
 * clear of the Timeout bit (as well) will free the resource. No reply will
 * be sent (the hardware will only do one reply per message).
 */
C
Cliff Wickman 已提交
195 196
static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
						int do_acknowledge)
197
{
198
	unsigned long dw;
C
Cliff Wickman 已提交
199
	struct bau_pq_entry *msg;
200

201
	msg = mdp->msg;
C
Cliff Wickman 已提交
202
	if (!msg->canceled && do_acknowledge) {
C
Cliff Wickman 已提交
203 204
		dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
		write_mmr_sw_ack(dw);
205
	}
206
	msg->replied_to = 1;
C
Cliff Wickman 已提交
207
	msg->swack_vec = 0;
208 209 210
}

/*
211
 * Process the receipt of a RETRY message
212
 */
C
Cliff Wickman 已提交
213 214
static void bau_process_retry_msg(struct msg_desc *mdp,
					struct bau_control *bcp)
215
{
216 217 218 219
	int i;
	int cancel_count = 0;
	unsigned long msg_res;
	unsigned long mmr = 0;
C
Cliff Wickman 已提交
220 221 222
	struct bau_pq_entry *msg = mdp->msg;
	struct bau_pq_entry *msg2;
	struct ptc_stats *stat = bcp->statp;
223

224 225 226 227 228
	stat->d_retries++;
	/*
	 * cancel any message from msg+1 to the retry itself
	 */
	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
C
Cliff Wickman 已提交
229 230
		if (msg2 > mdp->queue_last)
			msg2 = mdp->queue_first;
231 232 233
		if (msg2 == msg)
			break;

C
Cliff Wickman 已提交
234
		/* same conditions for cancellation as do_reset */
235
		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
C
Cliff Wickman 已提交
236 237
		    (msg2->swack_vec) && ((msg2->swack_vec &
			msg->swack_vec) == 0) &&
238 239
		    (msg2->sending_cpu == msg->sending_cpu) &&
		    (msg2->msg_type != MSG_NOOP)) {
C
Cliff Wickman 已提交
240 241
			mmr = read_mmr_sw_ack();
			msg_res = msg2->swack_vec;
242 243 244 245 246 247
			/*
			 * This is a message retry; clear the resources held
			 * by the previous message only if they timed out.
			 * If it has not timed out we have an unexpected
			 * situation to report.
			 */
248
			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
C
Cliff Wickman 已提交
249
				unsigned long mr;
250
				/*
C
Cliff Wickman 已提交
251 252
				 * Is the resource timed out?
				 * Make everyone ignore the cancelled message.
253 254 255 256
				 */
				msg2->canceled = 1;
				stat->d_canceled++;
				cancel_count++;
C
Cliff Wickman 已提交
257 258
				mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
				write_mmr_sw_ack(mr);
259
			}
260 261 262 263 264
		}
	}
	if (!cancel_count)
		stat->d_nocanceled++;
}
265

266 267 268 269
/*
 * Do all the things a cpu should do for a TLB shootdown message.
 * Other cpu's may come here at the same time for this message.
 */
C
Cliff Wickman 已提交
270 271
static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
						int do_acknowledge)
272 273
{
	short socket_ack_count = 0;
C
Cliff Wickman 已提交
274 275 276 277
	short *sp;
	struct atomic_short *asp;
	struct ptc_stats *stat = bcp->statp;
	struct bau_pq_entry *msg = mdp->msg;
278
	struct bau_control *smaster = bcp->socket_master;
279

280 281 282
	/*
	 * This must be a normal message, or retry of a normal message
	 */
283 284
	if (msg->address == TLB_FLUSH_ALL) {
		local_flush_tlb();
285
		stat->d_alltlb++;
286 287
	} else {
		__flush_tlb_one(msg->address);
288
		stat->d_onetlb++;
289
	}
290 291 292 293 294 295 296 297 298
	stat->d_requestee++;

	/*
	 * One cpu on each uvhub has the additional job on a RETRY
	 * of releasing the resource held by the message that is
	 * being retried.  That message is identified by sending
	 * cpu number.
	 */
	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
C
Cliff Wickman 已提交
299
		bau_process_retry_msg(mdp, bcp);
300

301
	/*
C
Cliff Wickman 已提交
302
	 * This is a swack message, so we have to reply to it.
303 304 305 306
	 * Count each responding cpu on the socket. This avoids
	 * pinging the count's cache line back and forth between
	 * the sockets.
	 */
C
Cliff Wickman 已提交
307 308 309
	sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
	asp = (struct atomic_short *)sp;
	socket_ack_count = atom_asr(1, asp);
310
	if (socket_ack_count == bcp->cpus_in_socket) {
C
Cliff Wickman 已提交
311
		int msg_ack_count;
312 313 314 315
		/*
		 * Both sockets dump their completed count total into
		 * the message's count.
		 */
316
		*sp = 0;
C
Cliff Wickman 已提交
317 318
		asp = (struct atomic_short *)&msg->acknowledge_count;
		msg_ack_count = atom_asr(socket_ack_count, asp);
319 320 321 322

		if (msg_ack_count == bcp->cpus_in_uvhub) {
			/*
			 * All cpus in uvhub saw it; reply
C
Cliff Wickman 已提交
323
			 * (unless we are in the UV2 workaround)
324
			 */
C
Cliff Wickman 已提交
325
			reply_to_message(mdp, bcp, do_acknowledge);
326 327
		}
	}
328

329
	return;
330 331 332
}

/*
C
cpw@sgi.com 已提交
333
 * Determine the first cpu on a pnode.
334
 */
C
cpw@sgi.com 已提交
335
static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)
336 337
{
	int cpu;
C
cpw@sgi.com 已提交
338 339 340 341 342
	struct hub_and_pnode *hpp;

	for_each_present_cpu(cpu) {
		hpp = &smaster->thp[cpu];
		if (pnode == hpp->pnode)
343
			return cpu;
C
cpw@sgi.com 已提交
344
	}
345 346 347 348 349 350 351
	return -1;
}

/*
 * Last resort when we get a large number of destination timeouts is
 * to clear resources held by a given cpu.
 * Do this with IPI so that all messages in the BAU message queue
C
Cliff Wickman 已提交
352
 * can be identified by their nonzero swack_vec field.
353
 *
354 355
 * This is entered for a single cpu on the uvhub.
 * The sender want's this uvhub to free a specific message's
C
Cliff Wickman 已提交
356
 * swack resources.
357
 */
C
Cliff Wickman 已提交
358
static void do_reset(void *ptr)
359
{
360
	int i;
C
Cliff Wickman 已提交
361 362 363 364
	struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
	struct reset_args *rap = (struct reset_args *)ptr;
	struct bau_pq_entry *msg;
	struct ptc_stats *stat = bcp->statp;
365

366 367 368
	stat->d_resets++;
	/*
	 * We're looking for the given sender, and
C
Cliff Wickman 已提交
369
	 * will free its swack resource.
370 371 372
	 * If all cpu's finally responded after the timeout, its
	 * message 'replied_to' was set.
	 */
C
Cliff Wickman 已提交
373 374 375 376
	for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
		unsigned long msg_res;
		/* do_reset: same conditions for cancellation as
		   bau_process_retry_msg() */
377 378 379
		if ((msg->replied_to == 0) &&
		    (msg->canceled == 0) &&
		    (msg->sending_cpu == rap->sender) &&
C
Cliff Wickman 已提交
380
		    (msg->swack_vec) &&
381
		    (msg->msg_type != MSG_NOOP)) {
C
Cliff Wickman 已提交
382 383
			unsigned long mmr;
			unsigned long mr;
384 385 386 387 388 389 390
			/*
			 * make everyone else ignore this message
			 */
			msg->canceled = 1;
			/*
			 * only reset the resource if it is still pending
			 */
C
Cliff Wickman 已提交
391 392 393
			mmr = read_mmr_sw_ack();
			msg_res = msg->swack_vec;
			mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
394 395
			if (mmr & msg_res) {
				stat->d_rcanceled++;
C
Cliff Wickman 已提交
396
				write_mmr_sw_ack(mr);
397 398 399
			}
		}
	}
400
	return;
401 402 403
}

/*
404 405
 * Use IPI to get all target uvhubs to release resources held by
 * a given sending cpu number.
406
 */
C
cpw@sgi.com 已提交
407
static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
408
{
C
cpw@sgi.com 已提交
409 410
	int pnode;
	int apnode;
C
Cliff Wickman 已提交
411
	int maskbits;
C
cpw@sgi.com 已提交
412
	int sender = bcp->cpu;
413
	cpumask_t *mask = bcp->uvhub_master->cpumask;
C
cpw@sgi.com 已提交
414
	struct bau_control *smaster = bcp->socket_master;
415
	struct reset_args reset_args;
416

417
	reset_args.sender = sender;
418
	cpus_clear(*mask);
419
	/* find a single cpu for each uvhub in this distribution mask */
C
cpw@sgi.com 已提交
420
	maskbits = sizeof(struct pnmask) * BITSPERBYTE;
C
cpw@sgi.com 已提交
421 422
	/* each bit is a pnode relative to the partition base pnode */
	for (pnode = 0; pnode < maskbits; pnode++) {
C
Cliff Wickman 已提交
423
		int cpu;
C
cpw@sgi.com 已提交
424
		if (!bau_uvhub_isset(pnode, distribution))
425
			continue;
C
cpw@sgi.com 已提交
426 427
		apnode = pnode + bcp->partition_base_pnode;
		cpu = pnode_to_first_cpu(apnode, smaster);
428
		cpu_set(cpu, *mask);
429
	}
C
Cliff Wickman 已提交
430 431

	/* IPI all cpus; preemption is already disabled */
432
	smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);
433 434 435
	return;
}

C
Cliff Wickman 已提交
436
static inline unsigned long cycles_2_us(unsigned long long cyc)
437 438 439
{
	unsigned long long ns;
	unsigned long us;
C
Cliff Wickman 已提交
440 441 442
	int cpu = smp_processor_id();

	ns =  (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
443 444
	us = ns / 1000;
	return us;
445 446
}

447
/*
448 449 450 451
 * wait for all cpus on this hub to finish their sends and go quiet
 * leaves uvhub_quiesce set so that no new broadcasts are started by
 * bau_flush_send_and_wait()
 */
C
Cliff Wickman 已提交
452
static inline void quiesce_local_uvhub(struct bau_control *hmaster)
453
{
C
Cliff Wickman 已提交
454
	atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
455 456 457 458 459
}

/*
 * mark this quiet-requestor as done
 */
C
Cliff Wickman 已提交
460
static inline void end_uvhub_quiesce(struct bau_control *hmaster)
461
{
C
Cliff Wickman 已提交
462 463 464 465 466 467 468 469 470 471 472
	atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
}

static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
{
	unsigned long descriptor_status;

	descriptor_status = uv_read_local_mmr(mmr_offset);
	descriptor_status >>= right_shift;
	descriptor_status &= UV_ACT_STATUS_MASK;
	return descriptor_status;
473 474 475 476 477
}

/*
 * Wait for completion of a broadcast software ack message
 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
478
 */
479
static int uv1_wait_completion(struct bau_desc *bau_desc,
C
Cliff Wickman 已提交
480 481
				unsigned long mmr_offset, int right_shift,
				struct bau_control *bcp, long try)
482 483
{
	unsigned long descriptor_status;
C
Cliff Wickman 已提交
484
	cycles_t ttm;
485
	struct ptc_stats *stat = bcp->statp;
486

C
Cliff Wickman 已提交
487
	descriptor_status = uv1_read_status(mmr_offset, right_shift);
488
	/* spin on the status MMR, waiting for it to go idle */
C
Cliff Wickman 已提交
489
	while ((descriptor_status != DS_IDLE)) {
490
		/*
491 492 493 494
		 * Our software ack messages may be blocked because
		 * there are no swack resources available.  As long
		 * as none of them has timed out hardware will NACK
		 * our message and its state will stay IDLE.
495
		 */
C
Cliff Wickman 已提交
496
		if (descriptor_status == DS_SOURCE_TIMEOUT) {
497 498
			stat->s_stimeout++;
			return FLUSH_GIVEUP;
C
Cliff Wickman 已提交
499
		} else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
500
			stat->s_dtimeout++;
C
Cliff Wickman 已提交
501
			ttm = get_cycles();
502 503 504 505 506 507 508

			/*
			 * Our retries may be blocked by all destination
			 * swack resources being consumed, and a timeout
			 * pending.  In that case hardware returns the
			 * ERROR that looks like a destination timeout.
			 */
C
Cliff Wickman 已提交
509
			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
510 511 512 513 514 515 516 517 518 519 520 521
				bcp->conseccompletes = 0;
				return FLUSH_RETRY_PLUGGED;
			}

			bcp->conseccompletes = 0;
			return FLUSH_RETRY_TIMEOUT;
		} else {
			/*
			 * descriptor_status is still BUSY
			 */
			cpu_relax();
		}
C
Cliff Wickman 已提交
522
		descriptor_status = uv1_read_status(mmr_offset, right_shift);
523 524 525 526 527
	}
	bcp->conseccompletes++;
	return FLUSH_COMPLETE;
}

C
Cliff Wickman 已提交
528
/*
529 530
 * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
 * But not currently used.
C
Cliff Wickman 已提交
531
 */
C
Cliff Wickman 已提交
532
static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
533 534
{
	unsigned long descriptor_status;
C
Cliff Wickman 已提交
535

536 537
	descriptor_status =
		((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
C
Cliff Wickman 已提交
538 539 540
	return descriptor_status;
}

C
Cliff Wickman 已提交
541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570
/*
 * Return whether the status of the descriptor that is normally used for this
 * cpu (the one indexed by its hub-relative cpu number) is busy.
 * The status of the original 32 descriptors is always reflected in the 64
 * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0.
 * The bit provided by the activation_status_2 register is irrelevant to
 * the status if it is only being tested for busy or not busy.
 */
int normal_busy(struct bau_control *bcp)
{
	int cpu = bcp->uvhub_cpu;
	int mmr_offset;
	int right_shift;

	mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
	right_shift = cpu * UV_ACT_STATUS_SIZE;
	return (((((read_lmmr(mmr_offset) >> right_shift) &
				UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY);
}

/*
 * Entered when a bau descriptor has gone into a permanent busy wait because
 * of a hardware bug.
 * Workaround the bug.
 */
int handle_uv2_busy(struct bau_control *bcp)
{
	struct ptc_stats *stat = bcp->statp;

	stat->s_uv2_wars++;
571 572
	bcp->busy = 1;
	return FLUSH_GIVEUP;
C
Cliff Wickman 已提交
573 574
}

C
Cliff Wickman 已提交
575 576 577 578 579 580
static int uv2_wait_completion(struct bau_desc *bau_desc,
				unsigned long mmr_offset, int right_shift,
				struct bau_control *bcp, long try)
{
	unsigned long descriptor_stat;
	cycles_t ttm;
581
	int desc = bcp->uvhub_cpu;
C
Cliff Wickman 已提交
582
	long busy_reps = 0;
583 584
	struct ptc_stats *stat = bcp->statp;

C
Cliff Wickman 已提交
585
	descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);
C
Cliff Wickman 已提交
586

587
	/* spin on the status MMR, waiting for it to go idle */
C
Cliff Wickman 已提交
588
	while (descriptor_stat != UV2H_DESC_IDLE) {
589 590 591 592 593 594 595 596
		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
			/*
			 * A h/w bug on the destination side may
			 * have prevented the message being marked
			 * pending, thus it doesn't get replied to
			 * and gets continually nacked until it times
			 * out with a SOURCE_TIMEOUT.
			 */
597 598
			stat->s_stimeout++;
			return FLUSH_GIVEUP;
C
Cliff Wickman 已提交
599
		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616
			ttm = get_cycles();

			/*
			 * Our retries may be blocked by all destination
			 * swack resources being consumed, and a timeout
			 * pending.  In that case hardware returns the
			 * ERROR that looks like a destination timeout.
			 * Without using the extended status we have to
			 * deduce from the short time that this was a
			 * strong nack.
			 */
			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
				bcp->conseccompletes = 0;
				stat->s_plugged++;
				/* FLUSH_RETRY_PLUGGED causes hang on boot */
				return FLUSH_GIVEUP;
			}
617 618
			stat->s_dtimeout++;
			bcp->conseccompletes = 0;
619 620
			/* FLUSH_RETRY_TIMEOUT causes hang on boot */
			return FLUSH_GIVEUP;
621
		} else {
C
Cliff Wickman 已提交
622 623 624 625 626 627
			busy_reps++;
			if (busy_reps > 1000000) {
				/* not to hammer on the clock */
				busy_reps = 0;
				ttm = get_cycles();
				if ((ttm - bcp->send_message) >
628
						bcp->timeout_interval)
C
Cliff Wickman 已提交
629 630
					return handle_uv2_busy(bcp);
			}
631
			/*
C
Cliff Wickman 已提交
632
			 * descriptor_stat is still BUSY
633 634
			 */
			cpu_relax();
635
		}
C
Cliff Wickman 已提交
636 637
		descriptor_stat = uv2_read_status(mmr_offset, right_shift,
									desc);
638
	}
639
	bcp->conseccompletes++;
640 641 642
	return FLUSH_COMPLETE;
}

C
Cliff Wickman 已提交
643 644 645 646 647 648 649
/*
 * There are 2 status registers; each and array[32] of 2 bits. Set up for
 * which register to read and position in that register based on cpu in
 * current hub.
 */
static int wait_completion(struct bau_desc *bau_desc,
				struct bau_control *bcp, long try)
650
{
C
Cliff Wickman 已提交
651 652
	int right_shift;
	unsigned long mmr_offset;
653
	int desc = bcp->uvhub_cpu;
C
Cliff Wickman 已提交
654

C
Cliff Wickman 已提交
655
	if (desc < UV_CPUS_PER_AS) {
C
Cliff Wickman 已提交
656
		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
C
Cliff Wickman 已提交
657
		right_shift = desc * UV_ACT_STATUS_SIZE;
C
Cliff Wickman 已提交
658 659
	} else {
		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
C
Cliff Wickman 已提交
660
		right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
C
Cliff Wickman 已提交
661 662
	}

663
	if (bcp->uvhub_version == 1)
664
		return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
C
Cliff Wickman 已提交
665
								bcp, try);
666 667
	else
		return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
C
Cliff Wickman 已提交
668
								bcp, try);
669 670
}

C
Cliff Wickman 已提交
671
static inline cycles_t sec_2_cycles(unsigned long sec)
672 673 674 675 676 677 678 679 680 681
{
	unsigned long ns;
	cycles_t cyc;

	ns = sec * 1000000000;
	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
	return cyc;
}

/*
C
Cliff Wickman 已提交
682
 * Our retries are blocked by all destination sw ack resources being
683 684 685
 * in use, and a timeout is pending. In that case hardware immediately
 * returns the ERROR that looks like a destination timeout.
 */
C
Cliff Wickman 已提交
686 687
static void destination_plugged(struct bau_desc *bau_desc,
			struct bau_control *bcp,
688 689 690 691
			struct bau_control *hmaster, struct ptc_stats *stat)
{
	udelay(bcp->plugged_delay);
	bcp->plugged_tries++;
C
Cliff Wickman 已提交
692

693 694
	if (bcp->plugged_tries >= bcp->plugsb4reset) {
		bcp->plugged_tries = 0;
C
Cliff Wickman 已提交
695

696
		quiesce_local_uvhub(hmaster);
C
Cliff Wickman 已提交
697

698
		spin_lock(&hmaster->queue_lock);
C
cpw@sgi.com 已提交
699
		reset_with_ipi(&bau_desc->distribution, bcp);
700
		spin_unlock(&hmaster->queue_lock);
C
Cliff Wickman 已提交
701

702
		end_uvhub_quiesce(hmaster);
C
Cliff Wickman 已提交
703

704 705 706 707 708
		bcp->ipi_attempts++;
		stat->s_resets_plug++;
	}
}

C
Cliff Wickman 已提交
709 710 711
static void destination_timeout(struct bau_desc *bau_desc,
			struct bau_control *bcp, struct bau_control *hmaster,
			struct ptc_stats *stat)
712
{
C
Cliff Wickman 已提交
713
	hmaster->max_concurr = 1;
714 715 716
	bcp->timeout_tries++;
	if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
		bcp->timeout_tries = 0;
C
Cliff Wickman 已提交
717

718
		quiesce_local_uvhub(hmaster);
C
Cliff Wickman 已提交
719

720
		spin_lock(&hmaster->queue_lock);
C
cpw@sgi.com 已提交
721
		reset_with_ipi(&bau_desc->distribution, bcp);
722
		spin_unlock(&hmaster->queue_lock);
C
Cliff Wickman 已提交
723

724
		end_uvhub_quiesce(hmaster);
C
Cliff Wickman 已提交
725

726 727 728 729 730
		bcp->ipi_attempts++;
		stat->s_resets_timeout++;
	}
}

731
/*
732 733
 * Stop all cpus on a uvhub from using the BAU for a period of time.
 * This is reversed by check_enable.
734
 */
735
static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
736
{
737 738 739 740 741 742 743 744
	int tcpu;
	struct bau_control *tbcp;
	struct bau_control *hmaster;
	cycles_t tm1;

	hmaster = bcp->uvhub_master;
	spin_lock(&hmaster->disable_lock);
	if (!bcp->baudisabled) {
745
		stat->s_bau_disabled++;
746
		tm1 = get_cycles();
747 748
		for_each_present_cpu(tcpu) {
			tbcp = &per_cpu(bau_control, tcpu);
749 750 751 752 753
			if (tbcp->uvhub_master == hmaster) {
				tbcp->baudisabled = 1;
				tbcp->set_bau_on_time =
					tm1 + bcp->disabled_period;
			}
754 755
		}
	}
756
	spin_unlock(&hmaster->disable_lock);
757 758
}

C
Cliff Wickman 已提交
759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
static void count_max_concurr(int stat, struct bau_control *bcp,
				struct bau_control *hmaster)
{
	bcp->plugged_tries = 0;
	bcp->timeout_tries = 0;
	if (stat != FLUSH_COMPLETE)
		return;
	if (bcp->conseccompletes <= bcp->complete_threshold)
		return;
	if (hmaster->max_concurr >= hmaster->max_concurr_const)
		return;
	hmaster->max_concurr++;
}

static void record_send_stats(cycles_t time1, cycles_t time2,
		struct bau_control *bcp, struct ptc_stats *stat,
		int completion_status, int try)
{
	cycles_t elapsed;

	if (time2 > time1) {
		elapsed = time2 - time1;
		stat->s_time += elapsed;

		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
			bcp->period_requests++;
			bcp->period_time += elapsed;
			if ((elapsed > congested_cycles) &&
787 788 789 790 791 792
			    (bcp->period_requests > bcp->cong_reps) &&
			    ((bcp->period_time / bcp->period_requests) >
							congested_cycles)) {
				stat->s_congested++;
				disable_for_period(bcp, stat);
			}
C
Cliff Wickman 已提交
793 794 795 796 797 798
		}
	} else
		stat->s_requestor--;

	if (completion_status == FLUSH_COMPLETE && try > 1)
		stat->s_retriesok++;
799
	else if (completion_status == FLUSH_GIVEUP) {
C
Cliff Wickman 已提交
800
		stat->s_giveup++;
801 802 803 804 805 806 807 808 809 810
		if (get_cycles() > bcp->period_end)
			bcp->period_giveups = 0;
		bcp->period_giveups++;
		if (bcp->period_giveups == 1)
			bcp->period_end = get_cycles() + bcp->disabled_period;
		if (bcp->period_giveups > bcp->giveup_limit) {
			disable_for_period(bcp, stat);
			stat->s_giveuplimit++;
		}
	}
C
Cliff Wickman 已提交
811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844
}

/*
 * Because of a uv1 hardware bug only a limited number of concurrent
 * requests can be made.
 */
static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
{
	spinlock_t *lock = &hmaster->uvhub_lock;
	atomic_t *v;

	v = &hmaster->active_descriptor_count;
	if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
		stat->s_throttles++;
		do {
			cpu_relax();
		} while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
	}
}

/*
 * Handle the completion status of a message send.
 */
static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
			struct bau_control *bcp, struct bau_control *hmaster,
			struct ptc_stats *stat)
{
	if (completion_status == FLUSH_RETRY_PLUGGED)
		destination_plugged(bau_desc, bcp, hmaster, stat);
	else if (completion_status == FLUSH_RETRY_TIMEOUT)
		destination_timeout(bau_desc, bcp, hmaster, stat);
}

/*
845
 * Send a broadcast and wait for it to complete.
846
 *
847
 * The flush_mask contains the cpus the broadcast is to be sent to including
848
 * cpus that are on the local uvhub.
849
 *
850 851 852
 * Returns 0 if all flushing represented in the mask was done.
 * Returns 1 if it gives up entirely and the original cpu mask is to be
 * returned to the kernel.
853
 */
854 855
int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
	struct bau_desc *bau_desc)
856
{
857
	int seq_number = 0;
C
Cliff Wickman 已提交
858
	int completion_stat = 0;
859
	int uv1 = 0;
860
	long try = 0;
861
	unsigned long index;
862 863
	cycles_t time1;
	cycles_t time2;
864
	struct ptc_stats *stat = bcp->statp;
865
	struct bau_control *hmaster = bcp->uvhub_master;
866 867
	struct uv1_bau_msg_header *uv1_hdr = NULL;
	struct uv2_bau_msg_header *uv2_hdr = NULL;
868

869 870
	if (bcp->uvhub_version == 1) {
		uv1 = 1;
C
Cliff Wickman 已提交
871
		uv1_throttle(hmaster, stat);
872
	}
C
Cliff Wickman 已提交
873

874 875
	while (hmaster->uvhub_quiesce)
		cpu_relax();
876 877

	time1 = get_cycles();
878 879 880 881 882
	if (uv1)
		uv1_hdr = &bau_desc->header.uv1_hdr;
	else
		uv2_hdr = &bau_desc->header.uv2_hdr;

883
	do {
884
		if (try == 0) {
885 886 887 888
			if (uv1)
				uv1_hdr->msg_type = MSG_REGULAR;
			else
				uv2_hdr->msg_type = MSG_REGULAR;
889 890
			seq_number = bcp->message_number++;
		} else {
891 892 893 894
			if (uv1)
				uv1_hdr->msg_type = MSG_RETRY;
			else
				uv2_hdr->msg_type = MSG_RETRY;
895 896
			stat->s_retry_messages++;
		}
C
Cliff Wickman 已提交
897

898 899 900 901
		if (uv1)
			uv1_hdr->sequence = seq_number;
		else
			uv2_hdr->sequence = seq_number;
902
		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
903
		bcp->send_message = get_cycles();
C
Cliff Wickman 已提交
904 905 906

		write_mmr_activation(index);

907
		try++;
C
Cliff Wickman 已提交
908 909 910
		completion_stat = wait_completion(bau_desc, bcp, try);

		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
911

912
		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
913
			bcp->ipi_attempts = 0;
914
			stat->s_overipilimit++;
C
Cliff Wickman 已提交
915
			completion_stat = FLUSH_GIVEUP;
916 917 918
			break;
		}
		cpu_relax();
C
Cliff Wickman 已提交
919 920 921
	} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
		 (completion_stat == FLUSH_RETRY_TIMEOUT));

922
	time2 = get_cycles();
C
Cliff Wickman 已提交
923 924 925

	count_max_concurr(completion_stat, bcp, hmaster);

926 927
	while (hmaster->uvhub_quiesce)
		cpu_relax();
C
Cliff Wickman 已提交
928

929
	atomic_dec(&hmaster->active_descriptor_count);
C
Cliff Wickman 已提交
930 931 932 933

	record_send_stats(time1, time2, bcp, stat, completion_stat, try);

	if (completion_stat == FLUSH_GIVEUP)
C
Cliff Wickman 已提交
934
		/* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
C
Cliff Wickman 已提交
935 936 937 938 939
		return 1;
	return 0;
}

/*
940 941 942
 * The BAU is disabled for this uvhub. When the disabled time period has
 * expired re-enable it.
 * Return 0 if it is re-enabled for all cpus on this uvhub.
C
Cliff Wickman 已提交
943 944 945 946 947
 */
static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
{
	int tcpu;
	struct bau_control *tbcp;
948
	struct bau_control *hmaster;
C
Cliff Wickman 已提交
949

950 951 952 953 954 955 956
	hmaster = bcp->uvhub_master;
	spin_lock(&hmaster->disable_lock);
	if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
		stat->s_bau_reenabled++;
		for_each_present_cpu(tcpu) {
			tbcp = &per_cpu(bau_control, tcpu);
			if (tbcp->uvhub_master == hmaster) {
C
Cliff Wickman 已提交
957 958 959
				tbcp->baudisabled = 0;
				tbcp->period_requests = 0;
				tbcp->period_time = 0;
960
				tbcp->period_giveups = 0;
961 962
			}
		}
963 964
		spin_unlock(&hmaster->disable_lock);
		return 0;
C
Cliff Wickman 已提交
965
	}
966
	spin_unlock(&hmaster->disable_lock);
C
Cliff Wickman 已提交
967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982
	return -1;
}

static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
				int remotes, struct bau_desc *bau_desc)
{
	stat->s_requestor++;
	stat->s_ntargcpu += remotes + locals;
	stat->s_ntargremotes += remotes;
	stat->s_ntarglocals += locals;

	/* uvhub statistics */
	hubs = bau_uvhub_weight(&bau_desc->distribution);
	if (locals) {
		stat->s_ntarglocaluvhub++;
		stat->s_ntargremoteuvhub += (hubs - 1);
983
	} else
C
Cliff Wickman 已提交
984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026
		stat->s_ntargremoteuvhub += hubs;

	stat->s_ntarguvhub += hubs;

	if (hubs >= 16)
		stat->s_ntarguvhub16++;
	else if (hubs >= 8)
		stat->s_ntarguvhub8++;
	else if (hubs >= 4)
		stat->s_ntarguvhub4++;
	else if (hubs >= 2)
		stat->s_ntarguvhub2++;
	else
		stat->s_ntarguvhub1++;
}

/*
 * Translate a cpu mask to the uvhub distribution mask in the BAU
 * activation descriptor.
 */
static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
			struct bau_desc *bau_desc, int *localsp, int *remotesp)
{
	int cpu;
	int pnode;
	int cnt = 0;
	struct hub_and_pnode *hpp;

	for_each_cpu(cpu, flush_mask) {
		/*
		 * The distribution vector is a bit map of pnodes, relative
		 * to the partition base pnode (and the partition base nasid
		 * in the header).
		 * Translate cpu to pnode and hub using a local memory array.
		 */
		hpp = &bcp->socket_master->thp[cpu];
		pnode = hpp->pnode - bcp->partition_base_pnode;
		bau_uvhub_set(pnode, &bau_desc->distribution);
		cnt++;
		if (hpp->uvhub == bcp->uvhub)
			(*localsp)++;
		else
			(*remotesp)++;
1027
	}
C
Cliff Wickman 已提交
1028 1029
	if (!cnt)
		return 1;
1030
	return 0;
1031 1032
}

C
Cliff Wickman 已提交
1033 1034
/*
 * globally purge translation cache of a virtual address or all TLB's
T
Tejun Heo 已提交
1035
 * @cpumask: mask of all cpu's in which the address is to be removed
1036
 * @mm: mm_struct containing virtual address range
1037 1038
 * @start: start virtual address to be removed from TLB
 * @end: end virtual address to be remove from TLB
T
Tejun Heo 已提交
1039
 * @cpu: the current cpu
1040 1041 1042 1043 1044 1045
 *
 * This is the entry point for initiating any UV global TLB shootdown.
 *
 * Purges the translation caches of all specified processors of the given
 * virtual address, or purges all TLB's on specified processors.
 *
T
Tejun Heo 已提交
1046 1047
 * The caller has derived the cpumask from the mm_struct.  This function
 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
1048
 *
1049 1050
 * The cpumask is converted into a uvhubmask of the uvhubs containing
 * those cpus.
1051
 *
T
Tejun Heo 已提交
1052 1053 1054 1055 1056
 * Note that this function should be called with preemption disabled.
 *
 * Returns NULL if all remote flushing was done.
 * Returns pointer to cpumask if some remote flushing remains to be
 * done.  The returned pointer is valid till preemption is re-enabled.
1057
 */
T
Tejun Heo 已提交
1058
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1059
				struct mm_struct *mm, unsigned long start,
1060
				unsigned long end, unsigned int cpu)
1061
{
1062
	int locals = 0;
1063 1064
	int remotes = 0;
	int hubs = 0;
1065
	struct bau_desc *bau_desc;
1066 1067 1068
	struct cpumask *flush_mask;
	struct ptc_stats *stat;
	struct bau_control *bcp;
1069 1070
	unsigned long descriptor_status;
	unsigned long status;
T
Tejun Heo 已提交
1071

1072
	bcp = &per_cpu(bau_control, cpu);
1073
	stat = bcp->statp;
1074 1075 1076 1077
	stat->s_enters++;

	if (bcp->nobau)
		return cpumask;
1078

1079 1080 1081 1082 1083 1084 1085 1086 1087 1088
	if (bcp->busy) {
		descriptor_status =
			read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
		status = ((descriptor_status >> (bcp->uvhub_cpu *
			UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1;
		if (status == UV2H_DESC_BUSY)
			return cpumask;
		bcp->busy = 0;
	}

1089 1090
	/* bau was disabled due to slow response */
	if (bcp->baudisabled) {
1091 1092
		if (check_enable(bcp, stat)) {
			stat->s_ipifordisabled++;
C
Cliff Wickman 已提交
1093
			return cpumask;
1094
		}
1095
	}
1096

1097 1098
	/*
	 * Each sending cpu has a per-cpu mask which it fills from the caller's
1099 1100
	 * cpu mask.  All cpus are converted to uvhubs and copied to the
	 * activation descriptor.
1101 1102
	 */
	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
1103
	/* don't actually do a shootdown of the local cpu */
1104
	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
C
Cliff Wickman 已提交
1105

1106
	if (cpu_isset(cpu, *cpumask))
1107
		stat->s_ntargself++;
1108

1109
	bau_desc = bcp->descriptor_base;
1110
	bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
1111
	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
C
Cliff Wickman 已提交
1112
	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
1113 1114
		return NULL;

C
Cliff Wickman 已提交
1115
	record_send_statistics(stat, locals, hubs, remotes, bau_desc);
1116

1117 1118 1119 1120
	if (!end || (end - start) <= PAGE_SIZE)
		bau_desc->payload.address = start;
	else
		bau_desc->payload.address = TLB_FLUSH_ALL;
T
Tejun Heo 已提交
1121
	bau_desc->payload.sending_cpu = cpu;
1122
	/*
1123 1124
	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
	 * or 1 if it gave up and the original cpumask should be returned.
1125
	 */
1126
	if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))
1127 1128 1129
		return NULL;
	else
		return cpumask;
1130 1131
}

C
Cliff Wickman 已提交
1132
/*
1133 1134
 * Search the message queue for any 'other' unprocessed message with the
 * same software acknowledge resource bit vector as the 'msg' message.
C
Cliff Wickman 已提交
1135 1136
 */
struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
1137
					   struct bau_control *bcp)
C
Cliff Wickman 已提交
1138 1139
{
	struct bau_pq_entry *msg_next = msg + 1;
1140
	unsigned char swack_vec = msg->swack_vec;
C
Cliff Wickman 已提交
1141 1142 1143

	if (msg_next > bcp->queue_last)
		msg_next = bcp->queue_first;
1144 1145 1146
	while (msg_next != msg) {
		if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) &&
				(msg_next->swack_vec == swack_vec))
C
Cliff Wickman 已提交
1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176
			return msg_next;
		msg_next++;
		if (msg_next > bcp->queue_last)
			msg_next = bcp->queue_first;
	}
	return NULL;
}

/*
 * UV2 needs to work around a bug in which an arriving message has not
 * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
 * Such a message must be ignored.
 */
void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
{
	unsigned long mmr_image;
	unsigned char swack_vec;
	struct bau_pq_entry *msg = mdp->msg;
	struct bau_pq_entry *other_msg;

	mmr_image = read_mmr_sw_ack();
	swack_vec = msg->swack_vec;

	if ((swack_vec & mmr_image) == 0) {
		/*
		 * This message was assigned a swack resource, but no
		 * reserved acknowlegment is pending.
		 * The bug has prevented this message from setting the MMR.
		 */
		/*
1177 1178
		 * Some message has set the MMR 'pending' bit; it might have
		 * been another message.  Look for that message.
C
Cliff Wickman 已提交
1179
		 */
1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193
		other_msg = find_another_by_swack(msg, bcp);
		if (other_msg) {
			/*
			 * There is another. Process this one but do not
			 * ack it.
			 */
			bau_process_message(mdp, bcp, 0);
			/*
			 * Let the natural processing of that other message
			 * acknowledge it. Don't get the processing of sw_ack's
			 * out of order.
			 */
			return;
		}
C
Cliff Wickman 已提交
1194 1195 1196
	}

	/*
1197 1198
	 * Either the MMR shows this one pending a reply or there is no
	 * other message using this sw_ack, so it is safe to acknowledge it.
C
Cliff Wickman 已提交
1199 1200 1201 1202 1203 1204
	 */
	bau_process_message(mdp, bcp, 1);

	return;
}

1205 1206 1207 1208 1209 1210
/*
 * The BAU message interrupt comes here. (registered by set_intr_gate)
 * See entry_64.S
 *
 * We received a broadcast assist message.
 *
1211
 * Interrupts are disabled; this interrupt could represent
1212 1213
 * the receipt of several messages.
 *
1214 1215
 * All cores/threads on this hub get this interrupt.
 * The last one to see it does the software ack.
1216
 * (the resource will not be freed until noninterruptable cpus see this
1217
 *  interrupt; hardware may timeout the s/w ack and reply ERROR)
1218
 */
1219
void uv_bau_message_interrupt(struct pt_regs *regs)
1220 1221
{
	int count = 0;
1222
	cycles_t time_start;
C
Cliff Wickman 已提交
1223
	struct bau_pq_entry *msg;
1224 1225 1226 1227
	struct bau_control *bcp;
	struct ptc_stats *stat;
	struct msg_desc msgdesc;

1228
	ack_APIC_irq();
1229
	time_start = get_cycles();
C
Cliff Wickman 已提交
1230

1231
	bcp = &per_cpu(bau_control, smp_processor_id());
1232
	stat = bcp->statp;
C
Cliff Wickman 已提交
1233 1234 1235 1236

	msgdesc.queue_first = bcp->queue_first;
	msgdesc.queue_last = bcp->queue_last;

1237
	msg = bcp->bau_msg_head;
C
Cliff Wickman 已提交
1238
	while (msg->swack_vec) {
1239
		count++;
C
Cliff Wickman 已提交
1240 1241

		msgdesc.msg_slot = msg - msgdesc.queue_first;
1242
		msgdesc.msg = msg;
C
Cliff Wickman 已提交
1243 1244 1245 1246
		if (bcp->uvhub_version == 2)
			process_uv2_message(&msgdesc, bcp);
		else
			bau_process_message(&msgdesc, bcp, 1);
C
Cliff Wickman 已提交
1247

1248
		msg++;
C
Cliff Wickman 已提交
1249 1250
		if (msg > msgdesc.queue_last)
			msg = msgdesc.queue_first;
1251
		bcp->bau_msg_head = msg;
1252
	}
1253
	stat->d_time += (get_cycles() - time_start);
1254
	if (!count)
1255
		stat->d_nomsg++;
1256
	else if (count > 1)
1257
		stat->d_multmsg++;
1258 1259
}

C
Cliff Wickman 已提交
1260
/*
C
Cliff Wickman 已提交
1261
 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
C
Cliff Wickman 已提交
1262 1263 1264 1265
 * shootdown message timeouts enabled.  The timeout does not cause
 * an interrupt, but causes an error message to be returned to
 * the sender.
 */
C
Cliff Wickman 已提交
1266
static void __init enable_timeouts(void)
1267
{
1268 1269
	int uvhub;
	int nuvhubs;
1270
	int pnode;
C
Cliff Wickman 已提交
1271
	unsigned long mmr_image;
1272

1273
	nuvhubs = uv_num_possible_blades();
1274

1275 1276
	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
		if (!uv_blade_nr_possible_cpus(uvhub))
1277
			continue;
C
Cliff Wickman 已提交
1278

1279
		pnode = uv_blade_to_pnode(uvhub);
C
Cliff Wickman 已提交
1280
		mmr_image = read_mmr_misc_control(pnode);
C
Cliff Wickman 已提交
1281 1282 1283 1284 1285 1286
		/*
		 * Set the timeout period and then lock it in, in three
		 * steps; captures and locks in the period.
		 *
		 * To program the period, the SOFT_ACK_MODE must be off.
		 */
C
Cliff Wickman 已提交
1287 1288
		mmr_image &= ~(1L << SOFTACK_MSHIFT);
		write_mmr_misc_control(pnode, mmr_image);
C
Cliff Wickman 已提交
1289 1290 1291
		/*
		 * Set the 4-bit period.
		 */
C
Cliff Wickman 已提交
1292 1293 1294
		mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
		mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
		write_mmr_misc_control(pnode, mmr_image);
C
Cliff Wickman 已提交
1295
		/*
1296
		 * UV1:
C
Cliff Wickman 已提交
1297 1298 1299 1300
		 * Subsequent reversals of the timebase bit (3) cause an
		 * immediate timeout of one or all INTD resources as
		 * indicated in bits 2:0 (7 causes all of them to timeout).
		 */
C
Cliff Wickman 已提交
1301
		mmr_image |= (1L << SOFTACK_MSHIFT);
1302
		if (is_uv2_hub()) {
1303 1304
			/* hw bug workaround; do not use extended status */
			mmr_image &= ~(1L << UV2_EXT_SHFT);
1305
		}
C
Cliff Wickman 已提交
1306
		write_mmr_misc_control(pnode, mmr_image);
1307 1308 1309
	}
}

C
Cliff Wickman 已提交
1310
static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
1311 1312 1313 1314 1315 1316
{
	if (*offset < num_possible_cpus())
		return offset;
	return NULL;
}

C
Cliff Wickman 已提交
1317
static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
1318 1319 1320 1321 1322 1323 1324
{
	(*offset)++;
	if (*offset < num_possible_cpus())
		return offset;
	return NULL;
}

C
Cliff Wickman 已提交
1325
static void ptc_seq_stop(struct seq_file *file, void *data)
1326 1327 1328
{
}

C
Cliff Wickman 已提交
1329
static inline unsigned long long usec_2_cycles(unsigned long microsec)
1330 1331 1332 1333
{
	unsigned long ns;
	unsigned long long cyc;

1334
	ns = microsec * 1000;
1335 1336 1337 1338
	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
	return cyc;
}

1339
/*
C
Cliff Wickman 已提交
1340
 * Display the statistics thru /proc/sgi_uv/ptc_statistics
1341
 * 'data' points to the cpu number
C
Cliff Wickman 已提交
1342
 * Note: see the descriptions in stat_description[].
1343
 */
C
Cliff Wickman 已提交
1344
static int ptc_seq_show(struct seq_file *file, void *data)
1345 1346
{
	struct ptc_stats *stat;
1347
	struct bau_control *bcp;
1348 1349 1350 1351 1352
	int cpu;

	cpu = *(loff_t *)data;
	if (!cpu) {
		seq_printf(file,
1353
		 "# cpu bauoff sent stime self locals remotes ncpus localhub ");
1354 1355
		seq_printf(file,
			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
1356
		seq_printf(file,
1357 1358 1359
			"numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");
		seq_printf(file,
			"rok resetp resett giveup sto bz throt disable ");
1360
		seq_printf(file,
1361
			"enable wars warshw warwaits enters ipidis plugged ");
1362
		seq_printf(file,
1363
			"ipiover glim cong swack recv rtime all one mult ");
1364
		seq_printf(file,
1365
			"none retry canc nocan reset rcan\n");
1366 1367
	}
	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
1368 1369
		bcp = &per_cpu(bau_control, cpu);
		stat = bcp->statp;
1370 1371
		/* source side statistics */
		seq_printf(file,
1372
			"cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1373 1374
			   cpu, bcp->nobau, stat->s_requestor,
			   cycles_2_us(stat->s_time),
1375 1376 1377 1378
			   stat->s_ntargself, stat->s_ntarglocals,
			   stat->s_ntargremotes, stat->s_ntargcpu,
			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
			   stat->s_ntarguvhub, stat->s_ntarguvhub16);
1379
		seq_printf(file, "%ld %ld %ld %ld %ld %ld ",
1380 1381
			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
1382
			   stat->s_dtimeout, stat->s_strongnacks);
1383
		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
1384 1385 1386
			   stat->s_retry_messages, stat->s_retriesok,
			   stat->s_resets_plug, stat->s_resets_timeout,
			   stat->s_giveup, stat->s_stimeout,
1387 1388 1389 1390 1391 1392 1393 1394
			   stat->s_busy, stat->s_throttles);
		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
			   stat->s_bau_disabled, stat->s_bau_reenabled,
			   stat->s_uv2_wars, stat->s_uv2_wars_hw,
			   stat->s_uv2_war_waits, stat->s_enters,
			   stat->s_ipifordisabled, stat->s_plugged,
			   stat->s_overipilimit, stat->s_giveuplimit,
			   stat->s_congested);
1395

1396 1397
		/* destination side statistics */
		seq_printf(file,
1398
			"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
C
Cliff Wickman 已提交
1399
			   read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
1400 1401 1402 1403 1404
			   stat->d_requestee, cycles_2_us(stat->d_time),
			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
			   stat->d_nocanceled, stat->d_resets,
			   stat->d_rcanceled);
1405 1406 1407 1408
	}
	return 0;
}

1409 1410 1411 1412
/*
 * Display the tunables thru debugfs
 */
static ssize_t tunables_read(struct file *file, char __user *userbuf,
C
Cliff Wickman 已提交
1413
				size_t count, loff_t *ppos)
1414
{
1415
	char *buf;
1416 1417
	int ret;

1418 1419 1420 1421
	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n",
		"max_concur plugged_delay plugsb4reset timeoutsb4reset",
		"ipi_reset_limit complete_threshold congested_response_us",
		"congested_reps disabled_period giveup_limit",
C
Cliff Wickman 已提交
1422
		max_concurr, plugged_delay, plugsb4reset,
1423
		timeoutsb4reset, ipi_reset_limit, complete_threshold,
1424 1425
		congested_respns_us, congested_reps, disabled_period,
		giveup_limit);
1426

1427 1428 1429 1430 1431 1432
	if (!buf)
		return -ENOMEM;

	ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
	kfree(buf);
	return ret;
1433 1434
}

1435
/*
C
Cliff Wickman 已提交
1436 1437
 * handle a write to /proc/sgi_uv/ptc_statistics
 * -1: reset the statistics
1438 1439
 *  0: display meaning of the statistics
 */
C
Cliff Wickman 已提交
1440 1441
static ssize_t ptc_proc_write(struct file *file, const char __user *user,
				size_t count, loff_t *data)
1442
{
1443
	int cpu;
C
Cliff Wickman 已提交
1444 1445
	int i;
	int elements;
1446
	long input_arg;
1447
	char optstr[64];
1448
	struct ptc_stats *stat;
1449

1450
	if (count == 0 || count > sizeof(optstr))
1451
		return -EINVAL;
1452 1453 1454
	if (copy_from_user(optstr, user, count))
		return -EFAULT;
	optstr[count - 1] = '\0';
C
Cliff Wickman 已提交
1455

1456 1457 1458 1459 1460 1461 1462 1463
	if (!strcmp(optstr, "on")) {
		set_bau_on();
		return count;
	} else if (!strcmp(optstr, "off")) {
		set_bau_off();
		return count;
	}

1464
	if (strict_strtol(optstr, 10, &input_arg) < 0) {
1465 1466 1467 1468
		printk(KERN_DEBUG "%s is invalid\n", optstr);
		return -EINVAL;
	}

1469
	if (input_arg == 0) {
1470
		elements = ARRAY_SIZE(stat_description);
1471
		printk(KERN_DEBUG "# cpu:      cpu number\n");
1472
		printk(KERN_DEBUG "Sender statistics:\n");
C
Cliff Wickman 已提交
1473 1474
		for (i = 0; i < elements; i++)
			printk(KERN_DEBUG "%s\n", stat_description[i]);
1475 1476 1477 1478 1479
	} else if (input_arg == -1) {
		for_each_present_cpu(cpu) {
			stat = &per_cpu(ptcstats, cpu);
			memset(stat, 0, sizeof(struct ptc_stats));
		}
1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495
	}

	return count;
}

static int local_atoi(const char *name)
{
	int val = 0;

	for (;; name++) {
		switch (*name) {
		case '0' ... '9':
			val = 10*val+(*name-'0');
			break;
		default:
			return val;
1496
		}
1497
	}
1498 1499 1500
}

/*
C
Cliff Wickman 已提交
1501 1502
 * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
 * Zero values reset them to defaults.
1503
 */
C
Cliff Wickman 已提交
1504 1505
static int parse_tunables_write(struct bau_control *bcp, char *instr,
				int count)
1506 1507 1508
{
	char *p;
	char *q;
C
Cliff Wickman 已提交
1509 1510
	int cnt = 0;
	int val;
1511
	int e = ARRAY_SIZE(tunables);
1512 1513 1514 1515 1516 1517 1518 1519 1520

	p = instr + strspn(instr, WHITESPACE);
	q = p;
	for (; *p; p = q + strspn(q, WHITESPACE)) {
		q = p + strcspn(p, WHITESPACE);
		cnt++;
		if (q == p)
			break;
	}
C
Cliff Wickman 已提交
1521 1522
	if (cnt != e) {
		printk(KERN_INFO "bau tunable error: should be %d values\n", e);
1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533
		return -EINVAL;
	}

	p = instr + strspn(instr, WHITESPACE);
	q = p;
	for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
		q = p + strcspn(p, WHITESPACE);
		val = local_atoi(p);
		switch (cnt) {
		case 0:
			if (val == 0) {
C
Cliff Wickman 已提交
1534 1535
				max_concurr = MAX_BAU_CONCURRENT;
				max_concurr_const = MAX_BAU_CONCURRENT;
1536 1537 1538 1539 1540 1541 1542 1543
				continue;
			}
			if (val < 1 || val > bcp->cpus_in_uvhub) {
				printk(KERN_DEBUG
				"Error: BAU max concurrent %d is invalid\n",
				val);
				return -EINVAL;
			}
C
Cliff Wickman 已提交
1544 1545
			max_concurr = val;
			max_concurr_const = val;
1546
			continue;
C
Cliff Wickman 已提交
1547
		default:
1548
			if (val == 0)
C
Cliff Wickman 已提交
1549
				*tunables[cnt].tunp = tunables[cnt].deflt;
1550
			else
C
Cliff Wickman 已提交
1551
				*tunables[cnt].tunp = val;
1552 1553 1554 1555 1556
			continue;
		}
		if (q == p)
			break;
	}
C
Cliff Wickman 已提交
1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577
	return 0;
}

/*
 * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
 */
static ssize_t tunables_write(struct file *file, const char __user *user,
				size_t count, loff_t *data)
{
	int cpu;
	int ret;
	char instr[100];
	struct bau_control *bcp;

	if (count == 0 || count > sizeof(instr)-1)
		return -EINVAL;
	if (copy_from_user(instr, user, count))
		return -EFAULT;

	instr[count] = '\0';

1578 1579
	cpu = get_cpu();
	bcp = &per_cpu(bau_control, cpu);
C
Cliff Wickman 已提交
1580
	ret = parse_tunables_write(bcp, instr, count);
1581
	put_cpu();
C
Cliff Wickman 已提交
1582 1583 1584
	if (ret)
		return ret;

1585 1586
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
C
Cliff Wickman 已提交
1587 1588 1589 1590 1591 1592 1593 1594 1595
		bcp->max_concurr =		max_concurr;
		bcp->max_concurr_const =	max_concurr;
		bcp->plugged_delay =		plugged_delay;
		bcp->plugsb4reset =		plugsb4reset;
		bcp->timeoutsb4reset =		timeoutsb4reset;
		bcp->ipi_reset_limit =		ipi_reset_limit;
		bcp->complete_threshold =	complete_threshold;
		bcp->cong_response_us =		congested_respns_us;
		bcp->cong_reps =		congested_reps;
1596 1597
		bcp->disabled_period =		sec_2_cycles(disabled_period);
		bcp->giveup_limit =		giveup_limit;
1598
	}
1599 1600 1601 1602
	return count;
}

static const struct seq_operations uv_ptc_seq_ops = {
C
Cliff Wickman 已提交
1603 1604 1605 1606
	.start		= ptc_seq_start,
	.next		= ptc_seq_next,
	.stop		= ptc_seq_stop,
	.show		= ptc_seq_show
1607 1608
};

C
Cliff Wickman 已提交
1609
static int ptc_proc_open(struct inode *inode, struct file *file)
1610 1611 1612 1613
{
	return seq_open(file, &uv_ptc_seq_ops);
}

1614 1615 1616 1617 1618
static int tunables_open(struct inode *inode, struct file *file)
{
	return 0;
}

1619
static const struct file_operations proc_uv_ptc_operations = {
C
Cliff Wickman 已提交
1620
	.open		= ptc_proc_open,
1621
	.read		= seq_read,
C
Cliff Wickman 已提交
1622
	.write		= ptc_proc_write,
1623 1624
	.llseek		= seq_lseek,
	.release	= seq_release,
1625 1626
};

1627 1628 1629 1630
static const struct file_operations tunables_fops = {
	.open		= tunables_open,
	.read		= tunables_read,
	.write		= tunables_write,
1631
	.llseek		= default_llseek,
1632 1633
};

1634
static int __init uv_ptc_init(void)
1635
{
1636
	struct proc_dir_entry *proc_uv_ptc;
1637 1638 1639 1640

	if (!is_uv_system())
		return 0;

1641 1642
	proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
				  &proc_uv_ptc_operations);
1643 1644 1645 1646 1647
	if (!proc_uv_ptc) {
		printk(KERN_ERR "unable to create %s proc entry\n",
		       UV_PTC_BASENAME);
		return -EINVAL;
	}
1648 1649 1650 1651 1652 1653 1654 1655

	tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
	if (!tunables_dir) {
		printk(KERN_ERR "unable to create debugfs directory %s\n",
		       UV_BAU_TUNABLES_DIR);
		return -EINVAL;
	}
	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
C
Cliff Wickman 已提交
1656
					tunables_dir, NULL, &tunables_fops);
1657 1658 1659 1660 1661
	if (!tunables_file) {
		printk(KERN_ERR "unable to create debugfs file %s\n",
		       UV_BAU_TUNABLES_FILE);
		return -EINVAL;
	}
1662 1663 1664 1665
	return 0;
}

/*
1666
 * Initialize the sending side's sending buffers.
1667
 */
C
Cliff Wickman 已提交
1668
static void activation_descriptor_init(int node, int pnode, int base_pnode)
1669 1670
{
	int i;
1671
	int cpu;
1672
	int uv1 = 0;
1673
	unsigned long gpa;
1674
	unsigned long m;
1675
	unsigned long n;
C
Cliff Wickman 已提交
1676
	size_t dsize;
1677 1678
	struct bau_desc *bau_desc;
	struct bau_desc *bd2;
1679 1680
	struct uv1_bau_msg_header *uv1_hdr;
	struct uv2_bau_msg_header *uv2_hdr;
1681
	struct bau_control *bcp;
1682

1683
	/*
C
Cliff Wickman 已提交
1684 1685
	 * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
	 * per cpu; and one per cpu on the uvhub (ADP_SZ)
1686
	 */
C
Cliff Wickman 已提交
1687 1688
	dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
	bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
1689
	BUG_ON(!bau_desc);
1690

1691 1692 1693
	gpa = uv_gpa(bau_desc);
	n = uv_gpa_to_gnode(gpa);
	m = uv_gpa_to_offset(gpa);
1694 1695
	if (is_uv1_hub())
		uv1 = 1;
1696

1697
	/* the 14-bit pnode */
C
Cliff Wickman 已提交
1698
	write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
1699
	/*
C
Cliff Wickman 已提交
1700
	 * Initializing all 8 (ITEMS_PER_DESC) descriptors for each
1701
	 * cpu even though we only use the first one; one descriptor can
1702
	 * describe a broadcast to 256 uv hubs.
1703
	 */
C
Cliff Wickman 已提交
1704
	for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
1705
		memset(bd2, 0, sizeof(struct bau_desc));
1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725
		if (uv1) {
			uv1_hdr = &bd2->header.uv1_hdr;
			uv1_hdr->swack_flag =	1;
			/*
			 * The base_dest_nasid set in the message header
			 * is the nasid of the first uvhub in the partition.
			 * The bit map will indicate destination pnode numbers
			 * relative to that base. They may not be consecutive
			 * if nasid striding is being used.
			 */
			uv1_hdr->base_dest_nasid =
						UV_PNODE_TO_NASID(base_pnode);
			uv1_hdr->dest_subnodeid =	UV_LB_SUBNODEID;
			uv1_hdr->command =		UV_NET_ENDPOINT_INTD;
			uv1_hdr->int_both =		1;
			/*
			 * all others need to be set to zero:
			 *   fairness chaining multilevel count replied_to
			 */
		} else {
1726 1727 1728 1729
			/*
			 * BIOS uses legacy mode, but UV2 hardware always
			 * uses native mode for selective broadcasts.
			 */
1730 1731 1732 1733 1734 1735 1736
			uv2_hdr = &bd2->header.uv2_hdr;
			uv2_hdr->swack_flag =	1;
			uv2_hdr->base_dest_nasid =
						UV_PNODE_TO_NASID(base_pnode);
			uv2_hdr->dest_subnodeid =	UV_LB_SUBNODEID;
			uv2_hdr->command =		UV_NET_ENDPOINT_INTD;
		}
1737
	}
1738 1739 1740 1741 1742 1743
	for_each_present_cpu(cpu) {
		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
			continue;
		bcp = &per_cpu(bau_control, cpu);
		bcp->descriptor_base = bau_desc;
	}
1744 1745 1746 1747
}

/*
 * initialize the destination side's receiving buffers
1748 1749 1750
 * entered for each uvhub in the partition
 * - node is first node (kernel memory notion) on the uvhub
 * - pnode is the uvhub's physical identifier
1751
 */
C
Cliff Wickman 已提交
1752
static void pq_init(int node, int pnode)
1753
{
1754
	int cpu;
C
Cliff Wickman 已提交
1755
	size_t plsize;
1756
	char *cp;
C
Cliff Wickman 已提交
1757 1758 1759 1760 1761 1762
	void *vp;
	unsigned long pn;
	unsigned long first;
	unsigned long pn_first;
	unsigned long last;
	struct bau_pq_entry *pqp;
1763
	struct bau_control *bcp;
1764

C
Cliff Wickman 已提交
1765 1766 1767
	plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
	vp = kmalloc_node(plsize, GFP_KERNEL, node);
	pqp = (struct bau_pq_entry *)vp;
1768
	BUG_ON(!pqp);
1769

1770
	cp = (char *)pqp + 31;
C
Cliff Wickman 已提交
1771
	pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);
1772 1773 1774 1775 1776 1777

	for_each_present_cpu(cpu) {
		if (pnode != uv_cpu_to_pnode(cpu))
			continue;
		/* for every cpu on this pnode: */
		bcp = &per_cpu(bau_control, cpu);
C
Cliff Wickman 已提交
1778 1779 1780
		bcp->queue_first	= pqp;
		bcp->bau_msg_head	= pqp;
		bcp->queue_last		= pqp + (DEST_Q_SIZE - 1);
1781
	}
1782
	/*
1783
	 * need the gnode of where the memory was really allocated
1784
	 */
1785
	pn = uv_gpa_to_gnode(uv_gpa(pqp));
C
Cliff Wickman 已提交
1786 1787 1788 1789 1790 1791
	first = uv_physnodeaddr(pqp);
	pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
	last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
	write_mmr_payload_first(pnode, pn_first);
	write_mmr_payload_tail(pnode, first);
	write_mmr_payload_last(pnode, last);
C
Cliff Wickman 已提交
1792
	write_gmmr_sw_ack(pnode, 0xffffUL);
C
Cliff Wickman 已提交
1793

1794
	/* in effect, all msg_type's are set to MSG_NOOP */
C
Cliff Wickman 已提交
1795
	memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
1796
}
1797

1798
/*
1799
 * Initialization of each UV hub's structures
1800
 */
C
Cliff Wickman 已提交
1801
static void __init init_uvhub(int uvhub, int vector, int base_pnode)
1802
{
1803
	int node;
1804 1805
	int pnode;
	unsigned long apicid;
1806 1807 1808

	node = uvhub_to_first_node(uvhub);
	pnode = uv_blade_to_pnode(uvhub);
C
Cliff Wickman 已提交
1809 1810 1811 1812

	activation_descriptor_init(node, pnode, base_pnode);

	pq_init(node, pnode);
1813
	/*
1814 1815
	 * The below initialization can't be in firmware because the
	 * messaging IRQ will be determined by the OS.
1816
	 */
1817
	apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
C
Cliff Wickman 已提交
1818
	write_mmr_data_config(pnode, ((apicid << 32) | vector));
1819 1820
}

1821 1822 1823
/*
 * We will set BAU_MISC_CONTROL with a timeout period.
 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
C
Cliff Wickman 已提交
1824
 * So the destination timeout period has to be calculated from them.
1825
 */
C
Cliff Wickman 已提交
1826
static int calculate_destination_timeout(void)
1827 1828 1829 1830 1831 1832 1833 1834 1835
{
	unsigned long mmr_image;
	int mult1;
	int mult2;
	int index;
	int base;
	int ret;
	unsigned long ts_ns;

1836
	if (is_uv1_hub()) {
C
Cliff Wickman 已提交
1837
		mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1838 1839 1840 1841
		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
		index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
		mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
		mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1842 1843
		ts_ns = timeout_base_ns[index];
		ts_ns *= (mult1 * mult2);
1844 1845
		ret = ts_ns / 1000;
	} else {
1846 1847
		/* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */
		mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
1848
		mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
C
Cliff Wickman 已提交
1849
		if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
1850
			base = 80;
1851
		else
1852 1853
			base = 10;
		mult1 = mmr_image & UV2_ACK_MASK;
1854 1855
		ret = mult1 * base;
	}
1856 1857 1858
	return ret;
}

C
Cliff Wickman 已提交
1859 1860 1861 1862 1863 1864 1865 1866
static void __init init_per_cpu_tunables(void)
{
	int cpu;
	struct bau_control *bcp;

	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->baudisabled		= 0;
1867 1868
		if (nobau)
			bcp->nobau		= 1;
C
Cliff Wickman 已提交
1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880
		bcp->statp			= &per_cpu(ptcstats, cpu);
		/* time interval to catch a hardware stay-busy bug */
		bcp->timeout_interval		= usec_2_cycles(2*timeout_us);
		bcp->max_concurr		= max_concurr;
		bcp->max_concurr_const		= max_concurr;
		bcp->plugged_delay		= plugged_delay;
		bcp->plugsb4reset		= plugsb4reset;
		bcp->timeoutsb4reset		= timeoutsb4reset;
		bcp->ipi_reset_limit		= ipi_reset_limit;
		bcp->complete_threshold		= complete_threshold;
		bcp->cong_response_us		= congested_respns_us;
		bcp->cong_reps			= congested_reps;
1881 1882
		bcp->disabled_period =		sec_2_cycles(disabled_period);
		bcp->giveup_limit =		giveup_limit;
1883 1884
		spin_lock_init(&bcp->queue_lock);
		spin_lock_init(&bcp->uvhub_lock);
1885
		spin_lock_init(&bcp->disable_lock);
C
Cliff Wickman 已提交
1886 1887 1888
	}
}

1889
/*
C
Cliff Wickman 已提交
1890
 * Scan all cpus to collect blade and socket summaries.
1891
 */
C
Cliff Wickman 已提交
1892 1893 1894
static int __init get_cpu_topology(int base_pnode,
					struct uvhub_desc *uvhub_descs,
					unsigned char *uvhub_mask)
1895 1896 1897 1898
{
	int cpu;
	int pnode;
	int uvhub;
C
Cliff Wickman 已提交
1899
	int socket;
1900 1901 1902 1903 1904 1905
	struct bau_control *bcp;
	struct uvhub_desc *bdp;
	struct socket_desc *sdp;

	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
C
Cliff Wickman 已提交
1906

1907
		memset(bcp, 0, sizeof(struct bau_control));
C
Cliff Wickman 已提交
1908

1909
		pnode = uv_cpu_hub_info(cpu)->pnode;
C
Cliff Wickman 已提交
1910
		if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
1911 1912
			printk(KERN_EMERG
				"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
C
Cliff Wickman 已提交
1913
				cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
1914 1915
			return 1;
		}
C
Cliff Wickman 已提交
1916

1917
		bcp->osnode = cpu_to_node(cpu);
C
Cliff Wickman 已提交
1918 1919
		bcp->partition_base_pnode = base_pnode;

1920
		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
C
Cliff Wickman 已提交
1921
		*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1922
		bdp = &uvhub_descs[uvhub];
C
Cliff Wickman 已提交
1923

1924 1925 1926
		bdp->num_cpus++;
		bdp->uvhub = uvhub;
		bdp->pnode = pnode;
C
Cliff Wickman 已提交
1927

1928 1929
		/* kludge: 'assuming' one node per socket, and assuming that
		   disabling a socket just leaves a gap in node numbers */
1930
		socket = bcp->osnode & 1;
1931
		bdp->socket_mask |= (1 << socket);
1932 1933 1934
		sdp = &bdp->socket[socket];
		sdp->cpu_number[sdp->num_cpus] = cpu;
		sdp->num_cpus++;
1935
		if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
C
Cliff Wickman 已提交
1936 1937
			printk(KERN_EMERG "%d cpus per socket invalid\n",
				sdp->num_cpus);
1938 1939
			return 1;
		}
1940
	}
C
Cliff Wickman 已提交
1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959
	return 0;
}

/*
 * Each socket is to get a local array of pnodes/hubs.
 */
static void make_per_cpu_thp(struct bau_control *smaster)
{
	int cpu;
	size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();

	smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
	memset(smaster->thp, 0, hpsz);
	for_each_present_cpu(cpu) {
		smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
		smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
	}
}

1960 1961 1962 1963 1964 1965 1966 1967 1968 1969
/*
 * Each uvhub is to get a local cpumask.
 */
static void make_per_hub_cpumask(struct bau_control *hmaster)
{
	int sz = sizeof(cpumask_t);

	hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode);
}

C
Cliff Wickman 已提交
1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
/*
 * Initialize all the per_cpu information for the cpu's on a given socket,
 * given what has been gathered into the socket_desc struct.
 * And reports the chosen hub and socket masters back to the caller.
 */
static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
			struct bau_control **smasterp,
			struct bau_control **hmasterp)
{
	int i;
	int cpu;
	struct bau_control *bcp;

	for (i = 0; i < sdp->num_cpus; i++) {
		cpu = sdp->cpu_number[i];
		bcp = &per_cpu(bau_control, cpu);
		bcp->cpu = cpu;
		if (i == 0) {
			*smasterp = bcp;
			if (!(*hmasterp))
				*hmasterp = bcp;
		}
		bcp->cpus_in_uvhub = bdp->num_cpus;
		bcp->cpus_in_socket = sdp->num_cpus;
		bcp->socket_master = *smasterp;
		bcp->uvhub = bdp->uvhub;
1996 1997 1998 1999 2000 2001 2002 2003
		if (is_uv1_hub())
			bcp->uvhub_version = 1;
		else if (is_uv2_hub())
			bcp->uvhub_version = 2;
		else {
			printk(KERN_EMERG "uvhub version not 1 or 2\n");
			return 1;
		}
C
Cliff Wickman 已提交
2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
		bcp->uvhub_master = *hmasterp;
		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
			printk(KERN_EMERG "%d cpus per uvhub invalid\n",
				bcp->uvhub_cpu);
			return 1;
		}
	}
	return 0;
}

/*
 * Summarize the blade and socket topology into the per_cpu structures.
 */
static int __init summarize_uvhub_sockets(int nuvhubs,
			struct uvhub_desc *uvhub_descs,
			unsigned char *uvhub_mask)
{
	int socket;
	int uvhub;
	unsigned short socket_mask;

C
Cliff Wickman 已提交
2026
	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
C
Cliff Wickman 已提交
2027 2028 2029 2030
		struct uvhub_desc *bdp;
		struct bau_control *smaster = NULL;
		struct bau_control *hmaster = NULL;

C
Cliff Wickman 已提交
2031 2032
		if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
			continue;
C
Cliff Wickman 已提交
2033

2034
		bdp = &uvhub_descs[uvhub];
2035 2036 2037
		socket_mask = bdp->socket_mask;
		socket = 0;
		while (socket_mask) {
C
Cliff Wickman 已提交
2038 2039 2040 2041
			struct socket_desc *sdp;
			if ((socket_mask & 1)) {
				sdp = &bdp->socket[socket];
				if (scan_sock(sdp, bdp, &smaster, &hmaster))
2042
					return 1;
2043
				make_per_cpu_thp(smaster);
2044 2045
			}
			socket++;
2046
			socket_mask = (socket_mask >> 1);
2047
		}
2048
		make_per_hub_cpumask(hmaster);
2049
	}
C
Cliff Wickman 已提交
2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069
	return 0;
}

/*
 * initialize the bau_control structure for each cpu
 */
static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
{
	unsigned char *uvhub_mask;
	void *vp;
	struct uvhub_desc *uvhub_descs;

	timeout_us = calculate_destination_timeout();

	vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
	uvhub_descs = (struct uvhub_desc *)vp;
	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);

	if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
2070
		goto fail;
C
Cliff Wickman 已提交
2071 2072

	if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
2073
		goto fail;
C
Cliff Wickman 已提交
2074

2075
	kfree(uvhub_descs);
C
Cliff Wickman 已提交
2076
	kfree(uvhub_mask);
C
Cliff Wickman 已提交
2077
	init_per_cpu_tunables();
2078
	return 0;
2079 2080 2081 2082 2083

fail:
	kfree(uvhub_descs);
	kfree(uvhub_mask);
	return 1;
2084 2085 2086 2087 2088 2089 2090
}

/*
 * Initialization of BAU-related structures
 */
static int __init uv_bau_init(void)
{
2091 2092 2093
	int uvhub;
	int pnode;
	int nuvhubs;
2094
	int cur_cpu;
C
Cliff Wickman 已提交
2095
	int cpus;
2096
	int vector;
C
Cliff Wickman 已提交
2097
	cpumask_var_t *mask;
2098 2099 2100

	if (!is_uv_system())
		return 0;
2101

C
Cliff Wickman 已提交
2102 2103 2104 2105
	for_each_possible_cpu(cur_cpu) {
		mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
		zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
	}
2106

2107
	nuvhubs = uv_num_possible_blades();
C
Cliff Wickman 已提交
2108
	congested_cycles = usec_2_cycles(congested_respns_us);
2109

C
Cliff Wickman 已提交
2110
	uv_base_pnode = 0x7fffffff;
2111
	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
C
Cliff Wickman 已提交
2112 2113 2114
		cpus = uv_blade_nr_possible_cpus(uvhub);
		if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
			uv_base_pnode = uv_blade_to_pnode(uvhub);
2115 2116
	}

2117 2118
	enable_timeouts();

C
Cliff Wickman 已提交
2119
	if (init_per_cpu(nuvhubs, uv_base_pnode)) {
2120 2121
		set_bau_off();
		nobau_perm = 1;
2122 2123
		return 0;
	}
2124 2125 2126 2127

	vector = UV_BAU_MESSAGE;
	for_each_possible_blade(uvhub)
		if (uv_blade_nr_possible_cpus(uvhub))
C
Cliff Wickman 已提交
2128
			init_uvhub(uvhub, vector, uv_base_pnode);
2129 2130 2131 2132

	alloc_intr_gate(vector, uv_bau_message_intr1);

	for_each_possible_blade(uvhub) {
2133
		if (uv_blade_nr_possible_cpus(uvhub)) {
C
Cliff Wickman 已提交
2134 2135
			unsigned long val;
			unsigned long mmr;
2136 2137
			pnode = uv_blade_to_pnode(uvhub);
			/* INIT the bau */
C
Cliff Wickman 已提交
2138 2139
			val = 1L << 63;
			write_gmmr_activation(pnode, val);
2140
			mmr = 1; /* should be 1 to broadcast to both sockets */
2141 2142
			if (!is_uv1_hub())
				write_mmr_data_broadcast(pnode, mmr);
2143
		}
2144
	}
2145

2146 2147
	return 0;
}
2148
core_initcall(uv_bau_init);
2149
fs_initcall(uv_ptc_init);