tlb_uv.c 49.7 KB
Newer Older
1 2 3
/*
 *	SGI UltraViolet TLB flush routines.
 *
C
Cliff Wickman 已提交
4
 *	(c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI.
5 6 7 8
 *
 *	This code is released under the GNU General Public License version 2 or
 *	later.
 */
9
#include <linux/seq_file.h>
10
#include <linux/proc_fs.h>
11
#include <linux/debugfs.h>
12
#include <linux/kernel.h>
13
#include <linux/slab.h>
14
#include <linux/delay.h>
15 16

#include <asm/mmu_context.h>
T
Tejun Heo 已提交
17
#include <asm/uv/uv.h>
18
#include <asm/uv/uv_mmrs.h>
19
#include <asm/uv/uv_hub.h>
20
#include <asm/uv/uv_bau.h>
I
Ingo Molnar 已提交
21
#include <asm/apic.h>
22
#include <asm/idle.h>
23
#include <asm/tsc.h>
24
#include <asm/irq_vectors.h>
25
#include <asm/timer.h>
26

27 28 29 30 31 32 33 34 35 36 37
/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
static int timeout_base_ns[] = {
		20,
		160,
		1280,
		10240,
		81920,
		655360,
		5242880,
		167772160
};
C
Cliff Wickman 已提交
38

39
static int timeout_us;
40
static int nobau;
41 42 43
static int baudisabled;
static spinlock_t disable_lock;
static cycles_t congested_cycles;
44

45
/* tunables: */
C
Cliff Wickman 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
static int max_concurr		= MAX_BAU_CONCURRENT;
static int max_concurr_const	= MAX_BAU_CONCURRENT;
static int plugged_delay	= PLUGGED_DELAY;
static int plugsb4reset		= PLUGSB4RESET;
static int timeoutsb4reset	= TIMEOUTSB4RESET;
static int ipi_reset_limit	= IPI_RESET_LIMIT;
static int complete_threshold	= COMPLETE_THRESHOLD;
static int congested_respns_us	= CONGESTED_RESPONSE_US;
static int congested_reps	= CONGESTED_REPS;
static int congested_period	= CONGESTED_PERIOD;

static struct tunables tunables[] = {
	{&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
	{&plugged_delay, PLUGGED_DELAY},
	{&plugsb4reset, PLUGSB4RESET},
	{&timeoutsb4reset, TIMEOUTSB4RESET},
	{&ipi_reset_limit, IPI_RESET_LIMIT},
	{&complete_threshold, COMPLETE_THRESHOLD},
	{&congested_respns_us, CONGESTED_RESPONSE_US},
	{&congested_reps, CONGESTED_REPS},
	{&congested_period, CONGESTED_PERIOD}
};

69 70
static struct dentry *tunables_dir;
static struct dentry *tunables_file;
71

C
Cliff Wickman 已提交
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
/* these correspond to the statistics printed by ptc_seq_show() */
static char *stat_description[] = {
	"sent:     number of shootdown messages sent",
	"stime:    time spent sending messages",
	"numuvhubs: number of hubs targeted with shootdown",
	"numuvhubs16: number times 16 or more hubs targeted",
	"numuvhubs8: number times 8 or more hubs targeted",
	"numuvhubs4: number times 4 or more hubs targeted",
	"numuvhubs2: number times 2 or more hubs targeted",
	"numuvhubs1: number times 1 hub targeted",
	"numcpus:  number of cpus targeted with shootdown",
	"dto:      number of destination timeouts",
	"retries:  destination timeout retries sent",
	"rok:   :  destination timeouts successfully retried",
	"resetp:   ipi-style resource resets for plugs",
	"resett:   ipi-style resource resets for timeouts",
	"giveup:   fall-backs to ipi-style shootdowns",
	"sto:      number of source timeouts",
	"bz:       number of stay-busy's",
	"throt:    number times spun in throttle",
	"swack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
	"recv:     shootdown messages received",
	"rtime:    time spent processing messages",
	"all:      shootdown all-tlb messages",
	"one:      shootdown one-tlb messages",
	"mult:     interrupts that found multiple messages",
	"none:     interrupts that found no messages",
	"retry:    number of retry messages processed",
	"canc:     number messages canceled by retries",
	"nocan:    number retries that found nothing to cancel",
	"reset:    number of ipi-style reset requests processed",
	"rcan:     number messages canceled by reset requests",
	"disable:  number times use of the BAU was disabled",
	"enable:   number times use of the BAU was re-enabled"
};

static int __init
setup_nobau(char *arg)
110 111 112 113 114
{
	nobau = 1;
	return 0;
}
early_param("nobau", setup_nobau);
115

116
/* base pnode in this partition */
C
Cliff Wickman 已提交
117
static int uv_base_pnode __read_mostly;
118

119 120
static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
121 122
static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);

123
/*
124 125
 * Determine the first node on a uvhub. 'Nodes' are used for kernel
 * memory allocation.
126
 */
127
static int __init uvhub_to_first_node(int uvhub)
128 129 130 131 132
{
	int node, b;

	for_each_online_node(node) {
		b = uv_node_to_blade_id(node);
133
		if (uvhub == b)
134 135
			return node;
	}
136
	return -1;
137 138 139
}

/*
140
 * Determine the apicid of the first cpu on a uvhub.
141
 */
142
static int __init uvhub_to_first_apicid(int uvhub)
143 144 145 146
{
	int cpu;

	for_each_present_cpu(cpu)
147
		if (uvhub == uv_cpu_to_blade_id(cpu))
148 149 150 151
			return per_cpu(x86_cpu_to_apicid, cpu);
	return -1;
}

152 153 154 155 156 157 158 159
/*
 * Free a software acknowledge hardware resource by clearing its Pending
 * bit. This will return a reply to the sender.
 * If the message has timed out, a reply has already been sent by the
 * hardware but the resource has not been released. In that case our
 * clear of the Timeout bit (as well) will free the resource. No reply will
 * be sent (the hardware will only do one reply per message).
 */
C
Cliff Wickman 已提交
160
static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
161
{
162
	unsigned long dw;
C
Cliff Wickman 已提交
163
	struct bau_pq_entry *msg;
164

165 166
	msg = mdp->msg;
	if (!msg->canceled) {
C
Cliff Wickman 已提交
167 168
		dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
		write_mmr_sw_ack(dw);
169
	}
170
	msg->replied_to = 1;
C
Cliff Wickman 已提交
171
	msg->swack_vec = 0;
172 173 174
}

/*
175
 * Process the receipt of a RETRY message
176
 */
C
Cliff Wickman 已提交
177 178
static void bau_process_retry_msg(struct msg_desc *mdp,
					struct bau_control *bcp)
179
{
180 181 182 183
	int i;
	int cancel_count = 0;
	unsigned long msg_res;
	unsigned long mmr = 0;
C
Cliff Wickman 已提交
184 185 186
	struct bau_pq_entry *msg = mdp->msg;
	struct bau_pq_entry *msg2;
	struct ptc_stats *stat = bcp->statp;
187

188 189 190 191 192
	stat->d_retries++;
	/*
	 * cancel any message from msg+1 to the retry itself
	 */
	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
C
Cliff Wickman 已提交
193 194
		if (msg2 > mdp->queue_last)
			msg2 = mdp->queue_first;
195 196 197
		if (msg2 == msg)
			break;

C
Cliff Wickman 已提交
198
		/* same conditions for cancellation as do_reset */
199
		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
C
Cliff Wickman 已提交
200 201
		    (msg2->swack_vec) && ((msg2->swack_vec &
			msg->swack_vec) == 0) &&
202 203
		    (msg2->sending_cpu == msg->sending_cpu) &&
		    (msg2->msg_type != MSG_NOOP)) {
C
Cliff Wickman 已提交
204 205
			mmr = read_mmr_sw_ack();
			msg_res = msg2->swack_vec;
206 207 208 209 210 211
			/*
			 * This is a message retry; clear the resources held
			 * by the previous message only if they timed out.
			 * If it has not timed out we have an unexpected
			 * situation to report.
			 */
212
			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
C
Cliff Wickman 已提交
213
				unsigned long mr;
214 215 216 217 218 219 220
				/*
				 * is the resource timed out?
				 * make everyone ignore the cancelled message.
				 */
				msg2->canceled = 1;
				stat->d_canceled++;
				cancel_count++;
C
Cliff Wickman 已提交
221 222
				mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
				write_mmr_sw_ack(mr);
223
			}
224 225 226 227 228
		}
	}
	if (!cancel_count)
		stat->d_nocanceled++;
}
229

230 231 232 233
/*
 * Do all the things a cpu should do for a TLB shootdown message.
 * Other cpu's may come here at the same time for this message.
 */
C
Cliff Wickman 已提交
234 235
static void bau_process_message(struct msg_desc *mdp,
					struct bau_control *bcp)
236 237
{
	short socket_ack_count = 0;
C
Cliff Wickman 已提交
238 239 240 241
	short *sp;
	struct atomic_short *asp;
	struct ptc_stats *stat = bcp->statp;
	struct bau_pq_entry *msg = mdp->msg;
242
	struct bau_control *smaster = bcp->socket_master;
243

244 245 246
	/*
	 * This must be a normal message, or retry of a normal message
	 */
247 248
	if (msg->address == TLB_FLUSH_ALL) {
		local_flush_tlb();
249
		stat->d_alltlb++;
250 251
	} else {
		__flush_tlb_one(msg->address);
252
		stat->d_onetlb++;
253
	}
254 255 256 257 258 259 260 261 262
	stat->d_requestee++;

	/*
	 * One cpu on each uvhub has the additional job on a RETRY
	 * of releasing the resource held by the message that is
	 * being retried.  That message is identified by sending
	 * cpu number.
	 */
	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
C
Cliff Wickman 已提交
263
		bau_process_retry_msg(mdp, bcp);
264

265
	/*
C
Cliff Wickman 已提交
266
	 * This is a swack message, so we have to reply to it.
267 268 269 270
	 * Count each responding cpu on the socket. This avoids
	 * pinging the count's cache line back and forth between
	 * the sockets.
	 */
C
Cliff Wickman 已提交
271 272 273
	sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
	asp = (struct atomic_short *)sp;
	socket_ack_count = atom_asr(1, asp);
274
	if (socket_ack_count == bcp->cpus_in_socket) {
C
Cliff Wickman 已提交
275
		int msg_ack_count;
276 277 278 279 280
		/*
		 * Both sockets dump their completed count total into
		 * the message's count.
		 */
		smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
C
Cliff Wickman 已提交
281 282
		asp = (struct atomic_short *)&msg->acknowledge_count;
		msg_ack_count = atom_asr(socket_ack_count, asp);
283 284 285 286 287

		if (msg_ack_count == bcp->cpus_in_uvhub) {
			/*
			 * All cpus in uvhub saw it; reply
			 */
C
Cliff Wickman 已提交
288
			reply_to_message(mdp, bcp);
289 290
		}
	}
291

292
	return;
293 294 295
}

/*
C
cpw@sgi.com 已提交
296
 * Determine the first cpu on a pnode.
297
 */
C
cpw@sgi.com 已提交
298
static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)
299 300
{
	int cpu;
C
cpw@sgi.com 已提交
301 302 303 304 305
	struct hub_and_pnode *hpp;

	for_each_present_cpu(cpu) {
		hpp = &smaster->thp[cpu];
		if (pnode == hpp->pnode)
306
			return cpu;
C
cpw@sgi.com 已提交
307
	}
308 309 310 311 312 313 314
	return -1;
}

/*
 * Last resort when we get a large number of destination timeouts is
 * to clear resources held by a given cpu.
 * Do this with IPI so that all messages in the BAU message queue
C
Cliff Wickman 已提交
315
 * can be identified by their nonzero swack_vec field.
316
 *
317 318
 * This is entered for a single cpu on the uvhub.
 * The sender want's this uvhub to free a specific message's
C
Cliff Wickman 已提交
319
 * swack resources.
320
 */
C
Cliff Wickman 已提交
321
static void do_reset(void *ptr)
322
{
323
	int i;
C
Cliff Wickman 已提交
324 325 326 327
	struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
	struct reset_args *rap = (struct reset_args *)ptr;
	struct bau_pq_entry *msg;
	struct ptc_stats *stat = bcp->statp;
328

329 330 331
	stat->d_resets++;
	/*
	 * We're looking for the given sender, and
C
Cliff Wickman 已提交
332
	 * will free its swack resource.
333 334 335
	 * If all cpu's finally responded after the timeout, its
	 * message 'replied_to' was set.
	 */
C
Cliff Wickman 已提交
336 337 338 339
	for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
		unsigned long msg_res;
		/* do_reset: same conditions for cancellation as
		   bau_process_retry_msg() */
340 341 342
		if ((msg->replied_to == 0) &&
		    (msg->canceled == 0) &&
		    (msg->sending_cpu == rap->sender) &&
C
Cliff Wickman 已提交
343
		    (msg->swack_vec) &&
344
		    (msg->msg_type != MSG_NOOP)) {
C
Cliff Wickman 已提交
345 346
			unsigned long mmr;
			unsigned long mr;
347 348 349 350 351 352 353
			/*
			 * make everyone else ignore this message
			 */
			msg->canceled = 1;
			/*
			 * only reset the resource if it is still pending
			 */
C
Cliff Wickman 已提交
354 355 356
			mmr = read_mmr_sw_ack();
			msg_res = msg->swack_vec;
			mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
357 358
			if (mmr & msg_res) {
				stat->d_rcanceled++;
C
Cliff Wickman 已提交
359
				write_mmr_sw_ack(mr);
360 361 362
			}
		}
	}
363
	return;
364 365 366
}

/*
367 368
 * Use IPI to get all target uvhubs to release resources held by
 * a given sending cpu number.
369
 */
C
cpw@sgi.com 已提交
370
static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
371
{
C
cpw@sgi.com 已提交
372 373
	int pnode;
	int apnode;
C
Cliff Wickman 已提交
374
	int maskbits;
C
cpw@sgi.com 已提交
375
	int sender = bcp->cpu;
376
	cpumask_t *mask = bcp->uvhub_master->cpumask;
C
cpw@sgi.com 已提交
377
	struct bau_control *smaster = bcp->socket_master;
378
	struct reset_args reset_args;
379

380
	reset_args.sender = sender;
381
	cpus_clear(*mask);
382
	/* find a single cpu for each uvhub in this distribution mask */
C
cpw@sgi.com 已提交
383
	maskbits = sizeof(struct pnmask) * BITSPERBYTE;
C
cpw@sgi.com 已提交
384 385
	/* each bit is a pnode relative to the partition base pnode */
	for (pnode = 0; pnode < maskbits; pnode++) {
C
Cliff Wickman 已提交
386
		int cpu;
C
cpw@sgi.com 已提交
387
		if (!bau_uvhub_isset(pnode, distribution))
388
			continue;
C
cpw@sgi.com 已提交
389 390
		apnode = pnode + bcp->partition_base_pnode;
		cpu = pnode_to_first_cpu(apnode, smaster);
391
		cpu_set(cpu, *mask);
392
	}
C
Cliff Wickman 已提交
393 394

	/* IPI all cpus; preemption is already disabled */
395
	smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);
396 397 398
	return;
}

C
Cliff Wickman 已提交
399
static inline unsigned long cycles_2_us(unsigned long long cyc)
400 401 402
{
	unsigned long long ns;
	unsigned long us;
C
Cliff Wickman 已提交
403 404 405
	int cpu = smp_processor_id();

	ns =  (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
406 407
	us = ns / 1000;
	return us;
408 409
}

410
/*
411 412 413 414
 * wait for all cpus on this hub to finish their sends and go quiet
 * leaves uvhub_quiesce set so that no new broadcasts are started by
 * bau_flush_send_and_wait()
 */
C
Cliff Wickman 已提交
415
static inline void quiesce_local_uvhub(struct bau_control *hmaster)
416
{
C
Cliff Wickman 已提交
417
	atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
418 419 420 421 422
}

/*
 * mark this quiet-requestor as done
 */
C
Cliff Wickman 已提交
423
static inline void end_uvhub_quiesce(struct bau_control *hmaster)
424
{
C
Cliff Wickman 已提交
425 426 427 428 429 430 431 432 433 434 435
	atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
}

static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
{
	unsigned long descriptor_status;

	descriptor_status = uv_read_local_mmr(mmr_offset);
	descriptor_status >>= right_shift;
	descriptor_status &= UV_ACT_STATUS_MASK;
	return descriptor_status;
436 437 438 439 440
}

/*
 * Wait for completion of a broadcast software ack message
 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
441
 */
442
static int uv1_wait_completion(struct bau_desc *bau_desc,
C
Cliff Wickman 已提交
443 444
				unsigned long mmr_offset, int right_shift,
				struct bau_control *bcp, long try)
445 446
{
	unsigned long descriptor_status;
C
Cliff Wickman 已提交
447
	cycles_t ttm;
448
	struct ptc_stats *stat = bcp->statp;
449

C
Cliff Wickman 已提交
450
	descriptor_status = uv1_read_status(mmr_offset, right_shift);
451
	/* spin on the status MMR, waiting for it to go idle */
C
Cliff Wickman 已提交
452
	while ((descriptor_status != DS_IDLE)) {
453
		/*
454 455 456 457
		 * Our software ack messages may be blocked because
		 * there are no swack resources available.  As long
		 * as none of them has timed out hardware will NACK
		 * our message and its state will stay IDLE.
458
		 */
C
Cliff Wickman 已提交
459
		if (descriptor_status == DS_SOURCE_TIMEOUT) {
460 461
			stat->s_stimeout++;
			return FLUSH_GIVEUP;
C
Cliff Wickman 已提交
462
		} else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
463
			stat->s_dtimeout++;
C
Cliff Wickman 已提交
464
			ttm = get_cycles();
465 466 467 468 469 470 471

			/*
			 * Our retries may be blocked by all destination
			 * swack resources being consumed, and a timeout
			 * pending.  In that case hardware returns the
			 * ERROR that looks like a destination timeout.
			 */
C
Cliff Wickman 已提交
472
			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
473 474 475 476 477 478 479 480 481 482 483 484
				bcp->conseccompletes = 0;
				return FLUSH_RETRY_PLUGGED;
			}

			bcp->conseccompletes = 0;
			return FLUSH_RETRY_TIMEOUT;
		} else {
			/*
			 * descriptor_status is still BUSY
			 */
			cpu_relax();
		}
C
Cliff Wickman 已提交
485
		descriptor_status = uv1_read_status(mmr_offset, right_shift);
486 487 488 489 490
	}
	bcp->conseccompletes++;
	return FLUSH_COMPLETE;
}

C
Cliff Wickman 已提交
491 492 493 494
/*
 * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
 */
static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
495 496 497
{
	unsigned long descriptor_status;
	unsigned long descriptor_status2;
C
Cliff Wickman 已提交
498 499 500 501 502 503 504 505 506 507 508 509 510 511

	descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
	descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
	descriptor_status = (descriptor_status << 1) | descriptor_status2;
	return descriptor_status;
}

static int uv2_wait_completion(struct bau_desc *bau_desc,
				unsigned long mmr_offset, int right_shift,
				struct bau_control *bcp, long try)
{
	unsigned long descriptor_stat;
	cycles_t ttm;
	int cpu = bcp->uvhub_cpu;
512 513
	struct ptc_stats *stat = bcp->statp;

C
Cliff Wickman 已提交
514 515
	descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);

516
	/* spin on the status MMR, waiting for it to go idle */
C
Cliff Wickman 已提交
517
	while (descriptor_stat != UV2H_DESC_IDLE) {
518 519 520 521 522 523
		/*
		 * Our software ack messages may be blocked because
		 * there are no swack resources available.  As long
		 * as none of them has timed out hardware will NACK
		 * our message and its state will stay IDLE.
		 */
C
Cliff Wickman 已提交
524 525 526
		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
		    (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
		    (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
527 528
			stat->s_stimeout++;
			return FLUSH_GIVEUP;
C
Cliff Wickman 已提交
529
		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
530
			stat->s_dtimeout++;
C
Cliff Wickman 已提交
531
			ttm = get_cycles();
532 533 534 535 536 537
			/*
			 * Our retries may be blocked by all destination
			 * swack resources being consumed, and a timeout
			 * pending.  In that case hardware returns the
			 * ERROR that looks like a destination timeout.
			 */
C
Cliff Wickman 已提交
538
			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
539 540 541 542 543 544 545
				bcp->conseccompletes = 0;
				return FLUSH_RETRY_PLUGGED;
			}
			bcp->conseccompletes = 0;
			return FLUSH_RETRY_TIMEOUT;
		} else {
			/*
C
Cliff Wickman 已提交
546
			 * descriptor_stat is still BUSY
547 548
			 */
			cpu_relax();
549
		}
C
Cliff Wickman 已提交
550
		descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
551
	}
552
	bcp->conseccompletes++;
553 554 555
	return FLUSH_COMPLETE;
}

C
Cliff Wickman 已提交
556 557 558 559 560 561 562
/*
 * There are 2 status registers; each and array[32] of 2 bits. Set up for
 * which register to read and position in that register based on cpu in
 * current hub.
 */
static int wait_completion(struct bau_desc *bau_desc,
				struct bau_control *bcp, long try)
563
{
C
Cliff Wickman 已提交
564 565 566 567 568 569 570 571 572 573 574 575
	int right_shift;
	unsigned long mmr_offset;
	int cpu = bcp->uvhub_cpu;

	if (cpu < UV_CPUS_PER_AS) {
		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
		right_shift = cpu * UV_ACT_STATUS_SIZE;
	} else {
		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
		right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
	}

576
	if (bcp->uvhub_version == 1)
577
		return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
C
Cliff Wickman 已提交
578
								bcp, try);
579 580
	else
		return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
C
Cliff Wickman 已提交
581
								bcp, try);
582 583
}

C
Cliff Wickman 已提交
584
static inline cycles_t sec_2_cycles(unsigned long sec)
585 586 587 588 589 590 591 592 593 594
{
	unsigned long ns;
	cycles_t cyc;

	ns = sec * 1000000000;
	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
	return cyc;
}

/*
C
Cliff Wickman 已提交
595
 * Our retries are blocked by all destination sw ack resources being
596 597 598
 * in use, and a timeout is pending. In that case hardware immediately
 * returns the ERROR that looks like a destination timeout.
 */
C
Cliff Wickman 已提交
599 600
static void destination_plugged(struct bau_desc *bau_desc,
			struct bau_control *bcp,
601 602 603 604
			struct bau_control *hmaster, struct ptc_stats *stat)
{
	udelay(bcp->plugged_delay);
	bcp->plugged_tries++;
C
Cliff Wickman 已提交
605

606 607
	if (bcp->plugged_tries >= bcp->plugsb4reset) {
		bcp->plugged_tries = 0;
C
Cliff Wickman 已提交
608

609
		quiesce_local_uvhub(hmaster);
C
Cliff Wickman 已提交
610

611
		spin_lock(&hmaster->queue_lock);
C
cpw@sgi.com 已提交
612
		reset_with_ipi(&bau_desc->distribution, bcp);
613
		spin_unlock(&hmaster->queue_lock);
C
Cliff Wickman 已提交
614

615
		end_uvhub_quiesce(hmaster);
C
Cliff Wickman 已提交
616

617 618 619 620 621
		bcp->ipi_attempts++;
		stat->s_resets_plug++;
	}
}

C
Cliff Wickman 已提交
622 623 624
static void destination_timeout(struct bau_desc *bau_desc,
			struct bau_control *bcp, struct bau_control *hmaster,
			struct ptc_stats *stat)
625
{
C
Cliff Wickman 已提交
626
	hmaster->max_concurr = 1;
627 628 629
	bcp->timeout_tries++;
	if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
		bcp->timeout_tries = 0;
C
Cliff Wickman 已提交
630

631
		quiesce_local_uvhub(hmaster);
C
Cliff Wickman 已提交
632

633
		spin_lock(&hmaster->queue_lock);
C
cpw@sgi.com 已提交
634
		reset_with_ipi(&bau_desc->distribution, bcp);
635
		spin_unlock(&hmaster->queue_lock);
C
Cliff Wickman 已提交
636

637
		end_uvhub_quiesce(hmaster);
C
Cliff Wickman 已提交
638

639 640 641 642 643
		bcp->ipi_attempts++;
		stat->s_resets_timeout++;
	}
}

644 645 646 647
/*
 * Completions are taking a very long time due to a congested numalink
 * network.
 */
C
Cliff Wickman 已提交
648 649
static void disable_for_congestion(struct bau_control *bcp,
					struct ptc_stats *stat)
650 651 652
{
	/* let only one cpu do this disabling */
	spin_lock(&disable_lock);
C
Cliff Wickman 已提交
653

654 655
	if (!baudisabled && bcp->period_requests &&
	    ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
C
Cliff Wickman 已提交
656 657
		int tcpu;
		struct bau_control *tbcp;
658 659 660 661
		/* it becomes this cpu's job to turn on the use of the
		   BAU again */
		baudisabled = 1;
		bcp->set_bau_off = 1;
C
Cliff Wickman 已提交
662 663
		bcp->set_bau_on_time = get_cycles();
		bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
664 665 666
		stat->s_bau_disabled++;
		for_each_present_cpu(tcpu) {
			tbcp = &per_cpu(bau_control, tcpu);
C
Cliff Wickman 已提交
667
			tbcp->baudisabled = 1;
668 669
		}
	}
C
Cliff Wickman 已提交
670

671 672 673
	spin_unlock(&disable_lock);
}

C
Cliff Wickman 已提交
674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745
static void count_max_concurr(int stat, struct bau_control *bcp,
				struct bau_control *hmaster)
{
	bcp->plugged_tries = 0;
	bcp->timeout_tries = 0;
	if (stat != FLUSH_COMPLETE)
		return;
	if (bcp->conseccompletes <= bcp->complete_threshold)
		return;
	if (hmaster->max_concurr >= hmaster->max_concurr_const)
		return;
	hmaster->max_concurr++;
}

static void record_send_stats(cycles_t time1, cycles_t time2,
		struct bau_control *bcp, struct ptc_stats *stat,
		int completion_status, int try)
{
	cycles_t elapsed;

	if (time2 > time1) {
		elapsed = time2 - time1;
		stat->s_time += elapsed;

		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
			bcp->period_requests++;
			bcp->period_time += elapsed;
			if ((elapsed > congested_cycles) &&
			    (bcp->period_requests > bcp->cong_reps))
				disable_for_congestion(bcp, stat);
		}
	} else
		stat->s_requestor--;

	if (completion_status == FLUSH_COMPLETE && try > 1)
		stat->s_retriesok++;
	else if (completion_status == FLUSH_GIVEUP)
		stat->s_giveup++;
}

/*
 * Because of a uv1 hardware bug only a limited number of concurrent
 * requests can be made.
 */
static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
{
	spinlock_t *lock = &hmaster->uvhub_lock;
	atomic_t *v;

	v = &hmaster->active_descriptor_count;
	if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
		stat->s_throttles++;
		do {
			cpu_relax();
		} while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
	}
}

/*
 * Handle the completion status of a message send.
 */
static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
			struct bau_control *bcp, struct bau_control *hmaster,
			struct ptc_stats *stat)
{
	if (completion_status == FLUSH_RETRY_PLUGGED)
		destination_plugged(bau_desc, bcp, hmaster, stat);
	else if (completion_status == FLUSH_RETRY_TIMEOUT)
		destination_timeout(bau_desc, bcp, hmaster, stat);
}

/*
746
 * Send a broadcast and wait for it to complete.
747
 *
748
 * The flush_mask contains the cpus the broadcast is to be sent to including
749
 * cpus that are on the local uvhub.
750
 *
751 752 753
 * Returns 0 if all flushing represented in the mask was done.
 * Returns 1 if it gives up entirely and the original cpu mask is to be
 * returned to the kernel.
754
 */
755
int uv_flush_send_and_wait(struct bau_desc *bau_desc,
C
Cliff Wickman 已提交
756
			struct cpumask *flush_mask, struct bau_control *bcp)
757
{
758
	int seq_number = 0;
C
Cliff Wickman 已提交
759
	int completion_stat = 0;
760
	int uv1 = 0;
761
	long try = 0;
762
	unsigned long index;
763 764
	cycles_t time1;
	cycles_t time2;
765
	struct ptc_stats *stat = bcp->statp;
766
	struct bau_control *hmaster = bcp->uvhub_master;
767 768
	struct uv1_bau_msg_header *uv1_hdr = NULL;
	struct uv2_bau_msg_header *uv2_hdr = NULL;
769

770 771
	if (bcp->uvhub_version == 1) {
		uv1 = 1;
C
Cliff Wickman 已提交
772
		uv1_throttle(hmaster, stat);
773 774 775
		uv1_hdr = &bau_desc->header.uv1_hdr;
	} else
		uv2_hdr = &bau_desc->header.uv2_hdr;
C
Cliff Wickman 已提交
776

777 778
	while (hmaster->uvhub_quiesce)
		cpu_relax();
779 780 781

	time1 = get_cycles();
	do {
782
		if (try == 0) {
783 784 785 786
			if (uv1)
				uv1_hdr->msg_type = MSG_REGULAR;
			else
				uv2_hdr->msg_type = MSG_REGULAR;
787 788
			seq_number = bcp->message_number++;
		} else {
789 790 791 792
			if (uv1)
				uv1_hdr->msg_type = MSG_RETRY;
			else
				uv2_hdr->msg_type = MSG_RETRY;
793 794
			stat->s_retry_messages++;
		}
C
Cliff Wickman 已提交
795

796 797 798 799
		if (uv1)
			uv1_hdr->sequence = seq_number;
		else
			uv2_hdr->sequence = seq_number;
C
Cliff Wickman 已提交
800
		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
801
		bcp->send_message = get_cycles();
C
Cliff Wickman 已提交
802 803 804

		write_mmr_activation(index);

805
		try++;
C
Cliff Wickman 已提交
806 807 808
		completion_stat = wait_completion(bau_desc, bcp, try);

		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
809

810
		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
811
			bcp->ipi_attempts = 0;
C
Cliff Wickman 已提交
812
			completion_stat = FLUSH_GIVEUP;
813 814 815
			break;
		}
		cpu_relax();
C
Cliff Wickman 已提交
816 817 818
	} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
		 (completion_stat == FLUSH_RETRY_TIMEOUT));

819
	time2 = get_cycles();
C
Cliff Wickman 已提交
820 821 822

	count_max_concurr(completion_stat, bcp, hmaster);

823 824
	while (hmaster->uvhub_quiesce)
		cpu_relax();
C
Cliff Wickman 已提交
825

826
	atomic_dec(&hmaster->active_descriptor_count);
C
Cliff Wickman 已提交
827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853

	record_send_stats(time1, time2, bcp, stat, completion_stat, try);

	if (completion_stat == FLUSH_GIVEUP)
		return 1;
	return 0;
}

/*
 * The BAU is disabled. When the disabled time period has expired, the cpu
 * that disabled it must re-enable it.
 * Return 0 if it is re-enabled for all cpus.
 */
static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
{
	int tcpu;
	struct bau_control *tbcp;

	if (bcp->set_bau_off) {
		if (get_cycles() >= bcp->set_bau_on_time) {
			stat->s_bau_reenabled++;
			baudisabled = 0;
			for_each_present_cpu(tcpu) {
				tbcp = &per_cpu(bau_control, tcpu);
				tbcp->baudisabled = 0;
				tbcp->period_requests = 0;
				tbcp->period_time = 0;
854
			}
C
Cliff Wickman 已提交
855
			return 0;
856
		}
C
Cliff Wickman 已提交
857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873
	}
	return -1;
}

static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
				int remotes, struct bau_desc *bau_desc)
{
	stat->s_requestor++;
	stat->s_ntargcpu += remotes + locals;
	stat->s_ntargremotes += remotes;
	stat->s_ntarglocals += locals;

	/* uvhub statistics */
	hubs = bau_uvhub_weight(&bau_desc->distribution);
	if (locals) {
		stat->s_ntarglocaluvhub++;
		stat->s_ntargremoteuvhub += (hubs - 1);
874
	} else
C
Cliff Wickman 已提交
875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917
		stat->s_ntargremoteuvhub += hubs;

	stat->s_ntarguvhub += hubs;

	if (hubs >= 16)
		stat->s_ntarguvhub16++;
	else if (hubs >= 8)
		stat->s_ntarguvhub8++;
	else if (hubs >= 4)
		stat->s_ntarguvhub4++;
	else if (hubs >= 2)
		stat->s_ntarguvhub2++;
	else
		stat->s_ntarguvhub1++;
}

/*
 * Translate a cpu mask to the uvhub distribution mask in the BAU
 * activation descriptor.
 */
static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
			struct bau_desc *bau_desc, int *localsp, int *remotesp)
{
	int cpu;
	int pnode;
	int cnt = 0;
	struct hub_and_pnode *hpp;

	for_each_cpu(cpu, flush_mask) {
		/*
		 * The distribution vector is a bit map of pnodes, relative
		 * to the partition base pnode (and the partition base nasid
		 * in the header).
		 * Translate cpu to pnode and hub using a local memory array.
		 */
		hpp = &bcp->socket_master->thp[cpu];
		pnode = hpp->pnode - bcp->partition_base_pnode;
		bau_uvhub_set(pnode, &bau_desc->distribution);
		cnt++;
		if (hpp->uvhub == bcp->uvhub)
			(*localsp)++;
		else
			(*remotesp)++;
918
	}
C
Cliff Wickman 已提交
919 920
	if (!cnt)
		return 1;
921
	return 0;
922 923
}

C
Cliff Wickman 已提交
924 925
/*
 * globally purge translation cache of a virtual address or all TLB's
T
Tejun Heo 已提交
926
 * @cpumask: mask of all cpu's in which the address is to be removed
927 928
 * @mm: mm_struct containing virtual address range
 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
T
Tejun Heo 已提交
929
 * @cpu: the current cpu
930 931 932 933 934 935
 *
 * This is the entry point for initiating any UV global TLB shootdown.
 *
 * Purges the translation caches of all specified processors of the given
 * virtual address, or purges all TLB's on specified processors.
 *
T
Tejun Heo 已提交
936 937
 * The caller has derived the cpumask from the mm_struct.  This function
 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
938
 *
939 940
 * The cpumask is converted into a uvhubmask of the uvhubs containing
 * those cpus.
941
 *
T
Tejun Heo 已提交
942 943 944 945 946
 * Note that this function should be called with preemption disabled.
 *
 * Returns NULL if all remote flushing was done.
 * Returns pointer to cpumask if some remote flushing remains to be
 * done.  The returned pointer is valid till preemption is re-enabled.
947
 */
T
Tejun Heo 已提交
948
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
C
Cliff Wickman 已提交
949 950
				struct mm_struct *mm, unsigned long va,
				unsigned int cpu)
951
{
952
	int locals = 0;
953 954
	int remotes = 0;
	int hubs = 0;
955
	struct bau_desc *bau_desc;
956 957 958
	struct cpumask *flush_mask;
	struct ptc_stats *stat;
	struct bau_control *bcp;
T
Tejun Heo 已提交
959

960
	/* kernel was booted 'nobau' */
961 962
	if (nobau)
		return cpumask;
T
Tejun Heo 已提交
963

964
	bcp = &per_cpu(bau_control, cpu);
965
	stat = bcp->statp;
966 967 968

	/* bau was disabled due to slow response */
	if (bcp->baudisabled) {
C
Cliff Wickman 已提交
969 970
		if (check_enable(bcp, stat))
			return cpumask;
971
	}
972

973 974
	/*
	 * Each sending cpu has a per-cpu mask which it fills from the caller's
975 976
	 * cpu mask.  All cpus are converted to uvhubs and copied to the
	 * activation descriptor.
977 978
	 */
	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
979
	/* don't actually do a shootdown of the local cpu */
980
	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
C
Cliff Wickman 已提交
981

982
	if (cpu_isset(cpu, *cpumask))
983
		stat->s_ntargself++;
984

985
	bau_desc = bcp->descriptor_base;
986
	bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
987
	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
C
Cliff Wickman 已提交
988
	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
989 990
		return NULL;

C
Cliff Wickman 已提交
991
	record_send_statistics(stat, locals, hubs, remotes, bau_desc);
992 993

	bau_desc->payload.address = va;
T
Tejun Heo 已提交
994
	bau_desc->payload.sending_cpu = cpu;
995
	/*
996 997
	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
	 * or 1 if it gave up and the original cpumask should be returned.
998
	 */
999 1000 1001 1002
	if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
		return NULL;
	else
		return cpumask;
1003 1004 1005 1006 1007 1008 1009 1010
}

/*
 * The BAU message interrupt comes here. (registered by set_intr_gate)
 * See entry_64.S
 *
 * We received a broadcast assist message.
 *
1011
 * Interrupts are disabled; this interrupt could represent
1012 1013
 * the receipt of several messages.
 *
1014 1015
 * All cores/threads on this hub get this interrupt.
 * The last one to see it does the software ack.
1016
 * (the resource will not be freed until noninterruptable cpus see this
1017
 *  interrupt; hardware may timeout the s/w ack and reply ERROR)
1018
 */
1019
void uv_bau_message_interrupt(struct pt_regs *regs)
1020 1021
{
	int count = 0;
1022
	cycles_t time_start;
C
Cliff Wickman 已提交
1023
	struct bau_pq_entry *msg;
1024 1025 1026 1027 1028
	struct bau_control *bcp;
	struct ptc_stats *stat;
	struct msg_desc msgdesc;

	time_start = get_cycles();
C
Cliff Wickman 已提交
1029

1030
	bcp = &per_cpu(bau_control, smp_processor_id());
1031
	stat = bcp->statp;
C
Cliff Wickman 已提交
1032 1033 1034 1035

	msgdesc.queue_first = bcp->queue_first;
	msgdesc.queue_last = bcp->queue_last;

1036
	msg = bcp->bau_msg_head;
C
Cliff Wickman 已提交
1037
	while (msg->swack_vec) {
1038
		count++;
C
Cliff Wickman 已提交
1039 1040 1041

		msgdesc.msg_slot = msg - msgdesc.queue_first;
		msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
1042
		msgdesc.msg = msg;
C
Cliff Wickman 已提交
1043 1044
		bau_process_message(&msgdesc, bcp);

1045
		msg++;
C
Cliff Wickman 已提交
1046 1047
		if (msg > msgdesc.queue_last)
			msg = msgdesc.queue_first;
1048
		bcp->bau_msg_head = msg;
1049
	}
1050
	stat->d_time += (get_cycles() - time_start);
1051
	if (!count)
1052
		stat->d_nomsg++;
1053
	else if (count > 1)
1054
		stat->d_multmsg++;
C
Cliff Wickman 已提交
1055

1056
	ack_APIC_irq();
1057 1058
}

C
Cliff Wickman 已提交
1059
/*
C
Cliff Wickman 已提交
1060
 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
C
Cliff Wickman 已提交
1061 1062 1063 1064
 * shootdown message timeouts enabled.  The timeout does not cause
 * an interrupt, but causes an error message to be returned to
 * the sender.
 */
C
Cliff Wickman 已提交
1065
static void __init enable_timeouts(void)
1066
{
1067 1068
	int uvhub;
	int nuvhubs;
1069
	int pnode;
C
Cliff Wickman 已提交
1070
	unsigned long mmr_image;
1071

1072
	nuvhubs = uv_num_possible_blades();
1073

1074 1075
	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
		if (!uv_blade_nr_possible_cpus(uvhub))
1076
			continue;
C
Cliff Wickman 已提交
1077

1078
		pnode = uv_blade_to_pnode(uvhub);
C
Cliff Wickman 已提交
1079
		mmr_image = read_mmr_misc_control(pnode);
C
Cliff Wickman 已提交
1080 1081 1082 1083 1084 1085
		/*
		 * Set the timeout period and then lock it in, in three
		 * steps; captures and locks in the period.
		 *
		 * To program the period, the SOFT_ACK_MODE must be off.
		 */
C
Cliff Wickman 已提交
1086 1087
		mmr_image &= ~(1L << SOFTACK_MSHIFT);
		write_mmr_misc_control(pnode, mmr_image);
C
Cliff Wickman 已提交
1088 1089 1090
		/*
		 * Set the 4-bit period.
		 */
C
Cliff Wickman 已提交
1091 1092 1093
		mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
		mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
		write_mmr_misc_control(pnode, mmr_image);
C
Cliff Wickman 已提交
1094
		/*
1095
		 * UV1:
C
Cliff Wickman 已提交
1096 1097 1098 1099
		 * Subsequent reversals of the timebase bit (3) cause an
		 * immediate timeout of one or all INTD resources as
		 * indicated in bits 2:0 (7 causes all of them to timeout).
		 */
C
Cliff Wickman 已提交
1100
		mmr_image |= (1L << SOFTACK_MSHIFT);
1101
		if (is_uv2_hub()) {
1102
			mmr_image &= ~(1L << UV2_LEG_SHFT);
C
Cliff Wickman 已提交
1103
			mmr_image |= (1L << UV2_EXT_SHFT);
1104
		}
C
Cliff Wickman 已提交
1105
		write_mmr_misc_control(pnode, mmr_image);
1106 1107 1108
	}
}

C
Cliff Wickman 已提交
1109
static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
1110 1111 1112 1113 1114 1115
{
	if (*offset < num_possible_cpus())
		return offset;
	return NULL;
}

C
Cliff Wickman 已提交
1116
static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
1117 1118 1119 1120 1121 1122 1123
{
	(*offset)++;
	if (*offset < num_possible_cpus())
		return offset;
	return NULL;
}

C
Cliff Wickman 已提交
1124
static void ptc_seq_stop(struct seq_file *file, void *data)
1125 1126 1127
{
}

C
Cliff Wickman 已提交
1128
static inline unsigned long long usec_2_cycles(unsigned long microsec)
1129 1130 1131 1132
{
	unsigned long ns;
	unsigned long long cyc;

1133
	ns = microsec * 1000;
1134 1135 1136 1137
	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
	return cyc;
}

1138
/*
C
Cliff Wickman 已提交
1139
 * Display the statistics thru /proc/sgi_uv/ptc_statistics
1140
 * 'data' points to the cpu number
C
Cliff Wickman 已提交
1141
 * Note: see the descriptions in stat_description[].
1142
 */
C
Cliff Wickman 已提交
1143
static int ptc_seq_show(struct seq_file *file, void *data)
1144 1145 1146 1147 1148 1149 1150
{
	struct ptc_stats *stat;
	int cpu;

	cpu = *(loff_t *)data;
	if (!cpu) {
		seq_printf(file,
1151 1152 1153
			"# cpu sent stime self locals remotes ncpus localhub ");
		seq_printf(file,
			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
1154
		seq_printf(file,
C
Cliff Wickman 已提交
1155
			"numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok ");
1156
		seq_printf(file,
C
Cliff Wickman 已提交
1157
			"resetp resett giveup sto bz throt swack recv rtime ");
1158
		seq_printf(file,
C
Cliff Wickman 已提交
1159
			"all one mult none retry canc nocan reset rcan ");
1160 1161
		seq_printf(file,
			"disable enable\n");
1162 1163 1164
	}
	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
		stat = &per_cpu(ptcstats, cpu);
1165 1166 1167 1168
		/* source side statistics */
		seq_printf(file,
			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
			   cpu, stat->s_requestor, cycles_2_us(stat->s_time),
1169 1170 1171 1172 1173
			   stat->s_ntargself, stat->s_ntarglocals,
			   stat->s_ntargremotes, stat->s_ntargcpu,
			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
			   stat->s_ntarguvhub, stat->s_ntarguvhub16);
		seq_printf(file, "%ld %ld %ld %ld %ld ",
1174 1175
			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
1176
			   stat->s_dtimeout);
1177 1178 1179 1180 1181
		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
			   stat->s_retry_messages, stat->s_retriesok,
			   stat->s_resets_plug, stat->s_resets_timeout,
			   stat->s_giveup, stat->s_stimeout,
			   stat->s_busy, stat->s_throttles);
1182

1183 1184
		/* destination side statistics */
		seq_printf(file,
1185
			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
C
Cliff Wickman 已提交
1186
			   read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
1187 1188 1189 1190 1191
			   stat->d_requestee, cycles_2_us(stat->d_time),
			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
			   stat->d_nocanceled, stat->d_resets,
			   stat->d_rcanceled);
1192 1193
		seq_printf(file, "%ld %ld\n",
			stat->s_bau_disabled, stat->s_bau_reenabled);
1194 1195 1196 1197
	}
	return 0;
}

1198 1199 1200 1201
/*
 * Display the tunables thru debugfs
 */
static ssize_t tunables_read(struct file *file, char __user *userbuf,
C
Cliff Wickman 已提交
1202
				size_t count, loff_t *ppos)
1203
{
1204
	char *buf;
1205 1206
	int ret;

1207
	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
C
Cliff Wickman 已提交
1208
		"max_concur plugged_delay plugsb4reset",
1209 1210
		"timeoutsb4reset ipi_reset_limit complete_threshold",
		"congested_response_us congested_reps congested_period",
C
Cliff Wickman 已提交
1211
		max_concurr, plugged_delay, plugsb4reset,
1212
		timeoutsb4reset, ipi_reset_limit, complete_threshold,
C
Cliff Wickman 已提交
1213
		congested_respns_us, congested_reps, congested_period);
1214

1215 1216 1217 1218 1219 1220
	if (!buf)
		return -ENOMEM;

	ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
	kfree(buf);
	return ret;
1221 1222
}

1223
/*
C
Cliff Wickman 已提交
1224 1225
 * handle a write to /proc/sgi_uv/ptc_statistics
 * -1: reset the statistics
1226 1227
 *  0: display meaning of the statistics
 */
C
Cliff Wickman 已提交
1228 1229
static ssize_t ptc_proc_write(struct file *file, const char __user *user,
				size_t count, loff_t *data)
1230
{
1231
	int cpu;
C
Cliff Wickman 已提交
1232 1233
	int i;
	int elements;
1234
	long input_arg;
1235
	char optstr[64];
1236
	struct ptc_stats *stat;
1237

1238
	if (count == 0 || count > sizeof(optstr))
1239
		return -EINVAL;
1240 1241 1242
	if (copy_from_user(optstr, user, count))
		return -EFAULT;
	optstr[count - 1] = '\0';
C
Cliff Wickman 已提交
1243

1244
	if (strict_strtol(optstr, 10, &input_arg) < 0) {
1245 1246 1247 1248
		printk(KERN_DEBUG "%s is invalid\n", optstr);
		return -EINVAL;
	}

1249
	if (input_arg == 0) {
C
Cliff Wickman 已提交
1250
		elements = sizeof(stat_description)/sizeof(*stat_description);
1251
		printk(KERN_DEBUG "# cpu:      cpu number\n");
1252
		printk(KERN_DEBUG "Sender statistics:\n");
C
Cliff Wickman 已提交
1253 1254
		for (i = 0; i < elements; i++)
			printk(KERN_DEBUG "%s\n", stat_description[i]);
1255 1256 1257 1258 1259
	} else if (input_arg == -1) {
		for_each_present_cpu(cpu) {
			stat = &per_cpu(ptcstats, cpu);
			memset(stat, 0, sizeof(struct ptc_stats));
		}
1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275
	}

	return count;
}

static int local_atoi(const char *name)
{
	int val = 0;

	for (;; name++) {
		switch (*name) {
		case '0' ... '9':
			val = 10*val+(*name-'0');
			break;
		default:
			return val;
1276
		}
1277
	}
1278 1279 1280
}

/*
C
Cliff Wickman 已提交
1281 1282
 * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
 * Zero values reset them to defaults.
1283
 */
C
Cliff Wickman 已提交
1284 1285
static int parse_tunables_write(struct bau_control *bcp, char *instr,
				int count)
1286 1287 1288
{
	char *p;
	char *q;
C
Cliff Wickman 已提交
1289 1290 1291
	int cnt = 0;
	int val;
	int e = sizeof(tunables) / sizeof(*tunables);
1292 1293 1294 1295 1296 1297 1298 1299 1300

	p = instr + strspn(instr, WHITESPACE);
	q = p;
	for (; *p; p = q + strspn(q, WHITESPACE)) {
		q = p + strcspn(p, WHITESPACE);
		cnt++;
		if (q == p)
			break;
	}
C
Cliff Wickman 已提交
1301 1302
	if (cnt != e) {
		printk(KERN_INFO "bau tunable error: should be %d values\n", e);
1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313
		return -EINVAL;
	}

	p = instr + strspn(instr, WHITESPACE);
	q = p;
	for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
		q = p + strcspn(p, WHITESPACE);
		val = local_atoi(p);
		switch (cnt) {
		case 0:
			if (val == 0) {
C
Cliff Wickman 已提交
1314 1315
				max_concurr = MAX_BAU_CONCURRENT;
				max_concurr_const = MAX_BAU_CONCURRENT;
1316 1317 1318 1319 1320 1321 1322 1323
				continue;
			}
			if (val < 1 || val > bcp->cpus_in_uvhub) {
				printk(KERN_DEBUG
				"Error: BAU max concurrent %d is invalid\n",
				val);
				return -EINVAL;
			}
C
Cliff Wickman 已提交
1324 1325
			max_concurr = val;
			max_concurr_const = val;
1326
			continue;
C
Cliff Wickman 已提交
1327
		default:
1328
			if (val == 0)
C
Cliff Wickman 已提交
1329
				*tunables[cnt].tunp = tunables[cnt].deflt;
1330
			else
C
Cliff Wickman 已提交
1331
				*tunables[cnt].tunp = val;
1332 1333 1334 1335 1336
			continue;
		}
		if (q == p)
			break;
	}
C
Cliff Wickman 已提交
1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357
	return 0;
}

/*
 * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
 */
static ssize_t tunables_write(struct file *file, const char __user *user,
				size_t count, loff_t *data)
{
	int cpu;
	int ret;
	char instr[100];
	struct bau_control *bcp;

	if (count == 0 || count > sizeof(instr)-1)
		return -EINVAL;
	if (copy_from_user(instr, user, count))
		return -EFAULT;

	instr[count] = '\0';

1358 1359
	cpu = get_cpu();
	bcp = &per_cpu(bau_control, cpu);
C
Cliff Wickman 已提交
1360
	ret = parse_tunables_write(bcp, instr, count);
1361
	put_cpu();
C
Cliff Wickman 已提交
1362 1363 1364
	if (ret)
		return ret;

1365 1366
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
C
Cliff Wickman 已提交
1367 1368 1369 1370 1371 1372 1373 1374 1375 1376
		bcp->max_concurr =		max_concurr;
		bcp->max_concurr_const =	max_concurr;
		bcp->plugged_delay =		plugged_delay;
		bcp->plugsb4reset =		plugsb4reset;
		bcp->timeoutsb4reset =		timeoutsb4reset;
		bcp->ipi_reset_limit =		ipi_reset_limit;
		bcp->complete_threshold =	complete_threshold;
		bcp->cong_response_us =		congested_respns_us;
		bcp->cong_reps =		congested_reps;
		bcp->cong_period =		congested_period;
1377
	}
1378 1379 1380 1381
	return count;
}

static const struct seq_operations uv_ptc_seq_ops = {
C
Cliff Wickman 已提交
1382 1383 1384 1385
	.start		= ptc_seq_start,
	.next		= ptc_seq_next,
	.stop		= ptc_seq_stop,
	.show		= ptc_seq_show
1386 1387
};

C
Cliff Wickman 已提交
1388
static int ptc_proc_open(struct inode *inode, struct file *file)
1389 1390 1391 1392
{
	return seq_open(file, &uv_ptc_seq_ops);
}

1393 1394 1395 1396 1397
static int tunables_open(struct inode *inode, struct file *file)
{
	return 0;
}

1398
static const struct file_operations proc_uv_ptc_operations = {
C
Cliff Wickman 已提交
1399
	.open		= ptc_proc_open,
1400
	.read		= seq_read,
C
Cliff Wickman 已提交
1401
	.write		= ptc_proc_write,
1402 1403
	.llseek		= seq_lseek,
	.release	= seq_release,
1404 1405
};

1406 1407 1408 1409
static const struct file_operations tunables_fops = {
	.open		= tunables_open,
	.read		= tunables_read,
	.write		= tunables_write,
1410
	.llseek		= default_llseek,
1411 1412
};

1413
static int __init uv_ptc_init(void)
1414
{
1415
	struct proc_dir_entry *proc_uv_ptc;
1416 1417 1418 1419

	if (!is_uv_system())
		return 0;

1420 1421
	proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
				  &proc_uv_ptc_operations);
1422 1423 1424 1425 1426
	if (!proc_uv_ptc) {
		printk(KERN_ERR "unable to create %s proc entry\n",
		       UV_PTC_BASENAME);
		return -EINVAL;
	}
1427 1428 1429 1430 1431 1432 1433 1434

	tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
	if (!tunables_dir) {
		printk(KERN_ERR "unable to create debugfs directory %s\n",
		       UV_BAU_TUNABLES_DIR);
		return -EINVAL;
	}
	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
C
Cliff Wickman 已提交
1435
					tunables_dir, NULL, &tunables_fops);
1436 1437 1438 1439 1440
	if (!tunables_file) {
		printk(KERN_ERR "unable to create debugfs file %s\n",
		       UV_BAU_TUNABLES_FILE);
		return -EINVAL;
	}
1441 1442 1443 1444
	return 0;
}

/*
1445
 * Initialize the sending side's sending buffers.
1446
 */
C
Cliff Wickman 已提交
1447
static void activation_descriptor_init(int node, int pnode, int base_pnode)
1448 1449
{
	int i;
1450
	int cpu;
1451
	int uv1 = 0;
1452
	unsigned long gpa;
1453
	unsigned long m;
1454
	unsigned long n;
C
Cliff Wickman 已提交
1455
	size_t dsize;
1456 1457
	struct bau_desc *bau_desc;
	struct bau_desc *bd2;
1458 1459
	struct uv1_bau_msg_header *uv1_hdr;
	struct uv2_bau_msg_header *uv2_hdr;
1460
	struct bau_control *bcp;
1461

1462
	/*
C
Cliff Wickman 已提交
1463 1464
	 * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
	 * per cpu; and one per cpu on the uvhub (ADP_SZ)
1465
	 */
C
Cliff Wickman 已提交
1466 1467
	dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
	bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
1468
	BUG_ON(!bau_desc);
1469

1470 1471 1472
	gpa = uv_gpa(bau_desc);
	n = uv_gpa_to_gnode(gpa);
	m = uv_gpa_to_offset(gpa);
1473 1474
	if (is_uv1_hub())
		uv1 = 1;
1475

1476
	/* the 14-bit pnode */
C
Cliff Wickman 已提交
1477
	write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
1478
	/*
C
Cliff Wickman 已提交
1479
	 * Initializing all 8 (ITEMS_PER_DESC) descriptors for each
1480
	 * cpu even though we only use the first one; one descriptor can
1481
	 * describe a broadcast to 256 uv hubs.
1482
	 */
C
Cliff Wickman 已提交
1483
	for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
1484
		memset(bd2, 0, sizeof(struct bau_desc));
1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511
		if (uv1) {
			uv1_hdr = &bd2->header.uv1_hdr;
			uv1_hdr->swack_flag =	1;
			/*
			 * The base_dest_nasid set in the message header
			 * is the nasid of the first uvhub in the partition.
			 * The bit map will indicate destination pnode numbers
			 * relative to that base. They may not be consecutive
			 * if nasid striding is being used.
			 */
			uv1_hdr->base_dest_nasid =
						UV_PNODE_TO_NASID(base_pnode);
			uv1_hdr->dest_subnodeid =	UV_LB_SUBNODEID;
			uv1_hdr->command =		UV_NET_ENDPOINT_INTD;
			uv1_hdr->int_both =		1;
			/*
			 * all others need to be set to zero:
			 *   fairness chaining multilevel count replied_to
			 */
		} else {
			uv2_hdr = &bd2->header.uv2_hdr;
			uv2_hdr->swack_flag =	1;
			uv2_hdr->base_dest_nasid =
						UV_PNODE_TO_NASID(base_pnode);
			uv2_hdr->dest_subnodeid =	UV_LB_SUBNODEID;
			uv2_hdr->command =		UV_NET_ENDPOINT_INTD;
		}
1512
	}
1513 1514 1515 1516 1517 1518
	for_each_present_cpu(cpu) {
		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
			continue;
		bcp = &per_cpu(bau_control, cpu);
		bcp->descriptor_base = bau_desc;
	}
1519 1520 1521 1522
}

/*
 * initialize the destination side's receiving buffers
1523 1524 1525
 * entered for each uvhub in the partition
 * - node is first node (kernel memory notion) on the uvhub
 * - pnode is the uvhub's physical identifier
1526
 */
C
Cliff Wickman 已提交
1527
static void pq_init(int node, int pnode)
1528
{
1529
	int cpu;
C
Cliff Wickman 已提交
1530
	size_t plsize;
1531
	char *cp;
C
Cliff Wickman 已提交
1532 1533 1534 1535 1536 1537
	void *vp;
	unsigned long pn;
	unsigned long first;
	unsigned long pn_first;
	unsigned long last;
	struct bau_pq_entry *pqp;
1538
	struct bau_control *bcp;
1539

C
Cliff Wickman 已提交
1540 1541 1542
	plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
	vp = kmalloc_node(plsize, GFP_KERNEL, node);
	pqp = (struct bau_pq_entry *)vp;
1543
	BUG_ON(!pqp);
1544

1545
	cp = (char *)pqp + 31;
C
Cliff Wickman 已提交
1546
	pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);
1547 1548 1549 1550 1551 1552

	for_each_present_cpu(cpu) {
		if (pnode != uv_cpu_to_pnode(cpu))
			continue;
		/* for every cpu on this pnode: */
		bcp = &per_cpu(bau_control, cpu);
C
Cliff Wickman 已提交
1553 1554 1555
		bcp->queue_first	= pqp;
		bcp->bau_msg_head	= pqp;
		bcp->queue_last		= pqp + (DEST_Q_SIZE - 1);
1556
	}
1557
	/*
1558
	 * need the gnode of where the memory was really allocated
1559
	 */
1560
	pn = uv_gpa_to_gnode(uv_gpa(pqp));
C
Cliff Wickman 已提交
1561 1562 1563 1564 1565 1566 1567
	first = uv_physnodeaddr(pqp);
	pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
	last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
	write_mmr_payload_first(pnode, pn_first);
	write_mmr_payload_tail(pnode, first);
	write_mmr_payload_last(pnode, last);

1568
	/* in effect, all msg_type's are set to MSG_NOOP */
C
Cliff Wickman 已提交
1569
	memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
1570
}
1571

1572
/*
1573
 * Initialization of each UV hub's structures
1574
 */
C
Cliff Wickman 已提交
1575
static void __init init_uvhub(int uvhub, int vector, int base_pnode)
1576
{
1577
	int node;
1578 1579
	int pnode;
	unsigned long apicid;
1580 1581 1582

	node = uvhub_to_first_node(uvhub);
	pnode = uv_blade_to_pnode(uvhub);
C
Cliff Wickman 已提交
1583 1584 1585 1586

	activation_descriptor_init(node, pnode, base_pnode);

	pq_init(node, pnode);
1587
	/*
1588 1589
	 * The below initialization can't be in firmware because the
	 * messaging IRQ will be determined by the OS.
1590
	 */
1591
	apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
C
Cliff Wickman 已提交
1592
	write_mmr_data_config(pnode, ((apicid << 32) | vector));
1593 1594
}

1595 1596 1597
/*
 * We will set BAU_MISC_CONTROL with a timeout period.
 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
C
Cliff Wickman 已提交
1598
 * So the destination timeout period has to be calculated from them.
1599
 */
C
Cliff Wickman 已提交
1600
static int calculate_destination_timeout(void)
1601 1602 1603 1604 1605 1606 1607 1608 1609
{
	unsigned long mmr_image;
	int mult1;
	int mult2;
	int index;
	int base;
	int ret;
	unsigned long ts_ns;

1610
	if (is_uv1_hub()) {
C
Cliff Wickman 已提交
1611
		mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622
		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
		index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
		mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
		mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
		base = timeout_base_ns[index];
		ts_ns = base * mult1 * mult2;
		ret = ts_ns / 1000;
	} else {
		/* 4 bits  0/1 for 10/80us, 3 bits of multiplier */
		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
		mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
C
Cliff Wickman 已提交
1623
		if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
1624 1625 1626 1627 1628 1629
			mult1 = 80;
		else
			mult1 = 10;
		base = mmr_image & UV2_ACK_MASK;
		ret = mult1 * base;
	}
1630 1631 1632
	return ret;
}

C
Cliff Wickman 已提交
1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656
static void __init init_per_cpu_tunables(void)
{
	int cpu;
	struct bau_control *bcp;

	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->baudisabled		= 0;
		bcp->statp			= &per_cpu(ptcstats, cpu);
		/* time interval to catch a hardware stay-busy bug */
		bcp->timeout_interval		= usec_2_cycles(2*timeout_us);
		bcp->max_concurr		= max_concurr;
		bcp->max_concurr_const		= max_concurr;
		bcp->plugged_delay		= plugged_delay;
		bcp->plugsb4reset		= plugsb4reset;
		bcp->timeoutsb4reset		= timeoutsb4reset;
		bcp->ipi_reset_limit		= ipi_reset_limit;
		bcp->complete_threshold		= complete_threshold;
		bcp->cong_response_us		= congested_respns_us;
		bcp->cong_reps			= congested_reps;
		bcp->cong_period		= congested_period;
	}
}

1657
/*
C
Cliff Wickman 已提交
1658
 * Scan all cpus to collect blade and socket summaries.
1659
 */
C
Cliff Wickman 已提交
1660 1661 1662
static int __init get_cpu_topology(int base_pnode,
					struct uvhub_desc *uvhub_descs,
					unsigned char *uvhub_mask)
1663 1664 1665 1666
{
	int cpu;
	int pnode;
	int uvhub;
C
Cliff Wickman 已提交
1667
	int socket;
1668 1669 1670 1671 1672 1673
	struct bau_control *bcp;
	struct uvhub_desc *bdp;
	struct socket_desc *sdp;

	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
C
Cliff Wickman 已提交
1674

1675
		memset(bcp, 0, sizeof(struct bau_control));
C
Cliff Wickman 已提交
1676

1677
		pnode = uv_cpu_hub_info(cpu)->pnode;
C
Cliff Wickman 已提交
1678
		if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
1679 1680
			printk(KERN_EMERG
				"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
C
Cliff Wickman 已提交
1681
				cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
1682 1683
			return 1;
		}
C
Cliff Wickman 已提交
1684

1685
		bcp->osnode = cpu_to_node(cpu);
C
Cliff Wickman 已提交
1686 1687
		bcp->partition_base_pnode = base_pnode;

1688
		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
C
Cliff Wickman 已提交
1689
		*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1690
		bdp = &uvhub_descs[uvhub];
C
Cliff Wickman 已提交
1691

1692 1693 1694
		bdp->num_cpus++;
		bdp->uvhub = uvhub;
		bdp->pnode = pnode;
C
Cliff Wickman 已提交
1695

1696 1697
		/* kludge: 'assuming' one node per socket, and assuming that
		   disabling a socket just leaves a gap in node numbers */
1698
		socket = bcp->osnode & 1;
1699
		bdp->socket_mask |= (1 << socket);
1700 1701 1702
		sdp = &bdp->socket[socket];
		sdp->cpu_number[sdp->num_cpus] = cpu;
		sdp->num_cpus++;
1703
		if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
C
Cliff Wickman 已提交
1704 1705
			printk(KERN_EMERG "%d cpus per socket invalid\n",
				sdp->num_cpus);
1706 1707
			return 1;
		}
1708
	}
C
Cliff Wickman 已提交
1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727
	return 0;
}

/*
 * Each socket is to get a local array of pnodes/hubs.
 */
static void make_per_cpu_thp(struct bau_control *smaster)
{
	int cpu;
	size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();

	smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
	memset(smaster->thp, 0, hpsz);
	for_each_present_cpu(cpu) {
		smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
		smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
	}
}

1728 1729 1730 1731 1732 1733 1734 1735 1736 1737
/*
 * Each uvhub is to get a local cpumask.
 */
static void make_per_hub_cpumask(struct bau_control *hmaster)
{
	int sz = sizeof(cpumask_t);

	hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode);
}

C
Cliff Wickman 已提交
1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763
/*
 * Initialize all the per_cpu information for the cpu's on a given socket,
 * given what has been gathered into the socket_desc struct.
 * And reports the chosen hub and socket masters back to the caller.
 */
static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
			struct bau_control **smasterp,
			struct bau_control **hmasterp)
{
	int i;
	int cpu;
	struct bau_control *bcp;

	for (i = 0; i < sdp->num_cpus; i++) {
		cpu = sdp->cpu_number[i];
		bcp = &per_cpu(bau_control, cpu);
		bcp->cpu = cpu;
		if (i == 0) {
			*smasterp = bcp;
			if (!(*hmasterp))
				*hmasterp = bcp;
		}
		bcp->cpus_in_uvhub = bdp->num_cpus;
		bcp->cpus_in_socket = sdp->num_cpus;
		bcp->socket_master = *smasterp;
		bcp->uvhub = bdp->uvhub;
1764 1765 1766 1767 1768 1769 1770 1771
		if (is_uv1_hub())
			bcp->uvhub_version = 1;
		else if (is_uv2_hub())
			bcp->uvhub_version = 2;
		else {
			printk(KERN_EMERG "uvhub version not 1 or 2\n");
			return 1;
		}
C
Cliff Wickman 已提交
1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793
		bcp->uvhub_master = *hmasterp;
		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
			printk(KERN_EMERG "%d cpus per uvhub invalid\n",
				bcp->uvhub_cpu);
			return 1;
		}
	}
	return 0;
}

/*
 * Summarize the blade and socket topology into the per_cpu structures.
 */
static int __init summarize_uvhub_sockets(int nuvhubs,
			struct uvhub_desc *uvhub_descs,
			unsigned char *uvhub_mask)
{
	int socket;
	int uvhub;
	unsigned short socket_mask;

C
Cliff Wickman 已提交
1794
	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
C
Cliff Wickman 已提交
1795 1796 1797 1798
		struct uvhub_desc *bdp;
		struct bau_control *smaster = NULL;
		struct bau_control *hmaster = NULL;

C
Cliff Wickman 已提交
1799 1800
		if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
			continue;
C
Cliff Wickman 已提交
1801

1802
		bdp = &uvhub_descs[uvhub];
1803 1804 1805
		socket_mask = bdp->socket_mask;
		socket = 0;
		while (socket_mask) {
C
Cliff Wickman 已提交
1806 1807 1808 1809
			struct socket_desc *sdp;
			if ((socket_mask & 1)) {
				sdp = &bdp->socket[socket];
				if (scan_sock(sdp, bdp, &smaster, &hmaster))
1810
					return 1;
1811
				make_per_cpu_thp(smaster);
1812 1813
			}
			socket++;
1814
			socket_mask = (socket_mask >> 1);
1815
		}
1816
		make_per_hub_cpumask(hmaster);
1817
	}
C
Cliff Wickman 已提交
1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837
	return 0;
}

/*
 * initialize the bau_control structure for each cpu
 */
static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
{
	unsigned char *uvhub_mask;
	void *vp;
	struct uvhub_desc *uvhub_descs;

	timeout_us = calculate_destination_timeout();

	vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
	uvhub_descs = (struct uvhub_desc *)vp;
	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);

	if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
1838
		goto fail;
C
Cliff Wickman 已提交
1839 1840

	if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
1841
		goto fail;
C
Cliff Wickman 已提交
1842

1843
	kfree(uvhub_descs);
C
Cliff Wickman 已提交
1844
	kfree(uvhub_mask);
C
Cliff Wickman 已提交
1845
	init_per_cpu_tunables();
1846
	return 0;
1847 1848 1849 1850 1851

fail:
	kfree(uvhub_descs);
	kfree(uvhub_mask);
	return 1;
1852 1853 1854 1855 1856 1857 1858
}

/*
 * Initialization of BAU-related structures
 */
static int __init uv_bau_init(void)
{
1859 1860 1861
	int uvhub;
	int pnode;
	int nuvhubs;
1862
	int cur_cpu;
C
Cliff Wickman 已提交
1863
	int cpus;
1864
	int vector;
C
Cliff Wickman 已提交
1865
	cpumask_var_t *mask;
1866 1867 1868

	if (!is_uv_system())
		return 0;
1869

1870 1871 1872
	if (nobau)
		return 0;

C
Cliff Wickman 已提交
1873 1874 1875 1876
	for_each_possible_cpu(cur_cpu) {
		mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
		zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
	}
1877

1878
	nuvhubs = uv_num_possible_blades();
1879
	spin_lock_init(&disable_lock);
C
Cliff Wickman 已提交
1880
	congested_cycles = usec_2_cycles(congested_respns_us);
1881

C
Cliff Wickman 已提交
1882
	uv_base_pnode = 0x7fffffff;
1883
	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
C
Cliff Wickman 已提交
1884 1885 1886
		cpus = uv_blade_nr_possible_cpus(uvhub);
		if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
			uv_base_pnode = uv_blade_to_pnode(uvhub);
1887 1888
	}

C
Cliff Wickman 已提交
1889
	if (init_per_cpu(nuvhubs, uv_base_pnode)) {
1890 1891 1892
		nobau = 1;
		return 0;
	}
1893 1894 1895 1896

	vector = UV_BAU_MESSAGE;
	for_each_possible_blade(uvhub)
		if (uv_blade_nr_possible_cpus(uvhub))
C
Cliff Wickman 已提交
1897
			init_uvhub(uvhub, vector, uv_base_pnode);
1898

C
Cliff Wickman 已提交
1899
	enable_timeouts();
1900 1901 1902
	alloc_intr_gate(vector, uv_bau_message_intr1);

	for_each_possible_blade(uvhub) {
1903
		if (uv_blade_nr_possible_cpus(uvhub)) {
C
Cliff Wickman 已提交
1904 1905
			unsigned long val;
			unsigned long mmr;
1906 1907
			pnode = uv_blade_to_pnode(uvhub);
			/* INIT the bau */
C
Cliff Wickman 已提交
1908 1909
			val = 1L << 63;
			write_gmmr_activation(pnode, val);
1910
			mmr = 1; /* should be 1 to broadcast to both sockets */
1911 1912
			if (!is_uv1_hub())
				write_mmr_data_broadcast(pnode, mmr);
1913
		}
1914
	}
1915

1916 1917
	return 0;
}
1918
core_initcall(uv_bau_init);
1919
fs_initcall(uv_ptc_init);