/*
 *	SGI UltraViolet TLB flush routines.
 *
 *	(c) 2008-2014 Cliff Wickman <cpw@sgi.com>, SGI.
 *
 *	This code is released under the GNU General Public License version 2 or
 *	later.
 */
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/delay.h>

#include <asm/mmu_context.h>
#include <asm/uv/uv.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_bau.h>
#include <asm/apic.h>
#include <asm/tsc.h>
#include <asm/irq_vectors.h>
#include <asm/timer.h>

static struct bau_operations ops;

static struct bau_operations uv123_bau_ops = {
	.bau_gpa_to_offset       = uv_gpa_to_offset,
	.read_l_sw_ack           = read_mmr_sw_ack,
	.read_g_sw_ack           = read_gmmr_sw_ack,
	.write_l_sw_ack          = write_mmr_sw_ack,
	.write_g_sw_ack          = write_gmmr_sw_ack,
	.write_payload_first     = write_mmr_payload_first,
	.write_payload_last      = write_mmr_payload_last,
};

static struct bau_operations uv4_bau_ops = {
	.bau_gpa_to_offset       = uv_gpa_to_soc_phys_ram,
	.read_l_sw_ack           = read_mmr_proc_sw_ack,
	.read_g_sw_ack           = read_gmmr_proc_sw_ack,
	.write_l_sw_ack          = write_mmr_proc_sw_ack,
	.write_g_sw_ack          = write_gmmr_proc_sw_ack,
	.write_payload_first     = write_mmr_proc_payload_first,
	.write_payload_last      = write_mmr_proc_payload_last,
};
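/*
 * Note (summary, not present in the original comments): which of the two
 * bau_operations sets above is copied into 'ops' is chosen at init time
 * according to the hub generation (UV1-3 vs. UV4), since the software-ack
 * and payload MMRs moved in UV4.
 */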


/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
static int timeout_base_ns[] = {
		20,
		160,
		1280,
		10240,
		81920,
		655360,
		5242880,
		167772160
};
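/*
 * Illustrative example: an urgency7 field value of 3 in that register
 * selects timeout_base_ns[3], i.e. a 10240 ns base timeout.
 */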

static int timeout_us;
static bool nobau = true;
static int nobau_perm;
static cycles_t congested_cycles;

/* tunables: */
static int max_concurr		= MAX_BAU_CONCURRENT;
static int max_concurr_const	= MAX_BAU_CONCURRENT;
static int plugged_delay	= PLUGGED_DELAY;
static int plugsb4reset		= PLUGSB4RESET;
static int giveup_limit		= GIVEUP_LIMIT;
static int timeoutsb4reset	= TIMEOUTSB4RESET;
static int ipi_reset_limit	= IPI_RESET_LIMIT;
static int complete_threshold	= COMPLETE_THRESHOLD;
static int congested_respns_us	= CONGESTED_RESPONSE_US;
static int congested_reps	= CONGESTED_REPS;
static int disabled_period	= DISABLED_PERIOD;

static struct tunables tunables[] = {
	{&max_concurr,           MAX_BAU_CONCURRENT}, /* must be [0] */
	{&plugged_delay,         PLUGGED_DELAY},
	{&plugsb4reset,          PLUGSB4RESET},
	{&timeoutsb4reset,       TIMEOUTSB4RESET},
	{&ipi_reset_limit,       IPI_RESET_LIMIT},
	{&complete_threshold,    COMPLETE_THRESHOLD},
	{&congested_respns_us,   CONGESTED_RESPONSE_US},
	{&congested_reps,        CONGESTED_REPS},
	{&disabled_period,       DISABLED_PERIOD},
	{&giveup_limit,          GIVEUP_LIMIT}
};

static struct dentry *tunables_dir;
static struct dentry *tunables_file;

/* these correspond to the statistics printed by ptc_seq_show() */
static char *stat_description[] = {
	"sent:     number of shootdown messages sent",
	"stime:    time spent sending messages",
	"numuvhubs: number of hubs targeted with shootdown",
	"numuvhubs16: number times 16 or more hubs targeted",
	"numuvhubs8: number times 8 or more hubs targeted",
	"numuvhubs4: number times 4 or more hubs targeted",
	"numuvhubs2: number times 2 or more hubs targeted",
	"numuvhubs1: number times 1 hub targeted",
	"numcpus:  number of cpus targeted with shootdown",
	"dto:      number of destination timeouts",
	"retries:  destination timeout retries sent",
	"rok:   :  destination timeouts successfully retried",
	"resetp:   ipi-style resource resets for plugs",
	"resett:   ipi-style resource resets for timeouts",
	"giveup:   fall-backs to ipi-style shootdowns",
	"sto:      number of source timeouts",
	"bz:       number of stay-busy's",
	"throt:    number times spun in throttle",
	"swack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
	"recv:     shootdown messages received",
	"rtime:    time spent processing messages",
	"all:      shootdown all-tlb messages",
	"one:      shootdown one-tlb messages",
	"mult:     interrupts that found multiple messages",
	"none:     interrupts that found no messages",
	"retry:    number of retry messages processed",
	"canc:     number messages canceled by retries",
	"nocan:    number retries that found nothing to cancel",
	"reset:    number of ipi-style reset requests processed",
	"rcan:     number messages canceled by reset requests",
	"disable:  number times use of the BAU was disabled",
	"enable:   number times use of the BAU was re-enabled"
};

static int __init setup_bau(char *arg)
{
	int result;

	if (!arg)
		return -EINVAL;

	result = strtobool(arg, &nobau);
	if (result)
		return result;

	/* we need to flip the logic here, so that bau=y sets nobau to false */
	nobau = !nobau;

	if (!nobau)
		pr_info("UV BAU Enabled\n");
	else
		pr_info("UV BAU Disabled\n");

	return 0;
}
early_param("bau", setup_bau);
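/*
 * Example (assuming the standard strtobool syntax): booting with "bau=0"
 * leaves nobau true, so the BAU stays disabled; "bau=1" clears nobau and
 * prints "UV BAU Enabled".
 */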

/* base pnode in this partition */
static int uv_base_pnode __read_mostly;

static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);

static void
set_bau_on(void)
{
	int cpu;
	struct bau_control *bcp;

	if (nobau_perm) {
		pr_info("BAU not initialized; cannot be turned on\n");
		return;
	}
	nobau = false;
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->nobau = false;
	}
	pr_info("BAU turned on\n");
	return;
}

static void
set_bau_off(void)
{
	int cpu;
	struct bau_control *bcp;

	nobau = true;
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->nobau = true;
	}
	pr_info("BAU turned off\n");
	return;
}

/*
 * Determine the first node on a uvhub. 'Nodes' are used for kernel
 * memory allocation.
 */
static int __init uvhub_to_first_node(int uvhub)
{
	int node, b;

	for_each_online_node(node) {
		b = uv_node_to_blade_id(node);
		if (uvhub == b)
			return node;
	}
	return -1;
}

/*
 * Determine the apicid of the first cpu on a uvhub.
 */
static int __init uvhub_to_first_apicid(int uvhub)
{
	int cpu;

	for_each_present_cpu(cpu)
		if (uvhub == uv_cpu_to_blade_id(cpu))
			return per_cpu(x86_cpu_to_apicid, cpu);
	return -1;
}

/*
 * Free a software acknowledge hardware resource by clearing its Pending
 * bit. This will return a reply to the sender.
 * If the message has timed out, a reply has already been sent by the
 * hardware but the resource has not been released. In that case our
 * clear of the Timeout bit (as well) will free the resource. No reply will
 * be sent (the hardware will only do one reply per message).
 */
static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
						int do_acknowledge)
{
	unsigned long dw;
	struct bau_pq_entry *msg;

	msg = mdp->msg;
	if (!msg->canceled && do_acknowledge) {
		dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
		ops.write_l_sw_ack(dw);
	}
	msg->replied_to = 1;
	msg->swack_vec = 0;
}

/*
 * Process the receipt of a RETRY message
 */
static void bau_process_retry_msg(struct msg_desc *mdp,
					struct bau_control *bcp)
{
	int i;
	int cancel_count = 0;
	unsigned long msg_res;
	unsigned long mmr = 0;
	struct bau_pq_entry *msg = mdp->msg;
	struct bau_pq_entry *msg2;
	struct ptc_stats *stat = bcp->statp;

	stat->d_retries++;
	/*
	 * cancel any message from msg+1 to the retry itself
	 */
	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
		if (msg2 > mdp->queue_last)
			msg2 = mdp->queue_first;
		if (msg2 == msg)
			break;

		/* same conditions for cancellation as do_reset */
		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
		    (msg2->swack_vec) && ((msg2->swack_vec &
			msg->swack_vec) == 0) &&
		    (msg2->sending_cpu == msg->sending_cpu) &&
		    (msg2->msg_type != MSG_NOOP)) {
			mmr = ops.read_l_sw_ack();
			msg_res = msg2->swack_vec;
			/*
			 * This is a message retry; clear the resources held
			 * by the previous message only if they timed out.
			 * If it has not timed out we have an unexpected
			 * situation to report.
			 */
			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
				unsigned long mr;
				/*
				 * Is the resource timed out?
				 * Make everyone ignore the cancelled message.
				 */
				msg2->canceled = 1;
				stat->d_canceled++;
				cancel_count++;
				mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
				ops.write_l_sw_ack(mr);
			}
		}
	}
	if (!cancel_count)
		stat->d_nocanceled++;
}

/*
 * Do all the things a cpu should do for a TLB shootdown message.
 * Other cpu's may come here at the same time for this message.
 */
static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
						int do_acknowledge)
{
	short socket_ack_count = 0;
	short *sp;
	struct atomic_short *asp;
	struct ptc_stats *stat = bcp->statp;
	struct bau_pq_entry *msg = mdp->msg;
	struct bau_control *smaster = bcp->socket_master;

	/*
	 * This must be a normal message, or retry of a normal message
	 */
	if (msg->address == TLB_FLUSH_ALL) {
		local_flush_tlb();
		stat->d_alltlb++;
	} else {
		__flush_tlb_one(msg->address);
		stat->d_onetlb++;
	}
	stat->d_requestee++;

	/*
	 * One cpu on each uvhub has the additional job on a RETRY
	 * of releasing the resource held by the message that is
	 * being retried.  That message is identified by sending
	 * cpu number.
	 */
	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
		bau_process_retry_msg(mdp, bcp);

	/*
	 * This is a swack message, so we have to reply to it.
	 * Count each responding cpu on the socket. This avoids
	 * pinging the count's cache line back and forth between
	 * the sockets.
	 */
	sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
	asp = (struct atomic_short *)sp;
	socket_ack_count = atom_asr(1, asp);
	if (socket_ack_count == bcp->cpus_in_socket) {
		int msg_ack_count;
		/*
		 * Both sockets dump their completed count total into
		 * the message's count.
		 */
		*sp = 0;
		asp = (struct atomic_short *)&msg->acknowledge_count;
		msg_ack_count = atom_asr(socket_ack_count, asp);

		if (msg_ack_count == bcp->cpus_in_uvhub) {
			/*
			 * All cpus in uvhub saw it; reply
			 * (unless we are in the UV2 workaround)
			 */
			reply_to_message(mdp, bcp, do_acknowledge);
		}
	}

	return;
}

/*
 * Determine the first cpu on a pnode.
 */
static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)
{
	int cpu;
	struct hub_and_pnode *hpp;

	for_each_present_cpu(cpu) {
		hpp = &smaster->thp[cpu];
		if (pnode == hpp->pnode)
			return cpu;
	}
	return -1;
}

/*
 * Last resort when we get a large number of destination timeouts is
 * to clear resources held by a given cpu.
 * Do this with IPI so that all messages in the BAU message queue
 * can be identified by their nonzero swack_vec field.
 *
 * This is entered for a single cpu on the uvhub.
 * The sender wants this uvhub to free a specific message's
 * swack resources.
 */
static void do_reset(void *ptr)
{
	int i;
	struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
	struct reset_args *rap = (struct reset_args *)ptr;
	struct bau_pq_entry *msg;
	struct ptc_stats *stat = bcp->statp;

	stat->d_resets++;
	/*
	 * We're looking for the given sender, and
	 * will free its swack resource.
	 * If all cpu's finally responded after the timeout, its
	 * message 'replied_to' was set.
	 */
	for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
		unsigned long msg_res;
		/* do_reset: same conditions for cancellation as
		   bau_process_retry_msg() */
		if ((msg->replied_to == 0) &&
		    (msg->canceled == 0) &&
		    (msg->sending_cpu == rap->sender) &&
		    (msg->swack_vec) &&
		    (msg->msg_type != MSG_NOOP)) {
			unsigned long mmr;
			unsigned long mr;
			/*
			 * make everyone else ignore this message
			 */
			msg->canceled = 1;
			/*
			 * only reset the resource if it is still pending
			 */
			mmr = ops.read_l_sw_ack();
			msg_res = msg->swack_vec;
			mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
			if (mmr & msg_res) {
				stat->d_rcanceled++;
				ops.write_l_sw_ack(mr);
			}
		}
	}
	return;
}

/*
 * Use IPI to get all target uvhubs to release resources held by
 * a given sending cpu number.
 */
static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
{
	int pnode;
	int apnode;
	int maskbits;
	int sender = bcp->cpu;
	cpumask_t *mask = bcp->uvhub_master->cpumask;
	struct bau_control *smaster = bcp->socket_master;
	struct reset_args reset_args;

	reset_args.sender = sender;
	cpumask_clear(mask);
	/* find a single cpu for each uvhub in this distribution mask */
	maskbits = sizeof(struct pnmask) * BITSPERBYTE;
	/* each bit is a pnode relative to the partition base pnode */
	for (pnode = 0; pnode < maskbits; pnode++) {
		int cpu;
		if (!bau_uvhub_isset(pnode, distribution))
			continue;
		apnode = pnode + bcp->partition_base_pnode;
		cpu = pnode_to_first_cpu(apnode, smaster);
		cpumask_set_cpu(cpu, mask);
	}

	/* IPI all cpus; preemption is already disabled */
	smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);
	return;
}

/*
 * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
 * number, not an absolute. It converts a duration in cycles to a duration in
 * ns.
 */
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	struct cyc2ns_data *data = cyc2ns_read_begin();
	unsigned long long ns;

	ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);

	cyc2ns_read_end(data);
	return ns;
}

/*
 * The reverse of the above; converts a duration in ns to a duration in cycles.
 */
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
	struct cyc2ns_data *data = cyc2ns_read_begin();
	unsigned long long cyc;

	cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;

	cyc2ns_read_end(data);
	return cyc;
}

static inline unsigned long cycles_2_us(unsigned long long cyc)
{
	return cycles_2_ns(cyc) / NSEC_PER_USEC;
}

static inline cycles_t sec_2_cycles(unsigned long sec)
{
	return ns_2_cycles(sec * NSEC_PER_SEC);
}

static inline unsigned long long usec_2_cycles(unsigned long usec)
{
	return ns_2_cycles(usec * NSEC_PER_USEC);
}
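/*
 * Illustrative only (assumes a hypothetical 2.5 GHz TSC): cycles_2_us(2500)
 * would be about 1 us and usec_2_cycles(1) about 2500, since both directions
 * use the kernel's cyc2ns scale factors rather than a fixed frequency.
 */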

/*
 * wait for all cpus on this hub to finish their sends and go quiet
 * leaves uvhub_quiesce set so that no new broadcasts are started by
 * bau_flush_send_and_wait()
 */
static inline void quiesce_local_uvhub(struct bau_control *hmaster)
{
	atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
}

/*
 * mark this quiet-requestor as done
 */
static inline void end_uvhub_quiesce(struct bau_control *hmaster)
{
	atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
}

static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
{
	unsigned long descriptor_status;

	descriptor_status = uv_read_local_mmr(mmr_offset);
	descriptor_status >>= right_shift;
	descriptor_status &= UV_ACT_STATUS_MASK;
	return descriptor_status;
}

/*
 * Wait for completion of a broadcast software ack message
 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
 */
static int uv1_wait_completion(struct bau_desc *bau_desc,
				unsigned long mmr_offset, int right_shift,
				struct bau_control *bcp, long try)
{
	unsigned long descriptor_status;
	cycles_t ttm;
	struct ptc_stats *stat = bcp->statp;

	descriptor_status = uv1_read_status(mmr_offset, right_shift);
	/* spin on the status MMR, waiting for it to go idle */
	while ((descriptor_status != DS_IDLE)) {
		/*
		 * Our software ack messages may be blocked because
		 * there are no swack resources available.  As long
		 * as none of them has timed out hardware will NACK
		 * our message and its state will stay IDLE.
		 */
		if (descriptor_status == DS_SOURCE_TIMEOUT) {
			stat->s_stimeout++;
			return FLUSH_GIVEUP;
		} else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
			stat->s_dtimeout++;
			ttm = get_cycles();

			/*
			 * Our retries may be blocked by all destination
			 * swack resources being consumed, and a timeout
			 * pending.  In that case hardware returns the
			 * ERROR that looks like a destination timeout.
			 */
			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
				bcp->conseccompletes = 0;
				return FLUSH_RETRY_PLUGGED;
			}

			bcp->conseccompletes = 0;
			return FLUSH_RETRY_TIMEOUT;
		} else {
			/*
			 * descriptor_status is still BUSY
			 */
			cpu_relax();
		}
		descriptor_status = uv1_read_status(mmr_offset, right_shift);
	}
	bcp->conseccompletes++;
	return FLUSH_COMPLETE;
}

/*
 * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
 * But not currently used.
 */
static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc)
{
	return ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
}

/*
 * Return whether the status of the descriptor that is normally used for this
 * cpu (the one indexed by its hub-relative cpu number) is busy.
 * The status of the original 32 descriptors is always reflected in the 64
 * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0.
 * The bit provided by the activation_status_2 register is irrelevant to
 * the status if it is only being tested for busy or not busy.
 */
int normal_busy(struct bau_control *bcp)
{
	int cpu = bcp->uvhub_cpu;
	int mmr_offset;
	int right_shift;

	mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
	right_shift = cpu * UV_ACT_STATUS_SIZE;
	return (((((read_lmmr(mmr_offset) >> right_shift) &
				UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY);
}

/*
 * Entered when a bau descriptor has gone into a permanent busy wait because
 * of a hardware bug.
 * Workaround the bug.
 */
int handle_uv2_busy(struct bau_control *bcp)
{
	struct ptc_stats *stat = bcp->statp;

	stat->s_uv2_wars++;
	bcp->busy = 1;
	return FLUSH_GIVEUP;
}

static int uv2_3_wait_completion(struct bau_desc *bau_desc,
				unsigned long mmr_offset, int right_shift,
				struct bau_control *bcp, long try)
{
	unsigned long descriptor_stat;
	cycles_t ttm;
	int desc = bcp->uvhub_cpu;
	long busy_reps = 0;
	struct ptc_stats *stat = bcp->statp;

	descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);

	/* spin on the status MMR, waiting for it to go idle */
	while (descriptor_stat != UV2H_DESC_IDLE) {
		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
			/*
			 * A h/w bug on the destination side may
			 * have prevented the message being marked
			 * pending, thus it doesn't get replied to
			 * and gets continually nacked until it times
			 * out with a SOURCE_TIMEOUT.
			 */
			stat->s_stimeout++;
			return FLUSH_GIVEUP;
		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
			ttm = get_cycles();

			/*
			 * Our retries may be blocked by all destination
			 * swack resources being consumed, and a timeout
			 * pending.  In that case hardware returns the
			 * ERROR that looks like a destination timeout.
			 * Without using the extended status we have to
			 * deduce from the short time that this was a
			 * strong nack.
			 */
			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
				bcp->conseccompletes = 0;
				stat->s_plugged++;
				/* FLUSH_RETRY_PLUGGED causes hang on boot */
				return FLUSH_GIVEUP;
			}
			stat->s_dtimeout++;
			bcp->conseccompletes = 0;
			/* FLUSH_RETRY_TIMEOUT causes hang on boot */
			return FLUSH_GIVEUP;
		} else {
			busy_reps++;
			if (busy_reps > 1000000) {
				/* not to hammer on the clock */
				busy_reps = 0;
				ttm = get_cycles();
				if ((ttm - bcp->send_message) > bcp->timeout_interval)
					return handle_uv2_busy(bcp);
			}
			/*
			 * descriptor_stat is still BUSY
			 */
			cpu_relax();
		}
		descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);
	}
	bcp->conseccompletes++;
	return FLUSH_COMPLETE;
}

/*
 * There are 2 status registers; each an array[32] of 2 bits. Set up for
 * which register to read and position in that register based on cpu in
 * current hub.
 */
static int wait_completion(struct bau_desc *bau_desc, struct bau_control *bcp, long try)
{
	int right_shift;
	unsigned long mmr_offset;
	int desc = bcp->uvhub_cpu;

	if (desc < UV_CPUS_PER_AS) {
		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
		right_shift = desc * UV_ACT_STATUS_SIZE;
	} else {
		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
		right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
	}

	if (bcp->uvhub_version == 1)
		return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
	else
		return uv2_3_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
}

/*
 * Our retries are blocked by all destination sw ack resources being
 * in use, and a timeout is pending. In that case hardware immediately
 * returns the ERROR that looks like a destination timeout.
 */
static void destination_plugged(struct bau_desc *bau_desc,
			struct bau_control *bcp,
			struct bau_control *hmaster, struct ptc_stats *stat)
{
	udelay(bcp->plugged_delay);
	bcp->plugged_tries++;

	if (bcp->plugged_tries >= bcp->plugsb4reset) {
		bcp->plugged_tries = 0;

		quiesce_local_uvhub(hmaster);

		spin_lock(&hmaster->queue_lock);
		reset_with_ipi(&bau_desc->distribution, bcp);
		spin_unlock(&hmaster->queue_lock);

		end_uvhub_quiesce(hmaster);

		bcp->ipi_attempts++;
		stat->s_resets_plug++;
	}
}

static void destination_timeout(struct bau_desc *bau_desc,
			struct bau_control *bcp, struct bau_control *hmaster,
			struct ptc_stats *stat)
{
	hmaster->max_concurr = 1;
	bcp->timeout_tries++;
	if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
		bcp->timeout_tries = 0;

		quiesce_local_uvhub(hmaster);

		spin_lock(&hmaster->queue_lock);
		reset_with_ipi(&bau_desc->distribution, bcp);
		spin_unlock(&hmaster->queue_lock);

		end_uvhub_quiesce(hmaster);

		bcp->ipi_attempts++;
		stat->s_resets_timeout++;
	}
}

/*
 * Stop all cpus on a uvhub from using the BAU for a period of time.
 * This is reversed by check_enable.
 */
static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
{
	int tcpu;
	struct bau_control *tbcp;
	struct bau_control *hmaster;
	cycles_t tm1;

	hmaster = bcp->uvhub_master;
	spin_lock(&hmaster->disable_lock);
	if (!bcp->baudisabled) {
		stat->s_bau_disabled++;
		tm1 = get_cycles();
		for_each_present_cpu(tcpu) {
			tbcp = &per_cpu(bau_control, tcpu);
			if (tbcp->uvhub_master == hmaster) {
				tbcp->baudisabled = 1;
				tbcp->set_bau_on_time =
					tm1 + bcp->disabled_period;
			}
		}
	}
	spin_unlock(&hmaster->disable_lock);
}

static void count_max_concurr(int stat, struct bau_control *bcp,
				struct bau_control *hmaster)
{
	bcp->plugged_tries = 0;
	bcp->timeout_tries = 0;
	if (stat != FLUSH_COMPLETE)
		return;
	if (bcp->conseccompletes <= bcp->complete_threshold)
		return;
	if (hmaster->max_concurr >= hmaster->max_concurr_const)
		return;
	hmaster->max_concurr++;
}

static void record_send_stats(cycles_t time1, cycles_t time2,
		struct bau_control *bcp, struct ptc_stats *stat,
		int completion_status, int try)
{
	cycles_t elapsed;

	if (time2 > time1) {
		elapsed = time2 - time1;
		stat->s_time += elapsed;

		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
			bcp->period_requests++;
			bcp->period_time += elapsed;
			if ((elapsed > congested_cycles) &&
			    (bcp->period_requests > bcp->cong_reps) &&
			    ((bcp->period_time / bcp->period_requests) >
							congested_cycles)) {
				stat->s_congested++;
				disable_for_period(bcp, stat);
			}
		}
	} else
		stat->s_requestor--;

	if (completion_status == FLUSH_COMPLETE && try > 1)
		stat->s_retriesok++;
	else if (completion_status == FLUSH_GIVEUP) {
		stat->s_giveup++;
		if (get_cycles() > bcp->period_end)
			bcp->period_giveups = 0;
		bcp->period_giveups++;
		if (bcp->period_giveups == 1)
			bcp->period_end = get_cycles() + bcp->disabled_period;
		if (bcp->period_giveups > bcp->giveup_limit) {
			disable_for_period(bcp, stat);
			stat->s_giveuplimit++;
		}
	}
}

/*
 * Because of a uv1 hardware bug only a limited number of concurrent
 * requests can be made.
 */
static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
{
	spinlock_t *lock = &hmaster->uvhub_lock;
	atomic_t *v;

	v = &hmaster->active_descriptor_count;
	if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
		stat->s_throttles++;
		do {
			cpu_relax();
		} while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
	}
}
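/*
 * Illustrative note (assuming MAX_BAU_CONCURRENT is a small single-digit
 * default): at most that many descriptors may be active per uvhub at once on
 * UV1; additional senders spin in the loop above until a slot frees up.
 */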

/*
 * Handle the completion status of a message send.
 */
static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
			struct bau_control *bcp, struct bau_control *hmaster,
			struct ptc_stats *stat)
{
	if (completion_status == FLUSH_RETRY_PLUGGED)
		destination_plugged(bau_desc, bcp, hmaster, stat);
	else if (completion_status == FLUSH_RETRY_TIMEOUT)
		destination_timeout(bau_desc, bcp, hmaster, stat);
}

/*
 * Send a broadcast and wait for it to complete.
 *
 * The flush_mask contains the cpus the broadcast is to be sent to including
 * cpus that are on the local uvhub.
 *
 * Returns 0 if all flushing represented in the mask was done.
 * Returns 1 if it gives up entirely and the original cpu mask is to be
 * returned to the kernel.
 */
int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
	struct bau_desc *bau_desc)
{
	int seq_number = 0;
	int completion_stat = 0;
	int uv1 = 0;
	long try = 0;
	unsigned long index;
	cycles_t time1;
	cycles_t time2;
	struct ptc_stats *stat = bcp->statp;
	struct bau_control *hmaster = bcp->uvhub_master;
	struct uv1_bau_msg_header *uv1_hdr = NULL;
	struct uv2_3_bau_msg_header *uv2_3_hdr = NULL;

	if (bcp->uvhub_version == 1) {
		uv1 = 1;
		uv1_throttle(hmaster, stat);
	}

	while (hmaster->uvhub_quiesce)
		cpu_relax();

	time1 = get_cycles();
	if (uv1)
		uv1_hdr = &bau_desc->header.uv1_hdr;
	else
		/* uv2 and uv3 */
		uv2_3_hdr = &bau_desc->header.uv2_3_hdr;

	do {
		if (try == 0) {
			if (uv1)
				uv1_hdr->msg_type = MSG_REGULAR;
			else
				uv2_3_hdr->msg_type = MSG_REGULAR;
			seq_number = bcp->message_number++;
		} else {
			if (uv1)
				uv1_hdr->msg_type = MSG_RETRY;
			else
				uv2_3_hdr->msg_type = MSG_RETRY;
			stat->s_retry_messages++;
		}

		if (uv1)
			uv1_hdr->sequence = seq_number;
		else
			uv2_3_hdr->sequence = seq_number;
		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
		bcp->send_message = get_cycles();

		write_mmr_activation(index);

		try++;
		completion_stat = wait_completion(bau_desc, bcp, try);

		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);

		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
			bcp->ipi_attempts = 0;
			stat->s_overipilimit++;
			completion_stat = FLUSH_GIVEUP;
			break;
		}
		cpu_relax();
	} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
		 (completion_stat == FLUSH_RETRY_TIMEOUT));

	time2 = get_cycles();

	count_max_concurr(completion_stat, bcp, hmaster);

	while (hmaster->uvhub_quiesce)
		cpu_relax();

	atomic_dec(&hmaster->active_descriptor_count);

	record_send_stats(time1, time2, bcp, stat, completion_stat, try);

	if (completion_stat == FLUSH_GIVEUP)
		/* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
		return 1;
	return 0;
}

/*
 * The BAU is disabled for this uvhub. When the disabled time period has
 * expired re-enable it.
 * Return 0 if it is re-enabled for all cpus on this uvhub.
 */
static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
{
	int tcpu;
	struct bau_control *tbcp;
	struct bau_control *hmaster;

	hmaster = bcp->uvhub_master;
	spin_lock(&hmaster->disable_lock);
	if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
		stat->s_bau_reenabled++;
		for_each_present_cpu(tcpu) {
			tbcp = &per_cpu(bau_control, tcpu);
			if (tbcp->uvhub_master == hmaster) {
				tbcp->baudisabled = 0;
				tbcp->period_requests = 0;
				tbcp->period_time = 0;
				tbcp->period_giveups = 0;
			}
		}
		spin_unlock(&hmaster->disable_lock);
		return 0;
	}
	spin_unlock(&hmaster->disable_lock);
	return -1;
}

static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
				int remotes, struct bau_desc *bau_desc)
{
	stat->s_requestor++;
	stat->s_ntargcpu += remotes + locals;
	stat->s_ntargremotes += remotes;
	stat->s_ntarglocals += locals;

	/* uvhub statistics */
	hubs = bau_uvhub_weight(&bau_desc->distribution);
	if (locals) {
		stat->s_ntarglocaluvhub++;
		stat->s_ntargremoteuvhub += (hubs - 1);
	} else
		stat->s_ntargremoteuvhub += hubs;

	stat->s_ntarguvhub += hubs;

	if (hubs >= 16)
		stat->s_ntarguvhub16++;
	else if (hubs >= 8)
		stat->s_ntarguvhub8++;
	else if (hubs >= 4)
		stat->s_ntarguvhub4++;
	else if (hubs >= 2)
		stat->s_ntarguvhub2++;
	else
		stat->s_ntarguvhub1++;
}

/*
 * Translate a cpu mask to the uvhub distribution mask in the BAU
 * activation descriptor.
 */
static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
			struct bau_desc *bau_desc, int *localsp, int *remotesp)
{
	int cpu;
	int pnode;
	int cnt = 0;
	struct hub_and_pnode *hpp;

	for_each_cpu(cpu, flush_mask) {
		/*
		 * The distribution vector is a bit map of pnodes, relative
		 * to the partition base pnode (and the partition base nasid
		 * in the header).
		 * Translate cpu to pnode and hub using a local memory array.
		 */
		hpp = &bcp->socket_master->thp[cpu];
		pnode = hpp->pnode - bcp->partition_base_pnode;
		bau_uvhub_set(pnode, &bau_desc->distribution);
		cnt++;
		if (hpp->uvhub == bcp->uvhub)
			(*localsp)++;
		else
			(*remotesp)++;
	}
	if (!cnt)
		return 1;
	return 0;
}

/*
 * globally purge translation cache of a virtual address or all TLB's
 * @cpumask: mask of all cpu's in which the address is to be removed
 * @mm: mm_struct containing virtual address range
 * @start: start virtual address to be removed from TLB
 * @end: end virtual address to be remove from TLB
 * @cpu: the current cpu
 *
 * This is the entry point for initiating any UV global TLB shootdown.
 *
 * Purges the translation caches of all specified processors of the given
 * virtual address, or purges all TLB's on specified processors.
 *
 * The caller has derived the cpumask from the mm_struct.  This function
 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
 *
 * The cpumask is converted into a uvhubmask of the uvhubs containing
 * those cpus.
 *
 * Note that this function should be called with preemption disabled.
 *
 * Returns NULL if all remote flushing was done.
 * Returns pointer to cpumask if some remote flushing remains to be
 * done.  The returned pointer is valid till preemption is re-enabled.
 */
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
						struct mm_struct *mm,
						unsigned long start,
						unsigned long end,
						unsigned int cpu)
{
	int locals = 0;
	int remotes = 0;
	int hubs = 0;
	struct bau_desc *bau_desc;
	struct cpumask *flush_mask;
	struct ptc_stats *stat;
	struct bau_control *bcp;
	unsigned long descriptor_status;
	unsigned long status;

	bcp = &per_cpu(bau_control, cpu);

	if (bcp->nobau)
		return cpumask;

	stat = bcp->statp;
	stat->s_enters++;

	if (bcp->busy) {
		descriptor_status =
			read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
		status = ((descriptor_status >> (bcp->uvhub_cpu *
			UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1;
		if (status == UV2H_DESC_BUSY)
			return cpumask;
		bcp->busy = 0;
	}

	/* bau was disabled due to slow response */
	if (bcp->baudisabled) {
		if (check_enable(bcp, stat)) {
			stat->s_ipifordisabled++;
			return cpumask;
		}
	}

	/*
	 * Each sending cpu has a per-cpu mask which it fills from the caller's
	 * cpu mask.  All cpus are converted to uvhubs and copied to the
	 * activation descriptor.
	 */
	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
	/* don't actually do a shootdown of the local cpu */
	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));

	if (cpumask_test_cpu(cpu, cpumask))
		stat->s_ntargself++;

	bau_desc = bcp->descriptor_base;
	bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
		return NULL;

	record_send_statistics(stat, locals, hubs, remotes, bau_desc);

	if (!end || (end - start) <= PAGE_SIZE)
		bau_desc->payload.address = start;
	else
		bau_desc->payload.address = TLB_FLUSH_ALL;
	bau_desc->payload.sending_cpu = cpu;
	/*
	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
	 * or 1 if it gave up and the original cpumask should be returned.
	 */
	if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))
		return NULL;
	else
		return cpumask;
}

/*
 * Search the message queue for any 'other' unprocessed message with the
 * same software acknowledge resource bit vector as the 'msg' message.
 */
struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
					   struct bau_control *bcp)
{
	struct bau_pq_entry *msg_next = msg + 1;
	unsigned char swack_vec = msg->swack_vec;

	if (msg_next > bcp->queue_last)
		msg_next = bcp->queue_first;
	while (msg_next != msg) {
		if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) &&
				(msg_next->swack_vec == swack_vec))
			return msg_next;
		msg_next++;
		if (msg_next > bcp->queue_last)
			msg_next = bcp->queue_first;
	}
	return NULL;
}

/*
 * UV2 needs to work around a bug in which an arriving message has not
 * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
 * Such a message must be ignored.
 */
void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
{
	unsigned long mmr_image;
	unsigned char swack_vec;
	struct bau_pq_entry *msg = mdp->msg;
	struct bau_pq_entry *other_msg;

	mmr_image = ops.read_l_sw_ack();
	swack_vec = msg->swack_vec;

	if ((swack_vec & mmr_image) == 0) {
		/*
		 * This message was assigned a swack resource, but no
		 * reserved acknowlegment is pending.
		 * The bug has prevented this message from setting the MMR.
		 */
		/*
		 * Some message has set the MMR 'pending' bit; it might have
		 * been another message.  Look for that message.
		 */
		other_msg = find_another_by_swack(msg, bcp);
		if (other_msg) {
			/*
			 * There is another. Process this one but do not
			 * ack it.
			 */
			bau_process_message(mdp, bcp, 0);
			/*
			 * Let the natural processing of that other message
			 * acknowledge it. Don't get the processing of sw_ack's
			 * out of order.
			 */
			return;
		}
	}

	/*
	 * Either the MMR shows this one pending a reply or there is no
	 * other message using this sw_ack, so it is safe to acknowledge it.
	 */
	bau_process_message(mdp, bcp, 1);

	return;
}

/*
 * The BAU message interrupt comes here. (registered by set_intr_gate)
 * See entry_64.S
 *
 * We received a broadcast assist message.
 *
 * Interrupts are disabled; this interrupt could represent
 * the receipt of several messages.
 *
 * All cores/threads on this hub get this interrupt.
 * The last one to see it does the software ack.
 * (the resource will not be freed until noninterruptable cpus see this
 *  interrupt; hardware may timeout the s/w ack and reply ERROR)
 */
void uv_bau_message_interrupt(struct pt_regs *regs)
{
	int count = 0;
	cycles_t time_start;
	struct bau_pq_entry *msg;
	struct bau_control *bcp;
	struct ptc_stats *stat;
	struct msg_desc msgdesc;

	ack_APIC_irq();
	time_start = get_cycles();

	bcp = &per_cpu(bau_control, smp_processor_id());
	stat = bcp->statp;

	msgdesc.queue_first = bcp->queue_first;
	msgdesc.queue_last = bcp->queue_last;

	msg = bcp->bau_msg_head;
	while (msg->swack_vec) {
		count++;

		msgdesc.msg_slot = msg - msgdesc.queue_first;
		msgdesc.msg = msg;
		if (bcp->uvhub_version == 2)
			process_uv2_message(&msgdesc, bcp);
		else
			/* no error workaround for uv1 or uv3 */
			bau_process_message(&msgdesc, bcp, 1);

		msg++;
		if (msg > msgdesc.queue_last)
			msg = msgdesc.queue_first;
		bcp->bau_msg_head = msg;
	}
	stat->d_time += (get_cycles() - time_start);
	if (!count)
		stat->d_nomsg++;
	else if (count > 1)
		stat->d_multmsg++;
}

/*
 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
 * shootdown message timeouts enabled.  The timeout does not cause
 * an interrupt, but causes an error message to be returned to
 * the sender.
 */
static void __init enable_timeouts(void)
{
	int uvhub;
	int nuvhubs;
	int pnode;
	unsigned long mmr_image;

	nuvhubs = uv_num_possible_blades();

	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
		if (!uv_blade_nr_possible_cpus(uvhub))
			continue;

		pnode = uv_blade_to_pnode(uvhub);
		mmr_image = read_mmr_misc_control(pnode);
		/*
		 * Set the timeout period and then lock it in, in three
		 * steps; captures and locks in the period.
		 *
		 * To program the period, the SOFT_ACK_MODE must be off.
		 */
		mmr_image &= ~(1L << SOFTACK_MSHIFT);
		write_mmr_misc_control(pnode, mmr_image);
		/*
		 * Set the 4-bit period.
		 */
		mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
		mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
		write_mmr_misc_control(pnode, mmr_image);
		/*
		 * UV1:
		 * Subsequent reversals of the timebase bit (3) cause an
		 * immediate timeout of one or all INTD resources as
		 * indicated in bits 2:0 (7 causes all of them to timeout).
		 */
		mmr_image |= (1L << SOFTACK_MSHIFT);
		if (is_uv2_hub()) {
			/* do not touch the legacy mode bit */
			/* hw bug workaround; do not use extended status */
			mmr_image &= ~(1L << UV2_EXT_SHFT);
		} else if (is_uv3_hub()) {
			mmr_image &= ~(1L << PREFETCH_HINT_SHFT);
			mmr_image |= (1L << SB_STATUS_SHFT);
		}
		write_mmr_misc_control(pnode, mmr_image);
	}
}

static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
{
	if (*offset < num_possible_cpus())
		return offset;
	return NULL;
}

static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
{
	(*offset)++;
	if (*offset < num_possible_cpus())
		return offset;
	return NULL;
}

static void ptc_seq_stop(struct seq_file *file, void *data)
{
}

/*
 * Display the statistics thru /proc/sgi_uv/ptc_statistics
 * 'data' points to the cpu number
 * Note: see the descriptions in stat_description[].
 */
static int ptc_seq_show(struct seq_file *file, void *data)
{
	struct ptc_stats *stat;
	struct bau_control *bcp;
	int cpu;

	cpu = *(loff_t *)data;
	if (!cpu) {
		seq_puts(file,
			 "# cpu bauoff sent stime self locals remotes ncpus localhub ");
		seq_puts(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
		seq_puts(file,
			 "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");
		seq_puts(file,
			 "rok resetp resett giveup sto bz throt disable ");
		seq_puts(file,
			 "enable wars warshw warwaits enters ipidis plugged ");
		seq_puts(file,
			 "ipiover glim cong swack recv rtime all one mult ");
		seq_puts(file, "none retry canc nocan reset rcan\n");
	}
	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
		bcp = &per_cpu(bau_control, cpu);
		if (bcp->nobau) {
			seq_printf(file, "cpu %d bau disabled\n", cpu);
			return 0;
		}
		stat = bcp->statp;
		/* source side statistics */
		seq_printf(file,
			"cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
			   cpu, bcp->nobau, stat->s_requestor,
			   cycles_2_us(stat->s_time),
			   stat->s_ntargself, stat->s_ntarglocals,
			   stat->s_ntargremotes, stat->s_ntargcpu,
			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
			   stat->s_ntarguvhub, stat->s_ntarguvhub16);
		seq_printf(file, "%ld %ld %ld %ld %ld %ld ",
			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
			   stat->s_dtimeout, stat->s_strongnacks);
		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
			   stat->s_retry_messages, stat->s_retriesok,
			   stat->s_resets_plug, stat->s_resets_timeout,
			   stat->s_giveup, stat->s_stimeout,
			   stat->s_busy, stat->s_throttles);
		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
			   stat->s_bau_disabled, stat->s_bau_reenabled,
			   stat->s_uv2_wars, stat->s_uv2_wars_hw,
			   stat->s_uv2_war_waits, stat->s_enters,
			   stat->s_ipifordisabled, stat->s_plugged,
			   stat->s_overipilimit, stat->s_giveuplimit,
			   stat->s_congested);

		/* destination side statistics */
		seq_printf(file,
			"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
			   ops.read_g_sw_ack(uv_cpu_to_pnode(cpu)),
			   stat->d_requestee, cycles_2_us(stat->d_time),
			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
			   stat->d_nocanceled, stat->d_resets,
			   stat->d_rcanceled);
	}
	return 0;
}

/*
 * Display the tunables thru debugfs
 */
static ssize_t tunables_read(struct file *file, char __user *userbuf,
				size_t count, loff_t *ppos)
{
	char *buf;
	int ret;

	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n",
		"max_concur plugged_delay plugsb4reset timeoutsb4reset",
		"ipi_reset_limit complete_threshold congested_response_us",
		"congested_reps disabled_period giveup_limit",
		max_concurr, plugged_delay, plugsb4reset,
		timeoutsb4reset, ipi_reset_limit, complete_threshold,
		congested_respns_us, congested_reps, disabled_period,
		giveup_limit);

	if (!buf)
		return -ENOMEM;

	ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
	kfree(buf);
	return ret;
}

/*
 * handle a write to /proc/sgi_uv/ptc_statistics
 * -1: reset the statistics
 *  0: display meaning of the statistics
 */
static ssize_t ptc_proc_write(struct file *file, const char __user *user,
				size_t count, loff_t *data)
{
	int cpu;
	int i;
	int elements;
	long input_arg;
	char optstr[64];
	struct ptc_stats *stat;

	if (count == 0 || count > sizeof(optstr))
		return -EINVAL;
	if (copy_from_user(optstr, user, count))
		return -EFAULT;
	optstr[count - 1] = '\0';

	if (!strcmp(optstr, "on")) {
		set_bau_on();
		return count;
	} else if (!strcmp(optstr, "off")) {
		set_bau_off();
		return count;
	}

	if (kstrtol(optstr, 10, &input_arg) < 0) {
		pr_debug("%s is invalid\n", optstr);
		return -EINVAL;
	}

	if (input_arg == 0) {
		elements = ARRAY_SIZE(stat_description);
		pr_debug("# cpu:      cpu number\n");
		pr_debug("Sender statistics:\n");
		for (i = 0; i < elements; i++)
			pr_debug("%s\n", stat_description[i]);
	} else if (input_arg == -1) {
		for_each_present_cpu(cpu) {
			stat = &per_cpu(ptcstats, cpu);
			memset(stat, 0, sizeof(struct ptc_stats));
		}
	}

	return count;
}

static int local_atoi(const char *name)
{
	int val = 0;

	for (;; name++) {
		switch (*name) {
		case '0' ... '9':
			val = 10*val+(*name-'0');
			break;
		default:
			return val;
		}
	}
}
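/*
 * local_atoi() above is a minimal helper: it accumulates leading decimal
 * digits and stops at the first non-digit, with no sign or overflow handling,
 * which is sufficient for the whitespace-separated tunables parsed below.
 */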

/*
 * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
 * Zero values reset them to defaults.
 */
static int parse_tunables_write(struct bau_control *bcp, char *instr,
				int count)
{
	char *p;
	char *q;
	int cnt = 0;
	int val;
	int e = ARRAY_SIZE(tunables);

	p = instr + strspn(instr, WHITESPACE);
	q = p;
	for (; *p; p = q + strspn(q, WHITESPACE)) {
		q = p + strcspn(p, WHITESPACE);
		cnt++;
		if (q == p)
			break;
	}
	if (cnt != e) {
		pr_info("bau tunable error: should be %d values\n", e);
		return -EINVAL;
	}

	p = instr + strspn(instr, WHITESPACE);
	q = p;
	for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
		q = p + strcspn(p, WHITESPACE);
		val = local_atoi(p);
		switch (cnt) {
		case 0:
			if (val == 0) {
				max_concurr = MAX_BAU_CONCURRENT;
				max_concurr_const = MAX_BAU_CONCURRENT;
				continue;
			}
			if (val < 1 || val > bcp->cpus_in_uvhub) {
				pr_debug(
				"Error: BAU max concurrent %d is invalid\n",
				val);
				return -EINVAL;
			}
			max_concurr = val;
			max_concurr_const = val;
			continue;
		default:
			if (val == 0)
				*tunables[cnt].tunp = tunables[cnt].deflt;
			else
				*tunables[cnt].tunp = val;
			continue;
		}
		if (q == p)
			break;
	}
	return 0;
}

/*
 * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
 */
static ssize_t tunables_write(struct file *file, const char __user *user,
				size_t count, loff_t *data)
{
	int cpu;
	int ret;
	char instr[100];
	struct bau_control *bcp;

	if (count == 0 || count > sizeof(instr)-1)
		return -EINVAL;
	if (copy_from_user(instr, user, count))
		return -EFAULT;

	instr[count] = '\0';

	cpu = get_cpu();
	bcp = &per_cpu(bau_control, cpu);
	ret = parse_tunables_write(bcp, instr, count);
	put_cpu();
	if (ret)
		return ret;

	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->max_concurr         = max_concurr;
		bcp->max_concurr_const   = max_concurr;
		bcp->plugged_delay       = plugged_delay;
		bcp->plugsb4reset        = plugsb4reset;
		bcp->timeoutsb4reset     = timeoutsb4reset;
		bcp->ipi_reset_limit     = ipi_reset_limit;
		bcp->complete_threshold  = complete_threshold;
		bcp->cong_response_us    = congested_respns_us;
		bcp->cong_reps           = congested_reps;
		bcp->disabled_period     = sec_2_cycles(disabled_period);
		bcp->giveup_limit        = giveup_limit;
	}
	return count;
}

static const struct seq_operations uv_ptc_seq_ops = {
	.start		= ptc_seq_start,
	.next		= ptc_seq_next,
	.stop		= ptc_seq_stop,
	.show		= ptc_seq_show
};

static int ptc_proc_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &uv_ptc_seq_ops);
}

static int tunables_open(struct inode *inode, struct file *file)
{
	return 0;
}

static const struct file_operations proc_uv_ptc_operations = {
	.open		= ptc_proc_open,
	.read		= seq_read,
	.write		= ptc_proc_write,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static const struct file_operations tunables_fops = {
	.open		= tunables_open,
	.read		= tunables_read,
	.write		= tunables_write,
	.llseek		= default_llseek,
};

static int __init uv_ptc_init(void)
{
	struct proc_dir_entry *proc_uv_ptc;

	if (!is_uv_system())
		return 0;

	proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
				  &proc_uv_ptc_operations);
	if (!proc_uv_ptc) {
		pr_err("unable to create %s proc entry\n",
		       UV_PTC_BASENAME);
		return -EINVAL;
	}

	tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
	if (!tunables_dir) {
		pr_err("unable to create debugfs directory %s\n",
		       UV_BAU_TUNABLES_DIR);
		return -EINVAL;
	}
	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
C
Cliff Wickman 已提交
1709
					tunables_dir, NULL, &tunables_fops);
1710
	if (!tunables_file) {
1711
		pr_err("unable to create debugfs file %s\n",
1712 1713 1714
		       UV_BAU_TUNABLES_FILE);
		return -EINVAL;
	}
1715 1716 1717 1718
	return 0;
}

/*
 * Initialize the sending side's sending buffers.
 */
static void activation_descriptor_init(int node, int pnode, int base_pnode)
{
	int i;
	int cpu;
	int uv1 = 0;
	unsigned long gpa;
	unsigned long m;
	unsigned long n;
	size_t dsize;
	struct bau_desc *bau_desc;
	struct bau_desc *bd2;
	struct uv1_bau_msg_header *uv1_hdr;
	struct uv2_3_bau_msg_header *uv2_3_hdr;
	struct bau_control *bcp;

	/*
	 * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
	 * per cpu; and one per cpu on the uvhub (ADP_SZ)
	 */
	dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
	bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
	BUG_ON(!bau_desc);

	gpa = uv_gpa(bau_desc);
	n = uv_gpa_to_gnode(gpa);
	m = ops.bau_gpa_to_offset(gpa);
	if (is_uv1_hub())
		uv1 = 1;

	/* the 14-bit pnode */
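	/* the MMR value pairs the descriptor page's gnode (n) with its offset (m) */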
	write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
	/*
	 * Initializing all 8 (ITEMS_PER_DESC) descriptors for each
	 * cpu even though we only use the first one; one descriptor can
	 * describe a broadcast to 256 uv hubs.
	 */
	for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
		memset(bd2, 0, sizeof(struct bau_desc));
		if (uv1) {
			uv1_hdr = &bd2->header.uv1_hdr;
			uv1_hdr->swack_flag = 1;
			/*
			 * The base_dest_nasid set in the message header
			 * is the nasid of the first uvhub in the partition.
			 * The bit map will indicate destination pnode numbers
			 * relative to that base. They may not be consecutive
			 * if nasid striding is being used.
			 */
			uv1_hdr->base_dest_nasid =
			                          UV_PNODE_TO_NASID(base_pnode);
			uv1_hdr->dest_subnodeid  = UV_LB_SUBNODEID;
			uv1_hdr->command         = UV_NET_ENDPOINT_INTD;
			uv1_hdr->int_both        = 1;
			/*
			 * all others need to be set to zero:
			 *   fairness chaining multilevel count replied_to
			 */
		} else {
			/*
			 * BIOS uses legacy mode, but uv2 and uv3 hardware always
			 * uses native mode for selective broadcasts.
			 */
			uv2_3_hdr = &bd2->header.uv2_3_hdr;
			uv2_3_hdr->swack_flag      = 1;
			uv2_3_hdr->base_dest_nasid =
			                          UV_PNODE_TO_NASID(base_pnode);
			uv2_3_hdr->dest_subnodeid  = UV_LB_SUBNODEID;
			uv2_3_hdr->command         = UV_NET_ENDPOINT_INTD;
		}
	}
	for_each_present_cpu(cpu) {
		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
			continue;
		bcp = &per_cpu(bau_control, cpu);
		bcp->descriptor_base = bau_desc;
	}
}

/*
 * initialize the destination side's receiving buffers
 * entered for each uvhub in the partition
 * - node is first node (kernel memory notion) on the uvhub
 * - pnode is the uvhub's physical identifier
 */
static void pq_init(int node, int pnode)
{
	int cpu;
	size_t plsize;
	char *cp;
	void *vp;
	unsigned long gnode, first, last, tail;
	struct bau_pq_entry *pqp;
	struct bau_control *bcp;

	plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
	vp = kmalloc_node(plsize, GFP_KERNEL, node);
	pqp = (struct bau_pq_entry *)vp;
	BUG_ON(!pqp);

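	/*
	 * Round the queue up to the next 32-byte boundary; the extra entry
	 * allocated above provides the slack for this adjustment.
	 */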
	cp = (char *)pqp + 31;
	pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);

	for_each_present_cpu(cpu) {
		if (pnode != uv_cpu_to_pnode(cpu))
			continue;
		/* for every cpu on this pnode: */
		bcp = &per_cpu(bau_control, cpu);
		bcp->queue_first	= pqp;
		bcp->bau_msg_head	= pqp;
		bcp->queue_last		= pqp + (DEST_Q_SIZE - 1);
	}

	first = ops.bau_gpa_to_offset(uv_gpa(pqp));
	last = ops.bau_gpa_to_offset(uv_gpa(pqp + (DEST_Q_SIZE - 1)));

	/*
	 * Pre UV4, the gnode is required to locate the payload queue
	 * and the payload queue tail must be maintained by the kernel.
	 */
	bcp = &per_cpu(bau_control, smp_processor_id());
	if (bcp->uvhub_version <= 3) {
		tail = first;
		gnode = uv_gpa_to_gnode(uv_gpa(pqp));
		first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail;
		write_mmr_payload_tail(pnode, tail);
	}

	ops.write_payload_first(pnode, first);
	ops.write_payload_last(pnode, last);

	/* in effect, all msg_type's are set to MSG_NOOP */
	memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
}

/*
 * Initialization of each UV hub's structures
 */
static void __init init_uvhub(int uvhub, int vector, int base_pnode)
{
	int node;
	int pnode;
	unsigned long apicid;

	node = uvhub_to_first_node(uvhub);
	pnode = uv_blade_to_pnode(uvhub);

	activation_descriptor_init(node, pnode, base_pnode);

	pq_init(node, pnode);
	/*
	 * The below initialization can't be in firmware because the
	 * messaging IRQ will be determined by the OS.
	 */
	apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
	write_mmr_data_config(pnode, ((apicid << 32) | vector));
}

/*
 * We will set BAU_MISC_CONTROL with a timeout period.
 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
 * So the destination timeout period has to be calculated from them.
 */
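/*
 * Illustrative arithmetic for the UV2/UV3 branch below: if the ack-units
 * bit selects the 80 microsecond base and the multiplier field reads 12,
 * the destination timeout evaluates to 12 * 80 = 960 microseconds.
 */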
static int calculate_destination_timeout(void)
{
	unsigned long mmr_image;
	int mult1;
	int mult2;
	int index;
	int base;
	int ret;
	unsigned long ts_ns;

	if (is_uv1_hub()) {
		mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
		index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
		mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
		mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
		ts_ns = timeout_base_ns[index];
		ts_ns *= (mult1 * mult2);
		ret = ts_ns / 1000;
	} else {
		/* same destination timeout for uv2 and uv3 */
		/* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */
		mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
		mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
		if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
			base = 80;
		else
			base = 10;
		mult1 = mmr_image & UV2_ACK_MASK;
		ret = mult1 * base;
	}
	return ret;
}

static void __init init_per_cpu_tunables(void)
{
	int cpu;
	struct bau_control *bcp;

	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		bcp->baudisabled		= 0;
		if (nobau)
			bcp->nobau		= true;
		bcp->statp			= &per_cpu(ptcstats, cpu);
		/* time interval to catch a hardware stay-busy bug */
		bcp->timeout_interval		= usec_2_cycles(2*timeout_us);
		bcp->max_concurr		= max_concurr;
		bcp->max_concurr_const		= max_concurr;
		bcp->plugged_delay		= plugged_delay;
		bcp->plugsb4reset		= plugsb4reset;
		bcp->timeoutsb4reset		= timeoutsb4reset;
		bcp->ipi_reset_limit		= ipi_reset_limit;
		bcp->complete_threshold		= complete_threshold;
		bcp->cong_response_us		= congested_respns_us;
		bcp->cong_reps			= congested_reps;
		bcp->disabled_period		= sec_2_cycles(disabled_period);
		bcp->giveup_limit		= giveup_limit;
		spin_lock_init(&bcp->queue_lock);
		spin_lock_init(&bcp->uvhub_lock);
		spin_lock_init(&bcp->disable_lock);
	}
}

/*
 * Scan all cpus to collect blade and socket summaries.
 */
static int __init get_cpu_topology(int base_pnode,
					struct uvhub_desc *uvhub_descs,
					unsigned char *uvhub_mask)
{
	int cpu;
	int pnode;
	int uvhub;
	int socket;
	struct bau_control *bcp;
	struct uvhub_desc *bdp;
	struct socket_desc *sdp;

	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);

		memset(bcp, 0, sizeof(struct bau_control));

		pnode = uv_cpu_hub_info(cpu)->pnode;
		if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
			pr_emerg(
				"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
				cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
			return 1;
		}

		bcp->osnode = cpu_to_node(cpu);
		bcp->partition_base_pnode = base_pnode;

		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
		*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
		bdp = &uvhub_descs[uvhub];

		bdp->num_cpus++;
		bdp->uvhub = uvhub;
		bdp->pnode = pnode;

		/* kludge: 'assuming' one node per socket, and assuming that
		   disabling a socket just leaves a gap in node numbers */
		socket = bcp->osnode & 1;
		bdp->socket_mask |= (1 << socket);
		sdp = &bdp->socket[socket];
		sdp->cpu_number[sdp->num_cpus] = cpu;
		sdp->num_cpus++;
		if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
			pr_emerg("%d cpus per socket invalid\n",
				sdp->num_cpus);
			return 1;
		}
	}
	return 0;
}

/*
 * Each socket is to get a local array of pnodes/hubs.
 */
static void make_per_cpu_thp(struct bau_control *smaster)
{
	int cpu;
	size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();

	smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
	memset(smaster->thp, 0, hpsz);
	for_each_present_cpu(cpu) {
		smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
		smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
	}
}

/*
 * Each uvhub is to get a local cpumask.
 */
static void make_per_hub_cpumask(struct bau_control *hmaster)
{
	int sz = sizeof(cpumask_t);

	hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode);
}

/*
 * Initialize all the per_cpu information for the cpu's on a given socket,
 * given what has been gathered into the socket_desc struct.
 * And reports the chosen hub and socket masters back to the caller.
 */
static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
			struct bau_control **smasterp,
			struct bau_control **hmasterp)
{
	int i;
	int cpu;
	struct bau_control *bcp;

	for (i = 0; i < sdp->num_cpus; i++) {
		cpu = sdp->cpu_number[i];
		bcp = &per_cpu(bau_control, cpu);
		bcp->cpu = cpu;
		if (i == 0) {
			*smasterp = bcp;
			if (!(*hmasterp))
				*hmasterp = bcp;
		}
		bcp->cpus_in_uvhub = bdp->num_cpus;
		bcp->cpus_in_socket = sdp->num_cpus;
		bcp->socket_master = *smasterp;
		bcp->uvhub = bdp->uvhub;
		if (is_uv1_hub())
			bcp->uvhub_version = 1;
		else if (is_uv2_hub())
			bcp->uvhub_version = 2;
		else if (is_uv3_hub())
			bcp->uvhub_version = 3;
		else if (is_uv4_hub())
			bcp->uvhub_version = 4;
		else {
			pr_emerg("uvhub version not 1, 2, 3, or 4\n");
			return 1;
		}
		bcp->uvhub_master = *hmasterp;
		bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu);

		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
			pr_emerg("%d cpus per uvhub invalid\n",
				bcp->uvhub_cpu);
			return 1;
		}
	}
	return 0;
}

/*
 * Summarize the blade and socket topology into the per_cpu structures.
 */
static int __init summarize_uvhub_sockets(int nuvhubs,
			struct uvhub_desc *uvhub_descs,
			unsigned char *uvhub_mask)
{
	int socket;
	int uvhub;
	unsigned short socket_mask;

	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
		struct uvhub_desc *bdp;
		struct bau_control *smaster = NULL;
		struct bau_control *hmaster = NULL;

		if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
			continue;

		bdp = &uvhub_descs[uvhub];
		socket_mask = bdp->socket_mask;
		socket = 0;
		while (socket_mask) {
			struct socket_desc *sdp;
			if ((socket_mask & 1)) {
				sdp = &bdp->socket[socket];
				if (scan_sock(sdp, bdp, &smaster, &hmaster))
					return 1;
				make_per_cpu_thp(smaster);
			}
			socket++;
			socket_mask = (socket_mask >> 1);
		}
		make_per_hub_cpumask(hmaster);
	}
	return 0;
}

/*
 * initialize the bau_control structure for each cpu
 */
static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
{
	unsigned char *uvhub_mask;
	void *vp;
	struct uvhub_desc *uvhub_descs;

	if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
		timeout_us = calculate_destination_timeout();

	vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
	uvhub_descs = (struct uvhub_desc *)vp;
	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);

	if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
		goto fail;

	if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
		goto fail;

	kfree(uvhub_descs);
	kfree(uvhub_mask);
	init_per_cpu_tunables();
	return 0;

fail:
	kfree(uvhub_descs);
	kfree(uvhub_mask);
	return 1;
}

/*
 * Initialization of BAU-related structures
 */
static int __init uv_bau_init(void)
{
	int uvhub;
	int pnode;
	int nuvhubs;
	int cur_cpu;
	int cpus;
	int vector;
	cpumask_var_t *mask;

	if (!is_uv_system())
		return 0;

	if (is_uv4_hub())
		ops = uv4_bau_ops;
	else if (is_uv3_hub())
		ops = uv123_bau_ops;
	else if (is_uv2_hub())
		ops = uv123_bau_ops;
	else if (is_uv1_hub())
		ops = uv123_bau_ops;
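	/* UV1-3 share the uv123_bau_ops accessors; UV4 has its own set (uv4_bau_ops) */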

	for_each_possible_cpu(cur_cpu) {
		mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
		zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
	}

	nuvhubs = uv_num_possible_blades();
	congested_cycles = usec_2_cycles(congested_respns_us);

	uv_base_pnode = 0x7fffffff;
	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
		cpus = uv_blade_nr_possible_cpus(uvhub);
		if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
			uv_base_pnode = uv_blade_to_pnode(uvhub);
	}

	/* software timeouts are not supported on UV4 */
	if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
		enable_timeouts();

	if (init_per_cpu(nuvhubs, uv_base_pnode)) {
		set_bau_off();
		nobau_perm = 1;
		return 0;
	}

	vector = UV_BAU_MESSAGE;
	for_each_possible_blade(uvhub) {
		if (uv_blade_nr_possible_cpus(uvhub))
			init_uvhub(uvhub, vector, uv_base_pnode);
	}

	alloc_intr_gate(vector, uv_bau_message_intr1);

	for_each_possible_blade(uvhub) {
		if (uv_blade_nr_possible_cpus(uvhub)) {
			unsigned long val;
			unsigned long mmr;
			pnode = uv_blade_to_pnode(uvhub);
			/* INIT the bau */
			val = 1L << 63;
			write_gmmr_activation(pnode, val);
			mmr = 1; /* should be 1 to broadcast to both sockets */
			if (!is_uv1_hub())
				write_mmr_data_broadcast(pnode, mmr);
		}
	}

	return 0;
}
core_initcall(uv_bau_init);
fs_initcall(uv_ptc_init);