/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
 */

/*
 * Cross Partition Communication (XPC) partition support.
 *
 *	This is the part of XPC that detects the presence/absence of
 *	other partitions. It provides a heartbeat and monitors the
 *	heartbeats of other partitions.
 *
 */

#include <linux/kernel.h>
#include <linux/sysctl.h>
#include <linux/cache.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <asm/uncached.h>
#include <asm/sn/bte.h>
#include <asm/sn/intr.h>
#include <asm/sn/sn_sal.h>
#include <asm/sn/nodepda.h>
#include <asm/sn/addrs.h>
#include "xpc.h"

/* XPC is exiting flag */
int xpc_exiting;

/* SH_IPI_ACCESS shub register value on startup */
static u64 xpc_sh1_IPI_access;
static u64 xpc_sh2_IPI_access0;
static u64 xpc_sh2_IPI_access1;
static u64 xpc_sh2_IPI_access2;
static u64 xpc_sh2_IPI_access3;

/* original protection values for each node */
42
u64 xpc_prot_vec[MAX_NUMNODES];
43

44
/* this partition's reserved page pointers */
45
struct xpc_rsvd_page *xpc_rsvd_page;
46 47
static u64 *xpc_part_nasids;
static u64 *xpc_mach_nasids;
48 49 50
struct xpc_vars *xpc_vars;
struct xpc_vars_part *xpc_vars_part;

51 52 53
static int xp_nasid_mask_bytes;	/* actual size in bytes of nasid mask */
static int xp_nasid_mask_words;	/* actual size in words of nasid mask */

54 55 56 57 58 59 60 61 62
/*
 * For performance reasons, each entry of xpc_partitions[] is cacheline
 * aligned. And xpc_partitions[] is padded with an additional entry at the
 * end so that the last legitimate entry doesn't share its cacheline with
 * another variable.
 */
struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];

/*
63 64 65
 * Generic buffer used to store a local copy of portions of a remote
 * partition's reserved page (either its header and part_nasids mask,
 * or its vars).
66
 */
67 68
char *xpc_remote_copy_buffer;
void *xpc_remote_copy_buffer_base;
69

70 71 72
/*
 * Guarantee that the kmalloc'd memory is cacheline aligned.
 */
73
void *
74 75 76 77
xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
{
	/* see if kmalloc will give us cachline aligned memory by default */
	*base = kmalloc(size, flags);
78
	if (*base == NULL)
79
		return NULL;
80 81

	if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
82
		return *base;
83

84 85 86 87
	kfree(*base);

	/* nope, we'll have to do it ourselves */
	*base = kmalloc(size + L1_CACHE_BYTES, flags);
88
	if (*base == NULL)
89
		return NULL;
90

91
	return (void *)L1_CACHE_ALIGN((u64)*base);
92 93
}

94 95 96 97 98
/*
 * Given a nasid, get the physical address of the  partition's reserved page
 * for that nasid. This function returns 0 on any error.
 */
static u64
99
xpc_get_rsvd_page_pa(int nasid)
100 101 102 103 104 105
{
	bte_result_t bte_res;
	s64 status;
	u64 cookie = 0;
	u64 rp_pa = nasid;	/* seed with nasid */
	u64 len = 0;
106 107 108
	u64 buf = buf;
	u64 buf_len = 0;
	void *buf_base = NULL;
109 110 111 112

	while (1) {

		status = sn_partition_reserved_page_pa(buf, &cookie, &rp_pa,
113
						       &len);
114 115 116 117 118

		dev_dbg(xpc_part, "SAL returned with status=%li, cookie="
			"0x%016lx, address=0x%016lx, len=0x%016lx\n",
			status, cookie, rp_pa, len);

119
		if (status != SALRET_MORE_PASSES)
120 121
			break;

122
		if (L1_CACHE_ALIGN(len) > buf_len) {
123
			kfree(buf_base);
124
			buf_len = L1_CACHE_ALIGN(len);
125 126 127
			buf = (u64)xpc_kmalloc_cacheline_aligned(buf_len,
								 GFP_KERNEL,
								 &buf_base);
128 129 130 131 132 133
			if (buf_base == NULL) {
				dev_err(xpc_part, "unable to kmalloc "
					"len=0x%016lx\n", buf_len);
				status = SALRET_ERROR;
				break;
			}
134 135
		}

136
		bte_res = xp_bte_copy(rp_pa, buf, buf_len,
137
				      (BTE_NOTIFY | BTE_WACQUIRE), NULL);
138 139 140 141 142 143 144
		if (bte_res != BTE_SUCCESS) {
			dev_dbg(xpc_part, "xp_bte_copy failed %i\n", bte_res);
			status = SALRET_ERROR;
			break;
		}
	}

145
	kfree(buf_base);
146

147
	if (status != SALRET_OK)
148
		rp_pa = 0;
149

150 151 152 153 154 155 156 157 158 159 160 161 162 163
	dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa);
	return rp_pa;
}

/*
 * Fill the partition reserved page with the information needed by
 * other partitions to discover we are alive and establish initial
 * communications.
 */
struct xpc_rsvd_page *
xpc_rsvd_page_init(void)
{
	struct xpc_rsvd_page *rp;
	AMO_t *amos_page;
164
	u64 rp_pa, nasid_array = 0;
165 166 167 168
	int i, ret;

	/* get the local reserved page's address */

169 170 171
	preempt_disable();
	rp_pa = xpc_get_rsvd_page_pa(cpuid_to_nasid(smp_processor_id()));
	preempt_enable();
172 173 174 175
	if (rp_pa == 0) {
		dev_err(xpc_part, "SAL failed to locate the reserved page\n");
		return NULL;
	}
176
	rp = (struct xpc_rsvd_page *)__va(rp_pa);
177 178 179 180 181 182 183 184 185

	if (rp->partid != sn_partition_id) {
		dev_err(xpc_part, "the reserved page's partid of %d should be "
			"%d\n", rp->partid, sn_partition_id);
		return NULL;
	}

	rp->version = XPC_RP_VERSION;

186 187 188 189 190 191 192 193 194 195 196 197 198
	/* establish the actual sizes of the nasid masks */
	if (rp->SAL_version == 1) {
		/* SAL_version 1 didn't set the nasids_size field */
		rp->nasids_size = 128;
	}
	xp_nasid_mask_bytes = rp->nasids_size;
	xp_nasid_mask_words = xp_nasid_mask_bytes / 8;

	/* setup the pointers to the various items in the reserved page */
	xpc_part_nasids = XPC_RP_PART_NASIDS(rp);
	xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp);
	xpc_vars = XPC_RP_VARS(rp);
	xpc_vars_part = XPC_RP_VARS_PART(rp);
199 200 201 202 203 204 205 206 207 208 209 210 211

	/*
	 * Before clearing xpc_vars, see if a page of AMOs had been previously
	 * allocated. If not we'll need to allocate one and set permissions
	 * so that cross-partition AMOs are allowed.
	 *
	 * The allocated AMO page needs MCA reporting to remain disabled after
	 * XPC has unloaded.  To make this work, we keep a copy of the pointer
	 * to this page (i.e., amos_page) in the struct xpc_vars structure,
	 * which is pointed to by the reserved page, and re-use that saved copy
	 * on subsequent loads of XPC. This AMO page is never freed, and its
	 * memory protections are never restricted.
	 */
212 213
	amos_page = xpc_vars->amos_page;
	if (amos_page == NULL) {
214
		amos_page = (AMO_t *)TO_AMO(uncached_alloc_page(0, 1));
215 216 217 218 219 220 221 222 223 224
		if (amos_page == NULL) {
			dev_err(xpc_part, "can't allocate page of AMOs\n");
			return NULL;
		}

		/*
		 * Open up AMO-R/W to cpu.  This is done for Shub 1.1 systems
		 * when xpc_allow_IPI_ops() is called via xpc_hb_init().
		 */
		if (!enable_shub_wars_1_1()) {
225 226 227 228
			ret = sn_change_memprotect(ia64_tpa((u64)amos_page),
						   PAGE_SIZE,
						   SN_MEMPROT_ACCESS_CLASS_1,
						   &nasid_array);
229 230 231
			if (ret != 0) {
				dev_err(xpc_part, "can't change memory "
					"protections\n");
J
Jes Sorensen 已提交
232
				uncached_free_page(__IA64_UNCACHED_OFFSET |
233
						   TO_PHYS((u64)amos_page), 1);
234 235 236
				return NULL;
			}
		}
237
	} else if (!IS_AMO_ADDRESS((u64)amos_page)) {
238 239 240 241 242 243
		/*
		 * EFI's XPBOOT can also set amos_page in the reserved page,
		 * but it happens to leave it as an uncached physical address
		 * and we need it to be an uncached virtual, so we'll have to
		 * convert it.
		 */
244
		if (!IS_AMO_PHYS_ADDRESS((u64)amos_page)) {
245
			dev_err(xpc_part, "previously used amos_page address "
246
				"is bad = 0x%p\n", (void *)amos_page);
247 248
			return NULL;
		}
249
		amos_page = (AMO_t *)TO_AMO((u64)amos_page);
250 251
	}

252
	/* clear xpc_vars */
253 254 255 256 257
	memset(xpc_vars, 0, sizeof(struct xpc_vars));

	xpc_vars->version = XPC_V_VERSION;
	xpc_vars->act_nasid = cpuid_to_nasid(0);
	xpc_vars->act_phys_cpuid = cpu_physical_id(0);
258
	xpc_vars->vars_part_pa = __pa(xpc_vars_part);
259 260
	xpc_vars->amos_page_pa = ia64_tpa((u64)amos_page);
	xpc_vars->amos_page = amos_page;	/* save for next load of XPC */
261

262
	/* clear xpc_vars_part */
263 264
	memset((u64 *)xpc_vars_part, 0, sizeof(struct xpc_vars_part) *
	       XP_MAX_PARTITIONS);
265

266
	/* initialize the activate IRQ related AMO variables */
267
	for (i = 0; i < xp_nasid_mask_words; i++)
268
		(void)xpc_IPI_init(XPC_ACTIVATE_IRQ_AMOS + i);
269 270

	/* initialize the engaged remote partitions related AMO variables */
271 272
	(void)xpc_IPI_init(XPC_ENGAGED_PARTITIONS_AMO);
	(void)xpc_IPI_init(XPC_DISENGAGE_REQUEST_AMO);
273

274
	/* timestamp of when reserved page was setup by XPC */
275
	rp->stamp = CURRENT_TIME;
276 277 278 279 280

	/*
	 * This signifies to the remote partition that our reserved
	 * page is initialized.
	 */
281
	rp->vars_pa = __pa(xpc_vars);
282 283 284 285 286 287 288 289 290 291 292 293 294 295

	return rp;
}

/*
 * Change protections to allow IPI operations (and AMO operations on
 * Shub 1.1 systems).
 */
void
xpc_allow_IPI_ops(void)
{
	int node;
	int nasid;

296
	/* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
297 298 299

	if (is_shub2()) {
		xpc_sh2_IPI_access0 =
300
		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS0));
301
		xpc_sh2_IPI_access1 =
302
		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS1));
303
		xpc_sh2_IPI_access2 =
304
		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS2));
305
		xpc_sh2_IPI_access3 =
306
		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS3));
307 308 309

		for_each_online_node(node) {
			nasid = cnodeid_to_nasid(node);
310 311 312 313 314 315 316 317
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
			      -1UL);
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
			      -1UL);
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
			      -1UL);
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
			      -1UL);
318 319 320 321
		}

	} else {
		xpc_sh1_IPI_access =
322
		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH1_IPI_ACCESS));
323 324 325

		for_each_online_node(node) {
			nasid = cnodeid_to_nasid(node);
326 327
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
			      -1UL);
328 329 330 331 332 333 334

			/*
			 * Since the BIST collides with memory operations on
			 * SHUB 1.1 sn_change_memprotect() cannot be used.
			 */
			if (enable_shub_wars_1_1()) {
				/* open up everything */
335 336 337
				xpc_prot_vec[node] = (u64)HUB_L((u64 *)
								GLOBAL_MMR_ADDR
								(nasid,
338
						  SH1_MD_DQLP_MMR_DIR_PRIVEC0));
339 340
				HUB_S((u64 *)
				      GLOBAL_MMR_ADDR(nasid,
341
						   SH1_MD_DQLP_MMR_DIR_PRIVEC0),
342 343 344
				      -1UL);
				HUB_S((u64 *)
				      GLOBAL_MMR_ADDR(nasid,
345
						   SH1_MD_DQRP_MMR_DIR_PRIVEC0),
346
				      -1UL);
347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
			}
		}
	}
}

/*
 * Restrict protections to disallow IPI operations (and AMO operations on
 * Shub 1.1 systems).
 */
void
xpc_restrict_IPI_ops(void)
{
	int node;
	int nasid;

362
	/* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
363 364 365 366 367

	if (is_shub2()) {

		for_each_online_node(node) {
			nasid = cnodeid_to_nasid(node);
368 369 370 371 372 373 374 375
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
			      xpc_sh2_IPI_access0);
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
			      xpc_sh2_IPI_access1);
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
			      xpc_sh2_IPI_access2);
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
			      xpc_sh2_IPI_access3);
376 377 378 379 380 381
		}

	} else {

		for_each_online_node(node) {
			nasid = cnodeid_to_nasid(node);
382 383
			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
			      xpc_sh1_IPI_access);
384 385

			if (enable_shub_wars_1_1()) {
386
				HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
387
						   SH1_MD_DQLP_MMR_DIR_PRIVEC0),
388 389
				      xpc_prot_vec[node]);
				HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
390
						   SH1_MD_DQRP_MMR_DIR_PRIVEC0),
391
				      xpc_prot_vec[node]);
392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
			}
		}
	}
}

/*
 * At periodic intervals, scan through all active partitions and ensure
 * their heartbeat is still active.  If not, the partition is deactivated.
 */
void
xpc_check_remote_hb(void)
{
	struct xpc_vars *remote_vars;
	struct xpc_partition *part;
	partid_t partid;
	bte_result_t bres;

409
	remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
410 411

	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
412

413
		if (xpc_exiting)
414 415
			break;

416
		if (partid == sn_partition_id)
417 418 419 420 421
			continue;

		part = &xpc_partitions[partid];

		if (part->act_state == XPC_P_INACTIVE ||
422
		    part->act_state == XPC_P_DEACTIVATING) {
423 424 425 426 427
			continue;
		}

		/* pull the remote_hb cache line */
		bres = xp_bte_copy(part->remote_vars_pa,
428 429 430
				   (u64)remote_vars,
				   XPC_RP_VARS_SIZE,
				   (BTE_NOTIFY | BTE_WACQUIRE), NULL);
431 432
		if (bres != BTE_SUCCESS) {
			XPC_DEACTIVATE_PARTITION(part,
433
						 xpc_map_bte_errors(bres));
434 435 436 437
			continue;
		}

		dev_dbg(xpc_part, "partid = %d, heartbeat = %ld, last_heartbeat"
438 439 440
			" = %ld, heartbeat_offline = %ld, HB_mask = 0x%lx\n",
			partid, remote_vars->heartbeat, part->last_heartbeat,
			remote_vars->heartbeat_offline,
441 442 443
			remote_vars->heartbeating_to_mask);

		if (((remote_vars->heartbeat == part->last_heartbeat) &&
444 445
		     (remote_vars->heartbeat_offline == 0)) ||
		    !xpc_hb_allowed(sn_partition_id, remote_vars)) {
446

447
			XPC_DEACTIVATE_PARTITION(part, xpNoHeartbeat);
448 449 450 451 452 453 454 455
			continue;
		}

		part->last_heartbeat = remote_vars->heartbeat;
	}
}

/*
456
 * Get a copy of a portion of the remote partition's rsvd page.
457 458
 *
 * remote_rp points to a buffer that is cacheline aligned for BTE copies and
459 460
 * is large enough to contain a copy of their reserved page header and
 * part_nasids mask.
461
 */
462
static enum xp_retval
463
xpc_get_remote_rp(int nasid, u64 *discovered_nasids,
464
		  struct xpc_rsvd_page *remote_rp, u64 *remote_rp_pa)
465 466 467 468 469
{
	int bres, i;

	/* get the reserved page's physical address */

470
	*remote_rp_pa = xpc_get_rsvd_page_pa(nasid);
471
	if (*remote_rp_pa == 0)
472
		return xpNoRsvdPageAddr;
473

474
	/* pull over the reserved page header and part_nasids mask */
475 476 477
	bres = xp_bte_copy(*remote_rp_pa, (u64)remote_rp,
			   XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes,
			   (BTE_NOTIFY | BTE_WACQUIRE), NULL);
478
	if (bres != BTE_SUCCESS)
479 480 481
		return xpc_map_bte_errors(bres);

	if (discovered_nasids != NULL) {
482 483
		u64 *remote_part_nasids = XPC_RP_PART_NASIDS(remote_rp);

484
		for (i = 0; i < xp_nasid_mask_words; i++)
485
			discovered_nasids[i] |= remote_part_nasids[i];
486 487 488 489 490
	}

	/* check that the partid is for another partition */

	if (remote_rp->partid < 1 ||
491
	    remote_rp->partid > (XP_MAX_PARTITIONS - 1)) {
492
		return xpInvalidPartid;
493 494
	}

495
	if (remote_rp->partid == sn_partition_id)
496
		return xpLocalPartid;
497 498

	if (XPC_VERSION_MAJOR(remote_rp->version) !=
499
	    XPC_VERSION_MAJOR(XPC_RP_VERSION)) {
500
		return xpBadVersion;
501 502
	}

503
	return xpSuccess;
504 505 506
}

/*
507
 * Get a copy of the remote partition's XPC variables from the reserved page.
508 509
 *
 * remote_vars points to a buffer that is cacheline aligned for BTE copies and
510
 * assumed to be of size XPC_RP_VARS_SIZE.
511
 */
512
static enum xp_retval
513 514 515 516
xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars)
{
	int bres;

517
	if (remote_vars_pa == 0)
518
		return xpVarsNotSet;
519 520

	/* pull over the cross partition variables */
521 522
	bres = xp_bte_copy(remote_vars_pa, (u64)remote_vars, XPC_RP_VARS_SIZE,
			   (BTE_NOTIFY | BTE_WACQUIRE), NULL);
523
	if (bres != BTE_SUCCESS)
524 525 526
		return xpc_map_bte_errors(bres);

	if (XPC_VERSION_MAJOR(remote_vars->version) !=
527
	    XPC_VERSION_MAJOR(XPC_V_VERSION)) {
528
		return xpBadVersion;
529 530
	}

531
	return xpSuccess;
532 533 534
}

/*
 * Update the remote partition's info.
 *
 * Records in 'part' what was just learned about the remote partition (from
 * its reserved page header and its XPC variables) and emits one debug line
 * per recorded field.
 */
static void
xpc_update_partition_info(struct xpc_partition *part, u8 remote_rp_version,
			  struct timespec *remote_rp_stamp, u64 remote_rp_pa,
			  u64 remote_vars_pa, struct xpc_vars *remote_vars)
{
	/* values taken from the remote reserved page header */
	part->remote_rp_version = remote_rp_version;
	part->remote_rp_stamp = *remote_rp_stamp;
	part->remote_rp_pa = remote_rp_pa;
	part->remote_vars_pa = remote_vars_pa;

	/* values taken from the remote XPC variables */
	part->last_heartbeat = remote_vars->heartbeat;
	part->remote_vars_part_pa = remote_vars->vars_part_pa;
	part->remote_act_nasid = remote_vars->act_nasid;
	part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid;
	part->remote_amos_page_pa = remote_vars->amos_page_pa;
	part->remote_vars_version = remote_vars->version;

	/* report everything that was just stored */
	dev_dbg(xpc_part, "  remote_rp_version = 0x%016x\n",
		part->remote_rp_version);
	dev_dbg(xpc_part, "  remote_rp_stamp (tv_sec = 0x%lx tv_nsec = 0x%lx\n",
		part->remote_rp_stamp.tv_sec, part->remote_rp_stamp.tv_nsec);
	dev_dbg(xpc_part, "  remote_rp_pa = 0x%016lx\n", part->remote_rp_pa);
	dev_dbg(xpc_part, "  remote_vars_pa = 0x%016lx\n",
		part->remote_vars_pa);
	dev_dbg(xpc_part, "  last_heartbeat = 0x%016lx\n",
		part->last_heartbeat);
	dev_dbg(xpc_part, "  remote_vars_part_pa = 0x%016lx\n",
		part->remote_vars_part_pa);
	dev_dbg(xpc_part, "  remote_act_nasid = 0x%x\n",
		part->remote_act_nasid);
	dev_dbg(xpc_part, "  remote_act_phys_cpuid = 0x%x\n",
		part->remote_act_phys_cpuid);
	dev_dbg(xpc_part, "  remote_amos_page_pa = 0x%lx\n",
		part->remote_amos_page_pa);
	dev_dbg(xpc_part, "  remote_vars_version = 0x%x\n",
		part->remote_vars_version);
}

582
/*
583
 * Prior code has determined the nasid which generated an IPI.  Inspect
584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600
 * that nasid to determine if its partition needs to be activated or
 * deactivated.
 *
 * A partition is consider "awaiting activation" if our partition
 * flags indicate it is not active and it has a heartbeat.  A
 * partition is considered "awaiting deactivation" if our partition
 * flags indicate it is active but it has no heartbeat or it is not
 * sending its heartbeat to us.
 *
 * To determine the heartbeat, the remote nasid must have a properly
 * initialized reserved page.
 */
static void
xpc_identify_act_IRQ_req(int nasid)
{
	struct xpc_rsvd_page *remote_rp;
	struct xpc_vars *remote_vars;
601
	u64 remote_rp_pa;
602
	u64 remote_vars_pa;
603 604 605 606
	int remote_rp_version;
	int reactivate = 0;
	int stamp_diff;
	struct timespec remote_rp_stamp = { 0, 0 };
607 608
	partid_t partid;
	struct xpc_partition *part;
609
	enum xp_retval ret;
610 611 612

	/* pull over the reserved page structure */

613
	remote_rp = (struct xpc_rsvd_page *)xpc_remote_copy_buffer;
614

615
	ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa);
616
	if (ret != xpSuccess) {
617
		dev_warn(xpc_part, "unable to get reserved page from nasid %d, "
618
			 "which sent interrupt, reason=%d\n", nasid, ret);
619 620 621 622
		return;
	}

	remote_vars_pa = remote_rp->vars_pa;
623
	remote_rp_version = remote_rp->version;
624
	if (XPC_SUPPORTS_RP_STAMP(remote_rp_version))
625
		remote_rp_stamp = remote_rp->stamp;
626

627 628 629 630 631
	partid = remote_rp->partid;
	part = &xpc_partitions[partid];

	/* pull over the cross partition variables */

632
	remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
633 634

	ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
635
	if (ret != xpSuccess) {
636 637

		dev_warn(xpc_part, "unable to get XPC variables from nasid %d, "
638
			 "which sent interrupt, reason=%d\n", nasid, ret);
639 640 641 642 643 644 645 646

		XPC_DEACTIVATE_PARTITION(part, ret);
		return;
	}

	part->act_IRQ_rcvd++;

	dev_dbg(xpc_part, "partid for nasid %d is %d; IRQs = %d; HB = "
647
		"%ld:0x%lx\n", (int)nasid, (int)partid, part->act_IRQ_rcvd,
648 649
		remote_vars->heartbeat, remote_vars->heartbeating_to_mask);

650 651
	if (xpc_partition_disengaged(part) &&
	    part->act_state == XPC_P_INACTIVE) {
652

653
		xpc_update_partition_info(part, remote_rp_version,
654 655
					  &remote_rp_stamp, remote_rp_pa,
					  remote_vars_pa, remote_vars);
656

657 658 659 660 661 662 663 664 665 666 667 668
		if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
			if (xpc_partition_disengage_requested(1UL << partid)) {
				/*
				 * Other side is waiting on us to disengage,
				 * even though we already have.
				 */
				return;
			}
		} else {
			/* other side doesn't support disengage requests */
			xpc_clear_partition_disengage_request(1UL << partid);
		}
669

670 671 672
		xpc_activate_partition(part);
		return;
	}
673

674 675 676 677 678
	DBUG_ON(part->remote_rp_version == 0);
	DBUG_ON(part->remote_vars_version == 0);

	if (!XPC_SUPPORTS_RP_STAMP(part->remote_rp_version)) {
		DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(part->
679
						       remote_vars_version));
680 681 682

		if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
			DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
683
							       version));
684 685
			/* see if the other side rebooted */
			if (part->remote_amos_page_pa ==
686 687
			    remote_vars->amos_page_pa &&
			    xpc_hb_allowed(sn_partition_id, remote_vars)) {
688 689 690 691
				/* doesn't look that way, so ignore the IPI */
				return;
			}
		}
692

693 694 695 696
		/*
		 * Other side rebooted and previous XPC didn't support the
		 * disengage request, so we don't need to do anything special.
		 */
697

698
		xpc_update_partition_info(part, remote_rp_version,
699 700
					  &remote_rp_stamp, remote_rp_pa,
					  remote_vars_pa, remote_vars);
701
		part->reactivate_nasid = nasid;
702
		XPC_DEACTIVATE_PARTITION(part, xpReactivating);
703 704
		return;
	}
705

706
	DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version));
707

708 709
	if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
		DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
710

711 712 713 714
		/*
		 * Other side rebooted and previous XPC did support the
		 * disengage request, but the new one doesn't.
		 */
715

716 717
		xpc_clear_partition_engaged(1UL << partid);
		xpc_clear_partition_disengage_request(1UL << partid);
718

719
		xpc_update_partition_info(part, remote_rp_version,
720 721
					  &remote_rp_stamp, remote_rp_pa,
					  remote_vars_pa, remote_vars);
722 723 724 725
		reactivate = 1;

	} else {
		DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
726

727
		stamp_diff = xpc_compare_stamps(&part->remote_rp_stamp,
728
						&remote_rp_stamp);
729 730 731 732 733 734 735 736 737 738
		if (stamp_diff != 0) {
			DBUG_ON(stamp_diff >= 0);

			/*
			 * Other side rebooted and the previous XPC did support
			 * the disengage request, as does the new one.
			 */

			DBUG_ON(xpc_partition_engaged(1UL << partid));
			DBUG_ON(xpc_partition_disengage_requested(1UL <<
739
								  partid));
740 741

			xpc_update_partition_info(part, remote_rp_version,
742 743 744
						  &remote_rp_stamp,
						  remote_rp_pa, remote_vars_pa,
						  remote_vars);
745 746 747 748
			reactivate = 1;
		}
	}

749
	if (part->disengage_request_timeout > 0 &&
750
	    !xpc_partition_disengaged(part)) {
751 752 753 754 755
		/* still waiting on other side to disengage from us */
		return;
	}

	if (reactivate) {
756
		part->reactivate_nasid = nasid;
757
		XPC_DEACTIVATE_PARTITION(part, xpReactivating);
758 759

	} else if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version) &&
760
		   xpc_partition_disengage_requested(1UL << partid)) {
761
		XPC_DEACTIVATE_PARTITION(part, xpOtherGoingDown);
762 763 764 765 766 767 768 769 770 771 772 773 774 775 776
	}
}

/*
 * Loop through the activation AMO variables and process any bits
 * which are set.  Each bit indicates a nasid sending a partition
 * activation or deactivation request.
 *
 * Return #of IRQs detected.
 */
int
xpc_identify_act_IRQ_sender(void)
{
	int word, bit;
	u64 nasid_mask;
777
	u64 nasid;		/* remote nasid */
778 779 780
	int n_IRQs_detected = 0;
	AMO_t *act_amos;

781
	act_amos = xpc_vars->amos_page + XPC_ACTIVATE_IRQ_AMOS;
782 783

	/* scan through act AMO variable looking for non-zero entries */
784
	for (word = 0; word < xp_nasid_mask_words; word++) {
785

786
		if (xpc_exiting)
787
			break;
788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803

		nasid_mask = xpc_IPI_receive(&act_amos[word]);
		if (nasid_mask == 0) {
			/* no IRQs from nasids in this variable */
			continue;
		}

		dev_dbg(xpc_part, "AMO[%d] gave back 0x%lx\n", word,
			nasid_mask);

		/*
		 * If this nasid has been added to the machine since
		 * our partition was reset, this will retain the
		 * remote nasid in our reserved pages machine mask.
		 * This is used in the event of module reload.
		 */
804
		xpc_mach_nasids[word] |= nasid_mask;
805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820

		/* locate the nasid(s) which sent interrupts */

		for (bit = 0; bit < (8 * sizeof(u64)); bit++) {
			if (nasid_mask & (1UL << bit)) {
				n_IRQs_detected++;
				nasid = XPC_NASID_FROM_W_B(word, bit);
				dev_dbg(xpc_part, "interrupt from nasid %ld\n",
					nasid);
				xpc_identify_act_IRQ_req(nasid);
			}
		}
	}
	return n_IRQs_detected;
}

821 822 823 824 825 826 827 828 829 830 831 832 833
/*
 * See if the other side has responded to a partition disengage request
 * from us.
 */
int
xpc_partition_disengaged(struct xpc_partition *part)
{
	partid_t partid = XPC_PARTID(part);
	int disengaged;

	disengaged = (xpc_partition_engaged(1UL << partid) == 0);
	if (part->disengage_request_timeout) {
		if (!disengaged) {
834 835
			if (time_before(jiffies,
			    part->disengage_request_timeout)) {
836 837 838 839 840 841 842 843 844
				/* timelimit hasn't been reached yet */
				return 0;
			}

			/*
			 * Other side hasn't responded to our disengage
			 * request in a timely fashion, so assume it's dead.
			 */

845
			dev_info(xpc_part, "disengage from remote partition %d "
846
				 "timed out\n", partid);
847
			xpc_disengage_request_timedout = 1;
848 849 850 851 852 853 854 855
			xpc_clear_partition_engaged(1UL << partid);
			disengaged = 1;
		}
		part->disengage_request_timeout = 0;

		/* cancel the timer function, provided it's not us */
		if (!in_interrupt()) {
			del_singleshot_timer_sync(&part->
856
						  disengage_request_timer);
857 858 859
		}

		DBUG_ON(part->act_state != XPC_P_DEACTIVATING &&
860
			part->act_state != XPC_P_INACTIVE);
861
		if (part->act_state != XPC_P_INACTIVE)
862 863
			xpc_wakeup_channel_mgr(part);

864
		if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version))
865 866 867 868 869
			xpc_cancel_partition_disengage_request(part);
	}
	return disengaged;
}

870 871 872
/*
 * Mark specified partition as active.
 */
873
enum xp_retval
874 875 876
xpc_mark_partition_active(struct xpc_partition *part)
{
	unsigned long irq_flags;
877
	enum xp_retval ret;
878 879 880 881 882 883

	dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part));

	spin_lock_irqsave(&part->act_lock, irq_flags);
	if (part->act_state == XPC_P_ACTIVATING) {
		part->act_state = XPC_P_ACTIVE;
884
		ret = xpSuccess;
885
	} else {
886
		DBUG_ON(part->reason == xpSuccess);
887 888 889 890 891 892 893 894 895 896 897 898
		ret = part->reason;
	}
	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	return ret;
}

/*
 * Notify XPC that the partition is down.
 */
void
xpc_deactivate_partition(const int line, struct xpc_partition *part,
899
			 enum xp_retval reason)
900 901 902 903 904 905 906 907
{
	unsigned long irq_flags;

	spin_lock_irqsave(&part->act_lock, irq_flags);

	if (part->act_state == XPC_P_INACTIVE) {
		XPC_SET_REASON(part, reason, line);
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
908
		if (reason == xpReactivating) {
909 910 911 912 913 914
			/* we interrupt ourselves to reactivate partition */
			xpc_IPI_send_reactivate(part);
		}
		return;
	}
	if (part->act_state == XPC_P_DEACTIVATING) {
915 916
		if ((part->reason == xpUnloading && reason != xpUnloading) ||
		    reason == xpReactivating) {
917 918 919 920 921 922 923 924 925 926 927
			XPC_SET_REASON(part, reason, line);
		}
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
		return;
	}

	part->act_state = XPC_P_DEACTIVATING;
	XPC_SET_REASON(part, reason, line);

	spin_unlock_irqrestore(&part->act_lock, irq_flags);

928 929 930
	if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
		xpc_request_partition_disengage(part);
		xpc_IPI_send_disengage(part);
931

932 933
		/* set a timelimit on the disengage request */
		part->disengage_request_timeout = jiffies +
934
		    (xpc_disengage_request_timelimit * HZ);
935
		part->disengage_request_timer.expires =
936
		    part->disengage_request_timeout;
937 938
		add_timer(&part->disengage_request_timer);
	}
939

940 941
	dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n",
		XPC_PARTID(part), reason);
942

943
	xpc_partition_going_down(part, reason);
944 945 946
}

/*
 * Mark specified partition as inactive.
 *
 * The state change is made under part->act_lock; the remembered physical
 * address of the remote reserved page is cleared afterwards, outside of
 * the lock.
 */
void
xpc_mark_partition_inactive(struct xpc_partition *part)
{
	unsigned long flags;

	dev_dbg(xpc_part, "setting partition %d to INACTIVE\n",
		XPC_PARTID(part));

	spin_lock_irqsave(&part->act_lock, flags);
	part->act_state = XPC_P_INACTIVE;
	spin_unlock_irqrestore(&part->act_lock, flags);

	part->remote_rp_pa = 0;
}

/*
 * SAL has provided a partition and machine mask.  The partition mask
 * contains a bit for each even nasid in our partition.  The machine
 * mask contains a bit for each even nasid in the entire machine.
 *
 * Using those two bit arrays, we can determine which nasids are
 * known in the machine.  Each should also have a reserved page
 * initialized if they are available for partitioning.
 */
void
xpc_discovery(void)
{
	void *remote_rp_base;
	struct xpc_rsvd_page *remote_rp;
	struct xpc_vars *remote_vars;
978
	u64 remote_rp_pa;
979 980
	u64 remote_vars_pa;
	int region;
981
	int region_size;
982 983 984 985 986 987
	int max_regions;
	int nasid;
	struct xpc_rsvd_page *rp;
	partid_t partid;
	struct xpc_partition *part;
	u64 *discovered_nasids;
988
	enum xp_retval ret;
989

990
	remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE +
991 992
						  xp_nasid_mask_bytes,
						  GFP_KERNEL, &remote_rp_base);
993
	if (remote_rp == NULL)
994
		return;
995

996
	remote_vars = (struct xpc_vars *)remote_rp;
997

998
	discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words,
999
				    GFP_KERNEL);
1000 1001 1002 1003 1004
	if (discovered_nasids == NULL) {
		kfree(remote_rp_base);
		return;
	}

1005
	rp = (struct xpc_rsvd_page *)xpc_rsvd_page;
1006 1007 1008 1009 1010 1011

	/*
	 * The term 'region' in this context refers to the minimum number of
	 * nodes that can comprise an access protection grouping. The access
	 * protection is in regards to memory, IOI and IPI.
	 */
1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024
	max_regions = 64;
	region_size = sn_region_size;

	switch (region_size) {
	case 128:
		max_regions *= 2;
	case 64:
		max_regions *= 2;
	case 32:
		max_regions *= 2;
		region_size = 16;
		DBUG_ON(!is_shub2());
	}
1025 1026 1027

	for (region = 0; region < max_regions; region++) {

1028
		if (xpc_exiting)
1029 1030 1031 1032
			break;

		dev_dbg(xpc_part, "searching region %d\n", region);

1033
		for (nasid = (region * region_size * 2);
1034
		     nasid < ((region + 1) * region_size * 2); nasid += 2) {
1035

1036
			if (xpc_exiting)
1037 1038 1039 1040
				break;

			dev_dbg(xpc_part, "checking nasid %d\n", nasid);

1041
			if (XPC_NASID_IN_ARRAY(nasid, xpc_part_nasids)) {
1042 1043 1044 1045 1046 1047
				dev_dbg(xpc_part, "PROM indicates Nasid %d is "
					"part of the local partition; skipping "
					"region\n", nasid);
				break;
			}

1048
			if (!(XPC_NASID_IN_ARRAY(nasid, xpc_mach_nasids))) {
1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064
				dev_dbg(xpc_part, "PROM indicates Nasid %d was "
					"not on Numa-Link network at reset\n",
					nasid);
				continue;
			}

			if (XPC_NASID_IN_ARRAY(nasid, discovered_nasids)) {
				dev_dbg(xpc_part, "Nasid %d is part of a "
					"partition which was previously "
					"discovered\n", nasid);
				continue;
			}

			/* pull over the reserved page structure */

			ret = xpc_get_remote_rp(nasid, discovered_nasids,
1065
						remote_rp, &remote_rp_pa);
1066
			if (ret != xpSuccess) {
1067 1068 1069 1070
				dev_dbg(xpc_part, "unable to get reserved page "
					"from nasid %d, reason=%d\n", nasid,
					ret);

1071
				if (ret == xpLocalPartid)
1072
					break;
1073

1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084
				continue;
			}

			remote_vars_pa = remote_rp->vars_pa;

			partid = remote_rp->partid;
			part = &xpc_partitions[partid];

			/* pull over the cross partition variables */

			ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
1085
			if (ret != xpSuccess) {
1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111
				dev_dbg(xpc_part, "unable to get XPC variables "
					"from nasid %d, reason=%d\n", nasid,
					ret);

				XPC_DEACTIVATE_PARTITION(part, ret);
				continue;
			}

			if (part->act_state != XPC_P_INACTIVE) {
				dev_dbg(xpc_part, "partition %d on nasid %d is "
					"already activating\n", partid, nasid);
				break;
			}

			/*
			 * Register the remote partition's AMOs with SAL so it
			 * can handle and cleanup errors within that address
			 * range should the remote partition go down. We don't
			 * unregister this range because it is difficult to
			 * tell when outstanding writes to the remote partition
			 * are finished and thus when it is thus safe to
			 * unregister. This should not result in wasted space
			 * in the SAL xp_addr_region table because we should
			 * get the same page for remote_act_amos_pa after
			 * module reloads and system reboots.
			 */
1112 1113 1114 1115
			if (sn_register_xp_addr_region
			    (remote_vars->amos_page_pa, PAGE_SIZE, 1) < 0) {
				dev_dbg(xpc_part,
					"partition %d failed to "
1116 1117 1118
					"register xp_addr region 0x%016lx\n",
					partid, remote_vars->amos_page_pa);

1119
				XPC_SET_REASON(part, xpPhysAddrRegFailed,
1120
					       __LINE__);
1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134
				break;
			}

			/*
			 * The remote nasid is valid and available.
			 * Send an interrupt to that nasid to notify
			 * it that we are ready to begin activation.
			 */
			dev_dbg(xpc_part, "sending an interrupt to AMO 0x%lx, "
				"nasid %d, phys_cpuid 0x%x\n",
				remote_vars->amos_page_pa,
				remote_vars->act_nasid,
				remote_vars->act_phys_cpuid);

1135
			if (XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
1136
							   version)) {
1137
				part->remote_amos_page_pa =
1138
				    remote_vars->amos_page_pa;
1139 1140 1141
				xpc_mark_partition_disengaged(part);
				xpc_cancel_partition_disengage_request(part);
			}
1142 1143 1144 1145 1146 1147 1148 1149 1150 1151
			xpc_IPI_send_activate(remote_vars);
		}
	}

	kfree(discovered_nasids);
	kfree(remote_rp_base);
}

/*
 * Given a partid, get the nasids owned by that partition from the
1152
 * remote partition's reserved page.
1153
 */
1154
enum xp_retval
1155 1156 1157 1158 1159 1160 1161
xpc_initiate_partid_to_nasids(partid_t partid, void *nasid_mask)
{
	struct xpc_partition *part;
	u64 part_nasid_pa;
	int bte_res;

	part = &xpc_partitions[partid];
1162
	if (part->remote_rp_pa == 0)
1163
		return xpPartitionDown;
1164

1165 1166
	memset(nasid_mask, 0, XP_NASID_MASK_BYTES);

1167
	part_nasid_pa = (u64)XPC_RP_PART_NASIDS(part->remote_rp_pa);
1168

1169 1170 1171
	bte_res = xp_bte_copy(part_nasid_pa, (u64)nasid_mask,
			      xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE),
			      NULL);
1172 1173 1174

	return xpc_map_bte_errors(bte_res);
}