/*
 * SN Platform GRU Driver
 *
 *              FAULT HANDLER FOR GRU DETECTED TLB MISSES
 *
 * This file contains code that handles TLB misses within the GRU.
 * These misses are reported either via interrupts or user polling of
 * the user CB.
 *
 *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/uaccess.h>
#include <linux/security.h>
#include <linux/prefetch.h>
#include <asm/pgtable.h>
#include "gru.h"
#include "grutables.h"
#include "grulib.h"
#include "gru_instructions.h"
#include <asm/uv/uv_hub.h>

/* Return codes for vtop functions */
#define VTOP_SUCCESS               0
#define VTOP_INVALID               -1
#define VTOP_RETRY                 -2


/*
 * Test if a physical address is a valid GRU GSEG address
 */
static inline int is_gru_paddr(unsigned long paddr)
{
	return paddr >= gru_start_paddr && paddr < gru_end_paddr;
}

/*
 * Find the vma of a GRU segment. Caller must hold mmap_sem.
 */
struct vm_area_struct *gru_find_vma(unsigned long vaddr)
{
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, vaddr);
	if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops)
		return vma;
	return NULL;
}

/*
 * Find and lock the gts that contains the specified user vaddr.
 *
 * Returns:
 * 	- *gts with the mmap_sem locked for read and the GTS locked.
 *	- NULL if vaddr invalid OR is not a valid GSEG vaddr.
 */

static struct gru_thread_state *gru_find_lock_gts(unsigned long vaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct gru_thread_state *gts = NULL;

	down_read(&mm->mmap_sem);
	vma = gru_find_vma(vaddr);
	if (vma)
		gts = gru_find_thread_state(vma, TSID(vaddr, vma));
	if (gts)
		mutex_lock(&gts->ts_ctxlock);
	else
		up_read(&mm->mmap_sem);
	return gts;
}

static struct gru_thread_state *gru_alloc_locked_gts(unsigned long vaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct gru_thread_state *gts = ERR_PTR(-EINVAL);

	down_write(&mm->mmap_sem);
	vma = gru_find_vma(vaddr);
	if (!vma)
		goto err;

	gts = gru_alloc_thread_state(vma, TSID(vaddr, vma));
	if (IS_ERR(gts))
		goto err;
	mutex_lock(&gts->ts_ctxlock);
	downgrade_write(&mm->mmap_sem);
	return gts;

err:
	up_write(&mm->mmap_sem);
	return gts;
}

/*
 * Unlock a GTS that was previously locked with gru_find_lock_gts().
 */
static void gru_unlock_gts(struct gru_thread_state *gts)
{
	mutex_unlock(&gts->ts_ctxlock);
	up_read(&current->mm->mmap_sem);
}

/*
 * Set a CB.istatus to active using a user virtual address. This must be done
 * just prior to a TFH RESTART. The new cb.istatus is an in-cache status ONLY.
 * If the line is evicted, the status may be lost. The in-cache update
 * is necessary to prevent the user from seeing a stale cb.istatus that will
 * change as soon as the TFH restart is complete. Races may cause an
 * occasional failure to clear the cb.istatus, but that is ok.
 */
static void gru_cb_set_istatus_active(struct gru_instruction_bits *cbk)
{
	if (cbk) {
		cbk->istatus = CBS_ACTIVE;
	}
}

/*
 * Read & clear a TFM
 *
 * The GRU has an array of fault maps. A map is private to a cpu.
 * Only one cpu will be accessing a cpu's fault map.
 *
 * This function scans the cpu-private fault map & clears all bits that
 * are set. The function returns a bitmap that indicates the bits that
 * were cleared. Note that since the maps may be updated asynchronously by
 * the GRU, atomic operations must be used to clear bits.
 */
static void get_clear_fault_map(struct gru_state *gru,
				struct gru_tlb_fault_map *imap,
				struct gru_tlb_fault_map *dmap)
{
	unsigned long i, k;
	struct gru_tlb_fault_map *tfm;

	tfm = get_tfm_for_cpu(gru, gru_cpu_fault_map_id());
	prefetchw(tfm);		/* Helps on hardware, required for emulator */
	for (i = 0; i < BITS_TO_LONGS(GRU_NUM_CBE); i++) {
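		/* Test before the xchg to avoid an atomic op when no bits are set */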
		k = tfm->fault_bits[i];
		if (k)
			k = xchg(&tfm->fault_bits[i], 0UL);
		imap->fault_bits[i] = k;
		k = tfm->done_bits[i];
		if (k)
			k = xchg(&tfm->done_bits[i], 0UL);
		dmap->fault_bits[i] = k;
	}

	/*
	 * Not functionally required but helps performance. (Required
	 * on emulator)
	 */
	gru_flush_cache(tfm);
}

/*
 * Atomic (interrupt context) & non-atomic (user context) functions to
 * convert a vaddr into a physical address. The size of the page
 * is returned in pageshift.
 * 	returns:
 * 		  0 - successful
 * 		< 0 - error code
 * 		  1 - (atomic only) try again in non-atomic context
 */
static int non_atomic_pte_lookup(struct vm_area_struct *vma,
				 unsigned long vaddr, int write,
				 unsigned long *paddr, int *pageshift)
{
	struct page *page;

#ifdef CONFIG_HUGETLB_PAGE
	*pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
#else
	*pageshift = PAGE_SHIFT;
#endif
	if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0)
		return -EFAULT;
	*paddr = page_to_phys(page);
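	/* Only the physical address is needed; release the page reference now */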
	put_page(page);
	return 0;
}

/*
 * atomic_pte_lookup
 *
 * Convert a user virtual address to a physical address
 * Only supports Intel large pages (2MB only) on x86_64.
 *	ZZZ - hugepage support is incomplete
 *
 * NOTE: mmap_sem is already held on entry to this function. This
 * guarantees existence of the page tables.
 */
static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
	int write, unsigned long *paddr, int *pageshift)
{
	pgd_t *pgdp;
	pmd_t *pmdp;
	pud_t *pudp;
	pte_t pte;

	pgdp = pgd_offset(vma->vm_mm, vaddr);
	if (unlikely(pgd_none(*pgdp)))
		goto err;

	pudp = pud_offset(pgdp, vaddr);
	if (unlikely(pud_none(*pudp)))
		goto err;

	pmdp = pmd_offset(pudp, vaddr);
	if (unlikely(pmd_none(*pmdp)))
		goto err;
#ifdef CONFIG_X86_64
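	/* A large (2MB) page is mapped directly by the pmd entry */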
	if (unlikely(pmd_large(*pmdp)))
		pte = *(pte_t *) pmdp;
	else
#endif
		pte = *pte_offset_kernel(pmdp, vaddr);

	if (unlikely(!pte_present(pte) ||
		     (write && (!pte_write(pte) || !pte_dirty(pte)))))
		return 1;

	*paddr = pte_pfn(pte) << PAGE_SHIFT;
#ifdef CONFIG_HUGETLB_PAGE
	*pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
#else
	*pageshift = PAGE_SHIFT;
#endif
	return 0;

err:
	return 1;
}

static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr,
		    int write, int atomic, unsigned long *gpa, int *pageshift)
{
	struct mm_struct *mm = gts->ts_mm;
	struct vm_area_struct *vma;
	unsigned long paddr;
	int ret, ps;

	vma = find_vma(mm, vaddr);
	if (!vma)
		goto inval;

	/*
	 * Atomic lookup is faster & usually works even if called in non-atomic
	 * context.
	 */
	rmb();	/* Must check ms_range_active before loading PTEs */
	ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &ps);
	if (ret) {
		if (atomic)
			goto upm;
		if (non_atomic_pte_lookup(vma, vaddr, write, &paddr, &ps))
			goto inval;
	}
	if (is_gru_paddr(paddr))
		goto inval;
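	/* Truncate the physical address to the base of the page */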
	paddr = paddr & ~((1UL << ps) - 1);
	*gpa = uv_soc_phys_ram_to_gpa(paddr);
	*pageshift = ps;
	return VTOP_SUCCESS;

inval:
	return VTOP_INVALID;
upm:
	return VTOP_RETRY;
}


/*
 * Flush a CBE from cache. The CBE is clean in the cache. Dirty the
 * CBE cacheline so that the line will be written back to the home agent.
 * Otherwise the line may be silently dropped. This has no impact
 * except on performance.
 */
static void gru_flush_cache_cbe(struct gru_control_block_extended *cbe)
{
	if (unlikely(cbe)) {
		cbe->cbrexecstatus = 0;         /* make CL dirty */
		gru_flush_cache(cbe);
	}
}

/*
 * Preload the TLB with entries that may be required. Currently, preloading
 * is implemented only for BCOPY. Preload <tlb_preload_count> pages OR to
 * the end of the bcopy transfer, whichever is smaller.
 */
static void gru_preload_tlb(struct gru_state *gru,
			struct gru_thread_state *gts, int atomic,
			unsigned long fault_vaddr, int asid, int write,
			unsigned char tlb_preload_count,
			struct gru_tlb_fault_handle *tfh,
			struct gru_control_block_extended *cbe)
{
	unsigned long vaddr = 0, gpa;
	int ret, pageshift;

	if (cbe->opccpy != OP_BCOPY)
		return;

	if (fault_vaddr == cbe->cbe_baddr0)
		vaddr = fault_vaddr + GRU_CACHE_LINE_BYTES * cbe->cbe_src_cl - 1;
	else if (fault_vaddr == cbe->cbe_baddr1)
		vaddr = fault_vaddr + (1 << cbe->xtypecpy) * cbe->cbe_nelemcur - 1;

	fault_vaddr &= PAGE_MASK;
	vaddr &= PAGE_MASK;
	vaddr = min(vaddr, fault_vaddr + tlb_preload_count * PAGE_SIZE);

	while (vaddr > fault_vaddr) {
		ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift);
		if (ret || tfh_write_only(tfh, gpa, GAA_RAM, vaddr, asid, write,
					  GRU_PAGESIZE(pageshift)))
			return;
		gru_dbg(grudev,
			"%s: gid %d, gts 0x%p, tfh 0x%p, vaddr 0x%lx, asid 0x%x, rw %d, ps %d, gpa 0x%lx\n",
			atomic ? "atomic" : "non-atomic", gru->gs_gid, gts, tfh,
			vaddr, asid, write, pageshift, gpa);
		vaddr -= PAGE_SIZE;
		STAT(tlb_preload_page);
	}
}

/*
 * Drop a TLB entry into the GRU. The fault is described by info in a TFH.
 *	Input:
 *		cb    Address of user CBR. Null if not running in user context
 * 	Return:
 * 		  0 = dropin, exception, or switch to UPM successful
 * 		  1 = range invalidate active
 * 		< 0 = error code
 *
 */
static int gru_try_dropin(struct gru_state *gru,
			  struct gru_thread_state *gts,
			  struct gru_tlb_fault_handle *tfh,
			  struct gru_instruction_bits *cbk)
{
	struct gru_control_block_extended *cbe = NULL;
	unsigned char tlb_preload_count = gts->ts_tlb_preload_count;
	int pageshift = 0, asid, write, ret, atomic = !cbk, indexway;
	unsigned long gpa = 0, vaddr = 0;

	/*
	 * NOTE: The GRU contains magic hardware that eliminates races between
	 * TLB invalidates and TLB dropins. If an invalidate occurs
	 * in the window between reading the TFH and the subsequent TLB dropin,
	 * the dropin is ignored. This eliminates the need for additional locks.
	 */

	/*
	 * Prefetch the CBE if doing TLB preloading
	 */
	if (unlikely(tlb_preload_count)) {
		cbe = gru_tfh_to_cbe(tfh);
		prefetchw(cbe);
	}

	/*
	 * Error if TFH state is IDLE or FMM mode & the user is issuing a UPM call.
	 * Might be a hardware race OR a stupid user. Ignore FMM because FMM
	 * is a transient state.
	 */
	if (tfh->status != TFHSTATUS_EXCEPTION) {
		gru_flush_cache(tfh);
		sync_core();
		if (tfh->status != TFHSTATUS_EXCEPTION)
			goto failnoexception;
		STAT(tfh_stale_on_fault);
	}
	if (tfh->state == TFHSTATE_IDLE)
		goto failidle;
	if (tfh->state == TFHSTATE_MISS_FMM && cbk)
		goto failfmm;

	write = (tfh->cause & TFHCAUSE_TLB_MOD) != 0;
	vaddr = tfh->missvaddr;
	asid = tfh->missasid;
	indexway = tfh->indexway;
	if (asid == 0)
		goto failnoasid;

	rmb();	/* TFH must be cache resident before reading ms_range_active */

	/*
	 * TFH is cache resident - at least briefly. Fail the dropin
	 * if a range invalidate is active.
	 */
	if (atomic_read(&gts->ts_gms->ms_range_active))
		goto failactive;

	ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift);
	if (ret == VTOP_INVALID)
		goto failinval;
	if (ret == VTOP_RETRY)
		goto failupm;

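	/*
	 * First fault at this page size for the context: update the size
	 * mask. In atomic context, or if the CCH update fails, force a
	 * CCH reload later and switch to UPM.
	 */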
	if (!(gts->ts_sizeavail & GRU_SIZEAVAIL(pageshift))) {
		gts->ts_sizeavail |= GRU_SIZEAVAIL(pageshift);
		if (atomic || !gru_update_cch(gts)) {
			gts->ts_force_cch_reload = 1;
			goto failupm;
		}
	}

	if (unlikely(cbe) && pageshift == PAGE_SHIFT) {
		gru_preload_tlb(gru, gts, atomic, vaddr, asid, write, tlb_preload_count, tfh, cbe);
		gru_flush_cache_cbe(cbe);
	}

	gru_cb_set_istatus_active(cbk);
	gts->ustats.tlbdropin++;
	tfh_write_restart(tfh, gpa, GAA_RAM, vaddr, asid, write,
			  GRU_PAGESIZE(pageshift));
	gru_dbg(grudev,
		"%s: gid %d, gts 0x%p, tfh 0x%p, vaddr 0x%lx, asid 0x%x, indexway 0x%x,"
		" rw %d, ps %d, gpa 0x%lx\n",
		atomic ? "atomic" : "non-atomic", gru->gs_gid, gts, tfh, vaddr, asid,
		indexway, write, pageshift, gpa);
	STAT(tlb_dropin);
	return 0;

failnoasid:
	/* No asid (delayed unload). */
	STAT(tlb_dropin_fail_no_asid);
	gru_dbg(grudev, "FAILED no_asid tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
	if (!cbk)
		tfh_user_polling_mode(tfh);
	else
		gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	return -EAGAIN;

failupm:
	/* Atomic failure switch CBR to UPM */
	tfh_user_polling_mode(tfh);
	gru_flush_cache_cbe(cbe);
	STAT(tlb_dropin_fail_upm);
	gru_dbg(grudev, "FAILED upm tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
	return 1;

failfmm:
	/* FMM state on UPM call */
	gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	STAT(tlb_dropin_fail_fmm);
	gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
	return 0;

failnoexception:
	/* TFH status did not show exception pending */
	gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	if (cbk)
		gru_flush_cache(cbk);
	STAT(tlb_dropin_fail_no_exception);
	gru_dbg(grudev, "FAILED non-exception tfh: 0x%p, status %d, state %d\n",
		tfh, tfh->status, tfh->state);
	return 0;

failidle:
	/* TFH state was idle  - no miss pending */
	gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	if (cbk)
		gru_flush_cache(cbk);
	STAT(tlb_dropin_fail_idle);
	gru_dbg(grudev, "FAILED idle tfh: 0x%p, state %d\n", tfh, tfh->state);
	return 0;

failinval:
	/* All errors (atomic & non-atomic) switch CBR to EXCEPTION state */
	tfh_exception(tfh);
	gru_flush_cache_cbe(cbe);
	STAT(tlb_dropin_fail_invalid);
	gru_dbg(grudev, "FAILED inval tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
	return -EFAULT;

failactive:
	/* Range invalidate active. Switch to UPM iff atomic */
	if (!cbk)
		tfh_user_polling_mode(tfh);
	else
		gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	STAT(tlb_dropin_fail_range_active);
	gru_dbg(grudev, "FAILED range active: tfh 0x%p, vaddr 0x%lx\n",
		tfh, vaddr);
	return 1;
}

/*
 * Process an external interrupt from the GRU. This interrupt is
 * caused by a TLB miss.
 * Note that this is the interrupt handler that is registered with linux
 * interrupt handlers.
 */
static irqreturn_t gru_intr(int chiplet, int blade)
{
	struct gru_state *gru;
	struct gru_tlb_fault_map imap, dmap;
	struct gru_thread_state *gts;
	struct gru_tlb_fault_handle *tfh = NULL;
	struct completion *cmp;
	int cbrnum, ctxnum;

	STAT(intr);

	gru = &gru_base[blade]->bs_grus[chiplet];
	if (!gru) {
		dev_err(grudev, "GRU: invalid interrupt: cpu %d, chiplet %d\n",
			raw_smp_processor_id(), chiplet);
		return IRQ_NONE;
	}
	get_clear_fault_map(gru, &imap, &dmap);
	gru_dbg(grudev,
		"cpu %d, chiplet %d, gid %d, imap %016lx %016lx, dmap %016lx %016lx\n",
		smp_processor_id(), chiplet, gru->gs_gid,
		imap.fault_bits[0], imap.fault_bits[1],
		dmap.fault_bits[0], dmap.fault_bits[1]);

	for_each_cbr_in_tfm(cbrnum, dmap.fault_bits) {
		STAT(intr_cbr);
		cmp = gru->gs_blade->bs_async_wq;
		if (cmp)
			complete(cmp);
		gru_dbg(grudev, "gid %d, cbr_done %d, done %d\n",
			gru->gs_gid, cbrnum, cmp ? cmp->done : -1);
	}

	for_each_cbr_in_tfm(cbrnum, imap.fault_bits) {
		STAT(intr_tfh);
		tfh = get_tfh_by_index(gru, cbrnum);
		prefetchw(tfh);	/* Helps on hdw, required for emulator */

		/*
		 * When hardware sets a bit in the faultmap, it implicitly
		 * locks the GRU context so that it cannot be unloaded.
		 * The gts cannot change until a TFH start/writestart command
		 * is issued.
		 */
		ctxnum = tfh->ctxnum;
		gts = gru->gs_gts[ctxnum];

		/* Spurious interrupts can cause this. Ignore. */
		if (!gts) {
			STAT(intr_spurious);
			continue;
		}

		/*
		 * This is running in interrupt context. Trylock the mmap_sem.
		 * If it fails, retry the fault in user context.
		 */
		gts->ustats.fmm_tlbmiss++;
		if (!gts->ts_force_cch_reload &&
					down_read_trylock(&gts->ts_mm->mmap_sem)) {
			gru_try_dropin(gru, gts, tfh, NULL);
			up_read(&gts->ts_mm->mmap_sem);
		} else {
			tfh_user_polling_mode(tfh);
			STAT(intr_mm_lock_failed);
		}
	}
	return IRQ_HANDLED;
}

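/*
 * Per-chiplet interrupt entry points. gru0_intr/gru1_intr service the GRU
 * chiplets on the local blade; gru_intr_mblade services blades that have
 * no cpus of their own.
 */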
irqreturn_t gru0_intr(int irq, void *dev_id)
{
	return gru_intr(0, uv_numa_blade_id());
}

irqreturn_t gru1_intr(int irq, void *dev_id)
{
	return gru_intr(1, uv_numa_blade_id());
}

irqreturn_t gru_intr_mblade(int irq, void *dev_id)
{
	int blade;

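	/* Blades with local cpus service their own GRU interrupts; skip them */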
	for_each_possible_blade(blade) {
		if (uv_blade_nr_possible_cpus(blade))
			continue;
		gru_intr(0, blade);
		gru_intr(1, blade);
	}
	return IRQ_HANDLED;
}


static int gru_user_dropin(struct gru_thread_state *gts,
			   struct gru_tlb_fault_handle *tfh,
			   void *cb)
{
	struct gru_mm_struct *gms = gts->ts_gms;
	int ret;

	gts->ustats.upm_tlbmiss++;
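	/*
	 * Wait for any active range invalidates to finish, then attempt the
	 * dropin. Retry whenever the dropin reports a transient failure.
	 */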
	while (1) {
		wait_event(gms->ms_wait_queue,
			   atomic_read(&gms->ms_range_active) == 0);
		prefetchw(tfh);	/* Helps on hdw, required for emulator */
		ret = gru_try_dropin(gts->ts_gru, gts, tfh, cb);
		if (ret <= 0)
			return ret;
		STAT(call_os_wait_queue);
	}
}

/*
 * This interface is called as a result of a user detecting a "call OS" bit
 * in a user CB. Normally means that a TLB fault has occurred.
 * 	cb - user virtual address of the CB
 */
int gru_handle_user_call_os(unsigned long cb)
{
	struct gru_tlb_fault_handle *tfh;
	struct gru_thread_state *gts;
	void *cbk;
	int ucbnum, cbrnum, ret = -EINVAL;

	STAT(call_os);

	/* sanity check the cb pointer */
	ucbnum = get_cb_number((void *)cb);
	if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB)
		return -EINVAL;

	gts = gru_find_lock_gts(cb);
	if (!gts)
		return -EINVAL;
	gru_dbg(grudev, "address 0x%lx, gid %d, gts 0x%p\n", cb, gts->ts_gru ? gts->ts_gru->gs_gid : -1, gts);

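	/* The CB number must fall within the CBRs allocated to this context */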
	if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE)
		goto exit;

	gru_check_context_placement(gts);

	/*
	 * CCH may contain stale data if ts_force_cch_reload is set.
	 */
	if (gts->ts_gru && gts->ts_force_cch_reload) {
		gts->ts_force_cch_reload = 0;
		gru_update_cch(gts);
	}

	ret = -EAGAIN;
	cbrnum = thread_cbr_number(gts, ucbnum);
	if (gts->ts_gru) {
		tfh = get_tfh_by_index(gts->ts_gru, cbrnum);
		cbk = get_gseg_base_address_cb(gts->ts_gru->gs_gru_base_vaddr,
				gts->ts_ctxnum, ucbnum);
		ret = gru_user_dropin(gts, tfh, cbk);
	}
exit:
	gru_unlock_gts(gts);
	return ret;
}

/*
 * Fetch the exception detail information for a CB that terminated with
 * an exception.
 */
int gru_get_exception_detail(unsigned long arg)
{
	struct control_block_extended_exc_detail excdet;
	struct gru_control_block_extended *cbe;
	struct gru_thread_state *gts;
	int ucbnum, cbrnum, ret;

	STAT(user_exception);
	if (copy_from_user(&excdet, (void __user *)arg, sizeof(excdet)))
		return -EFAULT;

	gts = gru_find_lock_gts(excdet.cb);
	if (!gts)
		return -EINVAL;

	gru_dbg(grudev, "address 0x%lx, gid %d, gts 0x%p\n", excdet.cb, gts->ts_gru ? gts->ts_gru->gs_gid : -1, gts);
	ucbnum = get_cb_number((void *)excdet.cb);
	if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
		ret = -EINVAL;
	} else if (gts->ts_gru) {
		cbrnum = thread_cbr_number(gts, ucbnum);
		cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
		gru_flush_cache(cbe);	/* CBE not coherent */
	sync_core();		/* make sure we have current data */
		excdet.opc = cbe->opccpy;
		excdet.exopc = cbe->exopccpy;
		excdet.ecause = cbe->ecause;
		excdet.exceptdet0 = cbe->idef1upd;
		excdet.exceptdet1 = cbe->idef3upd;
		excdet.cbrstate = cbe->cbrstate;
		excdet.cbrexecstatus = cbe->cbrexecstatus;
		gru_flush_cache_cbe(cbe);
		ret = 0;
	} else {
		ret = -EAGAIN;
	}
	gru_unlock_gts(gts);

	gru_dbg(grudev,
		"cb 0x%lx, op %d, exopc %d, cbrstate %d, cbrexecstatus 0x%x, ecause 0x%x, "
		"exdet0 0x%lx, exdet1 0x%x\n",
		excdet.cb, excdet.opc, excdet.exopc, excdet.cbrstate, excdet.cbrexecstatus,
		excdet.ecause, excdet.exceptdet0, excdet.exceptdet1);
	if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
		ret = -EFAULT;
	return ret;
}

/*
 * User request to unload a context. Content is saved for possible reload.
 */
static int gru_unload_all_contexts(void)
{
	struct gru_thread_state *gts;
	struct gru_state *gru;
	int gid, ctxnum;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	foreach_gid(gid) {
		gru = GID_TO_GRU(gid);
		spin_lock(&gru->gs_lock);
		for (ctxnum = 0; ctxnum < GRU_NUM_CCH; ctxnum++) {
			gts = gru->gs_gts[ctxnum];
			if (gts && mutex_trylock(&gts->ts_ctxlock)) {
				spin_unlock(&gru->gs_lock);
				gru_unload_context(gts, 1);
				mutex_unlock(&gts->ts_ctxlock);
				spin_lock(&gru->gs_lock);
			}
		}
		spin_unlock(&gru->gs_lock);
	}
	return 0;
}

int gru_user_unload_context(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_unload_context_req req;

	STAT(user_unload_context);
	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;

	gru_dbg(grudev, "gseg 0x%lx\n", req.gseg);

	if (!req.gseg)
		return gru_unload_all_contexts();

	gts = gru_find_lock_gts(req.gseg);
	if (!gts)
		return -EINVAL;

	if (gts->ts_gru)
		gru_unload_context(gts, 1);
	gru_unlock_gts(gts);

	return 0;
}

/*
 * User request to flush a range of virtual addresses from the GRU TLB
 * (Mainly for testing).
 */
int gru_user_flush_tlb(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_flush_tlb_req req;
	struct gru_mm_struct *gms;

	STAT(user_flush_tlb);
	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;

	gru_dbg(grudev, "gseg 0x%lx, vaddr 0x%lx, len 0x%lx\n", req.gseg,
		req.vaddr, req.len);

	gts = gru_find_lock_gts(req.gseg);
	if (!gts)
		return -EINVAL;

	gms = gts->ts_gms;
	gru_unlock_gts(gts);
	gru_flush_tlb_range(gms, req.vaddr, req.len);

	return 0;
}

/*
 * Fetch GSEG statistics
 */
long gru_get_gseg_statistics(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_get_gseg_statistics_req req;

	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;

	/*
	 * The library creates arrays of contexts for threaded programs.
	 * If no gts exists in the array, the context has never been used & all
	 * statistics are implicitly 0.
	 */
	gts = gru_find_lock_gts(req.gseg);
	if (gts) {
		memcpy(&req.stats, &gts->ustats, sizeof(gts->ustats));
		gru_unlock_gts(gts);
	} else {
		memset(&req.stats, 0, sizeof(gts->ustats));
	}

	if (copy_to_user((void __user *)arg, &req, sizeof(req)))
		return -EFAULT;

	return 0;
}

/*
 * Register the current task as the user of the GSEG slice.
 * Needed for TLB fault interrupt targeting.
 */
int gru_set_context_option(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_set_context_option_req req;
	int ret = 0;

	STAT(set_context_option);
	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;
	gru_dbg(grudev, "op %d, gseg 0x%lx, value1 0x%lx\n", req.op, req.gseg, req.val1);

	gts = gru_find_lock_gts(req.gseg);
	if (!gts) {
		gts = gru_alloc_locked_gts(req.gseg);
		if (IS_ERR(gts))
			return PTR_ERR(gts);
	}

	switch (req.op) {
	case sco_blade_chiplet:
		/* Select blade/chiplet for GRU context */
		if (req.val0 < -1 || req.val0 >= GRU_CHIPLETS_PER_HUB ||
		    req.val1 < -1 || req.val1 >= GRU_MAX_BLADES ||
		    (req.val1 >= 0 && !gru_base[req.val1])) {
			ret = -EINVAL;
		} else {
			gts->ts_user_blade_id = req.val1;
			gts->ts_user_chiplet_id = req.val0;
			gru_check_context_placement(gts);
		}
		break;
	case sco_gseg_owner:
		/* Register the current task as the GSEG owner */
		gts->ts_tgid_owner = current->tgid;
		break;
	case sco_cch_req_slice:
		/* Set the CCH slice option */
		gts->ts_cch_req_slice = req.val1 & 3;
		break;
	default:
		ret = -EINVAL;
	}
	gru_unlock_gts(gts);

	return ret;
}