/*
 * Xen leaves the responsibility for maintaining p2m mappings to the
 * guests themselves, but it must also access and update the p2m array
 * during suspend/resume when all the pages are reallocated.
 *
 * The logical flat p2m table is mapped to a linear kernel memory area.
 * For accesses by Xen a three-level tree linked via mfns only is set up to
 * allow the address space to be sparse.
 *
 *               Xen
 *                |
 *          p2m_top_mfn
 *              /   \
 * p2m_mid_mfn p2m_mid_mfn
 *         /           /
 *  p2m p2m p2m ...
 *
 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
 *
 * The p2m_top_mfn level is limited to 1 page, so the maximum representable
 * pseudo-physical address space is:
 *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
 *
 * P2M_PER_PAGE depends on the architecture, as a mfn is always
 * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), leading to
 * 512 and 1024 entries respectively.
 *
 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
 *
 * However, not all entries are filled with MFNs. Any leaf, middle or top
 * entry that has no backing page is treated as "missing", so (for example)
 *  pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
 *
 * We have a dedicated page p2m_missing with all entries being
 * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m
 * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns.
 *
 * We also have the possibility of setting 1-1 mappings on certain regions, so
 * that:
 *  pfn_to_mfn(0xc0000)=0xc0000
 *
 * The benefit of this is that for non-RAM regions (think PCI BARs, or ACPI
 * spaces) we can create the mappings easily because the PFN value matches
 * the MFN.
 *
 * For this to work efficiently we have one new page p2m_identity. All entries
 * in p2m_identity are set to INVALID_P2M_ENTRY type (the Xen toolstack only
 * recognizes that and MFNs, no other fancy value).
 *
 * On lookup we spot that the entry points to p2m_identity and return the
 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
 * If the entry points to an allocated page, we just proceed as before and
 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
 * the appropriate functions (pfn_to_mfn).
 *
 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
 * non-identity pfn. To protect ourselves against that, we elect to set (and
 * get) the IDENTITY_FRAME_BIT on all identity mapped PFNs.
 */
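
/*
 * Worked example (illustrative only, assuming the 64-bit layout with
 * P2M_PER_PAGE == 512 and P2M_MID_PER_PAGE == 512): for pfn 0x12345
 *
 *  p2m_top_index(0x12345) = 0x12345 / (512 * 512) = 0
 *  p2m_mid_index(0x12345) = (0x12345 / 512) % 512 = 145
 *  p2m_index(0x12345)     = 0x12345 % 512         = 325
 *
 * so the mfn lives in entry 325 of the p2m leaf page whose own mfn is
 * recorded in p2m_top_mfn_p[0][145], which is the same slot as
 * xen_p2m_addr[0x12345] in the linear list.
 */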

#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/slab.h>

#include <asm/cache.h>
#include <asm/setup.h>
#include <asm/uaccess.h>

#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <xen/balloon.h>
#include <xen/grant_table.h>

#include "p2m.h"
#include "multicalls.h"
#include "xen-ops.h"

#define PMDS_PER_MID_PAGE	(P2M_MID_PER_PAGE / PTRS_PER_PTE)

static void __init m2p_override_init(void);

unsigned long *xen_p2m_addr __read_mostly;
EXPORT_SYMBOL_GPL(xen_p2m_addr);
unsigned long xen_p2m_size __read_mostly;
EXPORT_SYMBOL_GPL(xen_p2m_size);
unsigned long xen_max_p2m_pfn __read_mostly;
EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);

static DEFINE_SPINLOCK(p2m_update_lock);

static unsigned long *p2m_mid_missing_mfn;
static unsigned long *p2m_top_mfn;
static unsigned long **p2m_top_mfn_p;
static unsigned long *p2m_missing;
static unsigned long *p2m_identity;
static pte_t *p2m_missing_pte;
static pte_t *p2m_identity_pte;

static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_P2M_PFN);
	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
}

static inline unsigned p2m_mid_index(unsigned long pfn)
{
	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_PER_PAGE;
}

static void p2m_top_mfn_init(unsigned long *top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
}

static void p2m_top_mfn_p_init(unsigned long **top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing_mfn;
}

static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = virt_to_mfn(leaf);
}

static void p2m_init(unsigned long *p2m)
{
	unsigned i;

	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m[i] = INVALID_P2M_ENTRY;
}

static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
{
	unsigned i;

	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m[i] = IDENTITY_FRAME(pfn + i);
}

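/*
 * Allocate a page for p2m data: before the slab allocator is up we have to
 * fall back to the boot memory allocator (free_p2m_page() below mirrors
 * this choice).
 */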
static void * __ref alloc_p2m_page(void)
{
	if (unlikely(!slab_is_available()))
		return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);

	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
}

static void __ref free_p2m_page(void *p)
{
	if (unlikely(!slab_is_available())) {
		free_bootmem((unsigned long)p, PAGE_SIZE);
		return;
	}

	free_page((unsigned long)p);
}

/*
 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
 *
 * This is called both at boot time, and after resuming from suspend:
 * - At boot time we're called rather early, and must use alloc_bootmem*()
 *   to allocate memory.
 *
 * - After resume we're called from within stop_machine, but the mfn
 *   tree should already be completely allocated.
 */
void __ref xen_build_mfn_list_list(void)
{
	unsigned long pfn, mfn;
	pte_t *ptep;
	unsigned int level, topidx, mididx;
	unsigned long *mid_mfn_p;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	/* Pre-initialize p2m_top_mfn to be completely missing */
	if (p2m_top_mfn == NULL) {
		p2m_mid_missing_mfn = alloc_p2m_page();
		p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);

		p2m_top_mfn_p = alloc_p2m_page();
		p2m_top_mfn_p_init(p2m_top_mfn_p);

		p2m_top_mfn = alloc_p2m_page();
		p2m_top_mfn_init(p2m_top_mfn);
	} else {
		/* Reinitialise, mfn's all change after migration */
		p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
	}

	for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN;
	     pfn += P2M_PER_PAGE) {
		topidx = p2m_top_index(pfn);
		mididx = p2m_mid_index(pfn);

		mid_mfn_p = p2m_top_mfn_p[topidx];
		ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn),
				      &level);
		BUG_ON(!ptep || level != PG_LEVEL_4K);
		mfn = pte_mfn(*ptep);
		ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));

		/* Don't bother allocating any mfn mid levels if
		 * they're just missing; just update the stored mfn,
		 * since all could have changed over a migrate.
		 */
		if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) {
			BUG_ON(mididx);
			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
			continue;
		}

		if (mid_mfn_p == p2m_mid_missing_mfn) {
			mid_mfn_p = alloc_p2m_page();
			p2m_mid_mfn_init(mid_mfn_p, p2m_missing);

			p2m_top_mfn_p[topidx] = mid_mfn_p;
		}

		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
		mid_mfn_p[mididx] = mfn;
	}
}

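/*
 * Publish the root of the mfn tree built above to Xen via the shared info
 * page, so that the hypervisor/toolstack can walk the p2m (e.g. during
 * save/restore).
 */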
void xen_setup_mfn_list_list(void)
{
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn);
	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long pfn;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list;
	xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE);

	for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++)
		xen_p2m_addr[pfn] = INVALID_P2M_ENTRY;

	xen_max_p2m_pfn = xen_p2m_size;
}

#define P2M_TYPE_IDENTITY	0
#define P2M_TYPE_MISSING	1
#define P2M_TYPE_PFN		2
#define P2M_TYPE_UNKNOWN	3

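/* Classify a p2m entry: identity mapped, missing, or a regular pfn->mfn. */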
static int xen_p2m_elem_type(unsigned long pfn)
{
	unsigned long mfn;

	if (pfn >= xen_p2m_size)
		return P2M_TYPE_IDENTITY;

	mfn = xen_p2m_addr[pfn];

	if (mfn == INVALID_P2M_ENTRY)
		return P2M_TYPE_MISSING;

	if (mfn & IDENTITY_FRAME_BIT)
		return P2M_TYPE_IDENTITY;

	return P2M_TYPE_PFN;
}

static void __init xen_rebuild_p2m_list(unsigned long *p2m)
{
	unsigned int i, chunk;
	unsigned long pfn;
	unsigned long *mfns;
	pte_t *ptep;
	pmd_t *pmdp;
	int type;

	p2m_missing = alloc_p2m_page();
	p2m_init(p2m_missing);
	p2m_identity = alloc_p2m_page();
	p2m_init(p2m_identity);

	p2m_missing_pte = alloc_p2m_page();
	paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT);
	p2m_identity_pte = alloc_p2m_page();
	paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		set_pte(p2m_missing_pte + i,
			pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO));
		set_pte(p2m_identity_pte + i,
			pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO));
	}

	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) {
		/*
		 * Try to map missing/identity PMDs or p2m-pages if possible.
		 * We have to respect the structure of the mfn_list_list
		 * which will be built just afterwards.
		 * Chunk size to test is one p2m page if we are in the middle
		 * of a mfn_list_list mid page and the complete mid page area
		 * if we are at index 0 of the mid page. Please note that a
		 * mid page might cover more than one PMD, e.g. on 32 bit PAE
		 * kernels.
		 */
		chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ?
			P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE;

		type = xen_p2m_elem_type(pfn);
		i = 0;
		if (type != P2M_TYPE_PFN)
			for (i = 1; i < chunk; i++)
				if (xen_p2m_elem_type(pfn + i) != type)
					break;
		if (i < chunk)
			/* Reset to minimal chunk size. */
			chunk = P2M_PER_PAGE;

		if (type == P2M_TYPE_PFN || i < chunk) {
			/* Use initial p2m page contents. */
#ifdef CONFIG_X86_64
			mfns = alloc_p2m_page();
			copy_page(mfns, xen_p2m_addr + pfn);
#else
			mfns = xen_p2m_addr + pfn;
#endif
			ptep = populate_extra_pte((unsigned long)(p2m + pfn));
			set_pte(ptep,
				pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL));
			continue;
		}

		if (chunk == P2M_PER_PAGE) {
			/* Map complete missing or identity p2m-page. */
			mfns = (type == P2M_TYPE_MISSING) ?
				p2m_missing : p2m_identity;
			ptep = populate_extra_pte((unsigned long)(p2m + pfn));
			set_pte(ptep,
				pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO));
			continue;
		}

		/* Complete missing or identity PMD(s) can be mapped. */
		ptep = (type == P2M_TYPE_MISSING) ?
			p2m_missing_pte : p2m_identity_pte;
		for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
			pmdp = populate_extra_pmd(
				(unsigned long)(p2m + pfn) + i * PMD_SIZE);
			set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE));
		}
	}
}

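/*
 * Set up the virtual area for the linear p2m list, rebuild the list there
 * and switch xen_p2m_addr over to it.
 */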
void __init xen_vmalloc_p2m_tree(void)
{
	static struct vm_struct vm;

	vm.flags = VM_ALLOC;
	vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn,
			PMD_SIZE * PMDS_PER_MID_PAGE);
	vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
	pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);

	xen_max_p2m_pfn = vm.size / sizeof(unsigned long);

	xen_rebuild_p2m_list(vm.addr);

	xen_p2m_addr = vm.addr;
	xen_p2m_size = xen_max_p2m_pfn;

	xen_inv_extra_mem();

	m2p_override_init();
}

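/*
 * Look up the mfn for a pfn in the linear p2m list. Out-of-range pfns and
 * pfns backed by the shared p2m_identity page are reported as identity
 * mapped (IDENTITY_FRAME); everything else returns the raw p2m entry.
 */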
unsigned long get_phys_to_machine(unsigned long pfn)
{
	pte_t *ptep;
	unsigned int level;

	if (unlikely(pfn >= xen_p2m_size)) {
		if (pfn < xen_max_p2m_pfn)
			return xen_chk_extra_mem(pfn);

		return IDENTITY_FRAME(pfn);
	}

	ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
	BUG_ON(!ptep || level != PG_LEVEL_4K);

	/*
	 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
	 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
	 * would be wrong.
	 */
	if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
		return IDENTITY_FRAME(pfn);

	return xen_p2m_addr[pfn];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);

/*
 * Allocate new pmd(s). It is checked whether the old pmd is still in place.
 * If not, nothing is changed. This is okay as the only reason for allocating
 * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by an individual
 * pmd. In case of PAE/x86-32 there are multiple pmds to allocate!
 */
static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg)
{
	pte_t *ptechk;
	pte_t *pteret = ptep;
	pte_t *pte_newpg[PMDS_PER_MID_PAGE];
	pmd_t *pmdp;
	unsigned int level;
	unsigned long flags;
	unsigned long vaddr;
	int i;

	/* Do all allocations first to bail out in error case. */
	for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
		pte_newpg[i] = alloc_p2m_page();
		if (!pte_newpg[i]) {
			for (i--; i >= 0; i--)
				free_p2m_page(pte_newpg[i]);

			return NULL;
		}
	}

	vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1);

	for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
		copy_page(pte_newpg[i], pte_pg);
		paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT);

		pmdp = lookup_pmd_address(vaddr);
		BUG_ON(!pmdp);

		spin_lock_irqsave(&p2m_update_lock, flags);

		ptechk = lookup_address(vaddr, &level);
		if (ptechk == pte_pg) {
			set_pmd(pmdp,
				__pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
			if (vaddr == (addr & ~(PMD_SIZE - 1)))
				pteret = pte_offset_kernel(pmdp, addr);
			pte_newpg[i] = NULL;
		}

		spin_unlock_irqrestore(&p2m_update_lock, flags);

		if (pte_newpg[i]) {
			paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT);
			free_p2m_page(pte_newpg[i]);
		}

		vaddr += PMD_SIZE;
	}

	return pteret;
}

/*
 * Fully allocate the p2m structure for a given pfn.  We need to check
 * that both the top and mid levels are allocated, and make sure the
 * parallel mfn tree is kept in sync.  We may race with other cpus, so
 * the new pages are installed with cmpxchg; if we lose the race then
 * simply free the page we allocated and use the one that's there.
 */
static bool alloc_p2m(unsigned long pfn)
{
	unsigned topidx, mididx;
	unsigned long *top_mfn_p, *mid_mfn;
	pte_t *ptep, *pte_pg;
	unsigned int level;
	unsigned long flags;
	unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
	unsigned long p2m_pfn;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);

	ptep = lookup_address(addr, &level);
	BUG_ON(!ptep || level != PG_LEVEL_4K);
	pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));

	if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) {
		/* PMD level is missing, allocate a new one */
		ptep = alloc_p2m_pmd(addr, ptep, pte_pg);
		if (!ptep)
			return false;
	}

	if (p2m_top_mfn) {
		top_mfn_p = &p2m_top_mfn[topidx];
		mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);

		BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);

		if (mid_mfn == p2m_mid_missing_mfn) {
			/* Separately check the mid mfn level */
			unsigned long missing_mfn;
			unsigned long mid_mfn_mfn;
			unsigned long old_mfn;

			mid_mfn = alloc_p2m_page();
			if (!mid_mfn)
				return false;

			p2m_mid_mfn_init(mid_mfn, p2m_missing);

			missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
			mid_mfn_mfn = virt_to_mfn(mid_mfn);
			old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
			if (old_mfn != missing_mfn) {
				free_p2m_page(mid_mfn);
				mid_mfn = mfn_to_virt(old_mfn);
			} else {
				p2m_top_mfn_p[topidx] = mid_mfn;
			}
		}
	} else {
		mid_mfn = NULL;
	}

	p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep));
	if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
	    p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
		/* p2m leaf page is missing */
		unsigned long *p2m;

		p2m = alloc_p2m_page();
		if (!p2m)
			return false;

		if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
			p2m_init(p2m);
		else
			p2m_init_identity(p2m, pfn);

		spin_lock_irqsave(&p2m_update_lock, flags);

		if (pte_pfn(*ptep) == p2m_pfn) {
			set_pte(ptep,
				pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
			if (mid_mfn)
				mid_mfn[mididx] = virt_to_mfn(p2m);
			p2m = NULL;
		}

		spin_unlock_irqrestore(&p2m_update_lock, flags);

		if (p2m)
			free_p2m_page(p2m);
	}

	return true;
}

unsigned long __init set_phys_range_identity(unsigned long pfn_s,
				      unsigned long pfn_e)
{
	unsigned long pfn;

	if (unlikely(pfn_s >= xen_p2m_size))
		return 0;

	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
		return pfn_e - pfn_s;

	if (pfn_s > pfn_e)
		return 0;

	if (pfn_e > xen_p2m_size)
		pfn_e = xen_p2m_size;

	for (pfn = pfn_s; pfn < pfn_e; pfn++)
		xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn);

	return pfn - pfn_s;
}

bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	pte_t *ptep;
	unsigned int level;

	/* don't track P2M changes in autotranslate guests */
	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
		return true;

	if (unlikely(pfn >= xen_p2m_size)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return true;
	}

	if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
		return true;

	ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
	BUG_ON(!ptep || level != PG_LEVEL_4K);

	if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing)))
		return mfn == INVALID_P2M_ENTRY;

	if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
		return mfn == IDENTITY_FRAME(pfn);

	return false;
}

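/*
 * Set the p2m entry for a pfn, allocating backing p2m pages on demand: if
 * __set_phys_to_machine() refuses the write because the pfn is still backed
 * by the shared missing/identity page, allocate a private p2m page via
 * alloc_p2m() and retry (see e.g. the FOREIGN_FRAME updates in
 * set_foreign_p2m_mapping() below).
 */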
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
		if (!alloc_p2m(pfn))
			return false;

		return __set_phys_to_machine(pfn, mfn);
	}

	return true;
}

#define M2P_OVERRIDE_HASH_SHIFT	10
#define M2P_OVERRIDE_HASH	(1 << M2P_OVERRIDE_HASH_SHIFT)

static struct list_head *m2p_overrides;
static DEFINE_SPINLOCK(m2p_override_lock);

static void __init m2p_override_init(void)
{
	unsigned i;

	m2p_overrides = alloc_bootmem_align(
				sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
				sizeof(unsigned long));

	for (i = 0; i < M2P_OVERRIDE_HASH; i++)
		INIT_LIST_HEAD(&m2p_overrides[i]);
}

static unsigned long mfn_hash(unsigned long mfn)
{
	return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}

/* Add an MFN override for a particular page */
static int m2p_add_override(unsigned long mfn, struct page *page,
			    struct gnttab_map_grant_ref *kmap_op)
{
	unsigned long flags;
	unsigned long pfn;
	unsigned long uninitialized_var(address);
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);
	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);
		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
			 "m2p_add_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	if (kmap_op != NULL) {
		if (!PageHighMem(page)) {
			struct multicall_space mcs =
				xen_mc_entry(sizeof(*kmap_op));

			MULTI_grant_table_op(mcs.mc,
					GNTTABOP_map_grant_ref, kmap_op, 1);

			xen_mc_issue(PARAVIRT_LAZY_MMU);
		}
	}
	spin_lock_irqsave(&m2p_override_lock, flags);
	list_add(&page->lru,  &m2p_overrides[mfn_hash(mfn)]);
	spin_unlock_irqrestore(&m2p_override_lock, flags);

	/* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
	 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
	 * pfn so that the following mfn_to_pfn(mfn) calls will return the
	 * pfn from the m2p_override (the backend pfn) instead.
	 * We need to do this because the pages shared by the frontend
	 * (xen-blkfront) can be already locked (lock_page, called by
	 * do_read_cache_page); when the userspace backend tries to use them
	 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
	 * do_blockdev_direct_IO is going to try to lock the same pages
	 * again resulting in a deadlock.
	 * As a side effect get_user_pages_fast might not be safe on the
	 * frontend pages while they are being shared with the backend,
	 * because mfn_to_pfn (that ends up being called by GUPF) will
	 * return the backend pfn rather than the frontend pfn. */
	pfn = mfn_to_pfn_no_overrides(mfn);
	if (__pfn_to_mfn(pfn) == mfn)
		set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));

	return 0;
}

int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
			    struct gnttab_map_grant_ref *kmap_ops,
			    struct page **pages, unsigned int count)
{
	int i, ret = 0;
	bool lazy = false;
	pte_t *pte;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	if (kmap_ops &&
	    !in_interrupt() &&
	    paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
		arch_enter_lazy_mmu_mode();
		lazy = true;
	}

	for (i = 0; i < count; i++) {
		unsigned long mfn, pfn;

		/* Do not add to override if the map failed. */
		if (map_ops[i].status)
			continue;

		if (map_ops[i].flags & GNTMAP_contains_pte) {
			pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
				(map_ops[i].host_addr & ~PAGE_MASK));
			mfn = pte_mfn(*pte);
		} else {
			mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
		}
		pfn = page_to_pfn(pages[i]);

		WARN_ON(PagePrivate(pages[i]));
		SetPagePrivate(pages[i]);
		set_page_private(pages[i], mfn);
		pages[i]->index = pfn_to_mfn(pfn);

		if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
			ret = -ENOMEM;
			goto out;
		}

		if (kmap_ops) {
			ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
			if (ret)
				goto out;
		}
	}

out:
	if (lazy)
		arch_leave_lazy_mmu_mode();

	return ret;
}
EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);

static struct page *m2p_find_override(unsigned long mfn)
{
	unsigned long flags;
	struct list_head *bucket;
	struct page *p, *ret;

	if (unlikely(!m2p_overrides))
		return NULL;

	ret = NULL;
	bucket = &m2p_overrides[mfn_hash(mfn)];

	spin_lock_irqsave(&m2p_override_lock, flags);

	list_for_each_entry(p, bucket, lru) {
		if (page_private(p) == mfn) {
			ret = p;
			break;
		}
	}

	spin_unlock_irqrestore(&m2p_override_lock, flags);

	return ret;
}

static int m2p_remove_override(struct page *page,
			       struct gnttab_map_grant_ref *kmap_op,
			       unsigned long mfn)
{
	unsigned long flags;
	unsigned long pfn;
	unsigned long uninitialized_var(address);
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);

	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);

		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
			 "m2p_remove_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	spin_lock_irqsave(&m2p_override_lock, flags);
	list_del(&page->lru);
	spin_unlock_irqrestore(&m2p_override_lock, flags);

	if (kmap_op != NULL) {
		if (!PageHighMem(page)) {
			struct multicall_space mcs;
			struct gnttab_unmap_and_replace *unmap_op;
			struct page *scratch_page = get_balloon_scratch_page();
			unsigned long scratch_page_address = (unsigned long)
				__va(page_to_pfn(scratch_page) << PAGE_SHIFT);

			/*
			 * It might be that we queued all the m2p grant table
			 * hypercalls in a multicall, then m2p_remove_override
			 * gets called before the multicall has actually been
			 * issued. In this case handle is going to be -1 because
			 * it hasn't been modified yet.
			 */
			if (kmap_op->handle == -1)
				xen_mc_flush();
			/*
			 * Now if kmap_op->handle is negative it means that the
			 * hypercall actually returned an error.
			 */
			if (kmap_op->handle == GNTST_general_error) {
				pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings",
					pfn, mfn);
				put_balloon_scratch_page();
				return -1;
			}

			xen_mc_batch();

			mcs = __xen_mc_entry(
				sizeof(struct gnttab_unmap_and_replace));
			unmap_op = mcs.args;
			unmap_op->host_addr = kmap_op->host_addr;
			unmap_op->new_addr = scratch_page_address;
			unmap_op->handle = kmap_op->handle;

			MULTI_grant_table_op(mcs.mc,
				GNTTABOP_unmap_and_replace, unmap_op, 1);

			mcs = __xen_mc_entry(0);
			MULTI_update_va_mapping(mcs.mc, scratch_page_address,
					pfn_pte(page_to_pfn(scratch_page),
					PAGE_KERNEL_RO), 0);

			xen_mc_issue(PARAVIRT_LAZY_MMU);

			kmap_op->host_addr = 0;
			put_balloon_scratch_page();
		}
	}

	/* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
	 * somewhere in this domain, even before being added to the
	 * m2p_override (see comment above in m2p_add_override).
	 * If there are no other entries in the m2p_override corresponding
	 * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
	 * the original pfn (the one shared by the frontend): the backend
	 * cannot do any IO on this page anymore because it has been
	 * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
	 * the original pfn causes mfn_to_pfn(mfn) to return the frontend
	 * pfn again. */
	mfn &= ~FOREIGN_FRAME_BIT;
	pfn = mfn_to_pfn_no_overrides(mfn);
	if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) &&
			m2p_find_override(mfn) == NULL)
		set_phys_to_machine(pfn, mfn);

	return 0;
}

int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
			      struct gnttab_map_grant_ref *kmap_ops,
			      struct page **pages, unsigned int count)
{
	int i, ret = 0;
	bool lazy = false;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	if (kmap_ops &&
	    !in_interrupt() &&
	    paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
		arch_enter_lazy_mmu_mode();
		lazy = true;
	}

	for (i = 0; i < count; i++) {
		unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
		unsigned long pfn = page_to_pfn(pages[i]);

		if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
			ret = -EINVAL;
			goto out;
		}

		set_page_private(pages[i], INVALID_P2M_ENTRY);
		WARN_ON(!PagePrivate(pages[i]));
		ClearPagePrivate(pages[i]);
		set_phys_to_machine(pfn, pages[i]->index);

		if (kmap_ops)
			ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
		if (ret)
			goto out;
	}

out:
	if (lazy)
		arch_leave_lazy_mmu_mode();
	return ret;
}
EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);

unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
{
	struct page *p = m2p_find_override(mfn);
	unsigned long ret = pfn;

	if (p)
		ret = page_to_pfn(p);

	return ret;
}
EXPORT_SYMBOL_GPL(m2p_find_override_pfn);

#ifdef CONFIG_XEN_DEBUG_FS
#include <linux/debugfs.h>
#include "debugfs.h"
static int p2m_dump_show(struct seq_file *m, void *v)
{
	static const char * const type_name[] = {
				[P2M_TYPE_IDENTITY] = "identity",
				[P2M_TYPE_MISSING] = "missing",
				[P2M_TYPE_PFN] = "pfn",
				[P2M_TYPE_UNKNOWN] = "abnormal"};
	unsigned long pfn, first_pfn;
	int type, prev_type;

	prev_type = xen_p2m_elem_type(0);
	first_pfn = 0;

	for (pfn = 0; pfn < xen_p2m_size; pfn++) {
		type = xen_p2m_elem_type(pfn);
		if (type != prev_type) {
			seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
				   type_name[prev_type]);
			prev_type = type;
			first_pfn = pfn;
		}
	}
	seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
		   type_name[prev_type]);
	return 0;
}

static int p2m_dump_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, p2m_dump_show, NULL);
}

static const struct file_operations p2m_dump_fops = {
	.open		= p2m_dump_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static struct dentry *d_mmu_debug;

static int __init xen_p2m_debugfs(void)
{
	struct dentry *d_xen = xen_init_debugfs();

	if (d_xen == NULL)
		return -ENOMEM;

	d_mmu_debug = debugfs_create_dir("mmu", d_xen);

	debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
	return 0;
}
fs_initcall(xen_p2m_debugfs);
#endif /* CONFIG_XEN_DEBUG_FS */