page_tables.c 25.7 KB
Newer Older
1 2 3 4 5 6 7 8
/*P:700 The pagetable code, on the other hand, still shows the scars of
 * previous encounters.  It's functional, and as neat as it can be in the
 * circumstances, but be wary, for these things are subtle and break easily.
 * The Guest provides a virtual to physical mapping, but we can neither trust
 * it nor use it: we verify and convert it here to point the hardware to the
 * actual Guest pages when running the Guest. :*/

/* Copyright (C) Rusty Russell IBM Corporation 2006.
 * GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
16
#include <asm/uaccess.h>
R
Rusty Russell 已提交
17 18
#include "lg.h"

19 20 21 22 23
/*M:008 We hold reference to pages, which prevents them from being swapped.
 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
 * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
 * could probably consider launching Guests as non-root. :*/

R
Rusty Russell 已提交
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
/*H:300
 * The Page Table Code
 *
 * We use two-level page tables for the Guest.  If you're not entirely
 * comfortable with virtual addresses, physical addresses and page tables then
 * I recommend you review lguest.c's "Page Table Handling" (with diagrams!).
 *
 * The Guest keeps page tables, but we maintain the actual ones here: these are
 * called "shadow" page tables.  Which is a very Guest-centric name: these are
 * the real page tables the CPU uses, although we keep them up to date to
 * reflect the Guest's.  (See what I mean about weird naming?  Since when do
 * shadows reflect anything?)
 *
 * Anyway, this is the most complicated part of the Host code.  There are seven
 * parts to this:
 *  (i) Setting up a page table entry for the Guest when it faults,
 *  (ii) Setting up the page table entry for the Guest stack,
 *  (iii) Setting up a page table entry when the Guest tells us it has changed,
 *  (iv) Switching page tables,
 *  (v) Flushing (throwing away) page tables,
 *  (vi) Mapping the Switcher when the Guest is about to run,
 *  (vii) Setting up the page tables initially.
 :*/


/* 1024 entries in a page table page maps 1024 pages: 4MB.  The Switcher is
 * conveniently placed at the top 4MB, so it uses a separate, complete PTE
 * page.  */
52
/* Top-level (PGD) slot reserved for the Switcher: the very last entry. */
#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
R
Rusty Russell 已提交
53

R
Rusty Russell 已提交
54 55 56
/* We actually need a separate PTE page for each CPU.  Remember that after the
 * Switcher code itself comes two pages for each CPU, and we don't want this
 * CPU's guest to see the pages of any other CPU. */
57
/* One pointer per CPU to that CPU's Switcher PTE page; filled in by
 * init_pagetables() and released by free_switcher_pte_pages(). */
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
R
Rusty Russell 已提交
58 59
/* Shorthand for the given CPU's Switcher PTE page pointer (init_pagetables()
 * also assigns through it). */
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)

R
Rusty Russell 已提交
60 61 62 63
/*H:320 With our shadow and Guest types established, we need to deal with
 * them: the page table code is curly enough to need helper functions to keep
 * it clear and clean.
 *
 * There are two functions which return pointers to the shadow (aka "real")
 * page tables.
 *
 * spgd_addr() takes the virtual address and returns a pointer to the top-level
 * page directory entry for that address.  Since we keep track of several page
 * tables, the "i" argument tells us which one we're interested in (it's
 * usually the current one). */
71
static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
{
	unsigned int slot = pgd_index(vaddr);

	/* The common case: an address below the Switcher's slot maps straight
	 * to the matching entry of the i'th shadow top level. */
	if (slot < SWITCHER_PGD_INDEX)
		return lg->pgdirs[i].pgdir + slot;

	/* The Guest must never reach the Switcher mappings: kill it, and hand
	 * back entry 0 so the caller still gets a valid pointer. */
	kill_guest(lg, "attempt to access switcher pages");
	return lg->pgdirs[i].pgdir;
}

R
Rusty Russell 已提交
84 85 86
/* This routine then takes the PGD entry given above, which contains the
 * address of the PTE page.  It then returns a pointer to the PTE entry for the
 * given address. */
87
static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
{
	/* Which of the PTRS_PER_PTE entries in the PTE page covers vaddr. */
	unsigned long slot = (vaddr >> PAGE_SHIFT) % PTRS_PER_PTE;
	pte_t *table;

	/* Callers must only pass a present shadow PGD entry. */
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));

	/* The PGD entry holds the PTE page's frame number: turn it into a
	 * physical address, then a kernel virtual address. */
	table = __va(pgd_pfn(spgd) << PAGE_SHIFT);
	return table + slot;
}

R
Rusty Russell 已提交
95 96
/* These two functions are just like the above two, except they access the Guest
 * page tables.  Hence they return a Guest address. */
R
Rusty Russell 已提交
97 98
static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
{
99
	unsigned int index = vaddr >> (PGDIR_SHIFT);
100
	return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t);
R
Rusty Russell 已提交
101 102 103
}

static unsigned long gpte_addr(struct lguest *lg,
104
			       pgd_t gpgd, unsigned long vaddr)
R
Rusty Russell 已提交
105
{
106 107 108
	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
	return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
R
Rusty Russell 已提交
109 110
}

R
Rusty Russell 已提交
111 112 113 114 115 116 117 118
/*H:350 This routine takes a page number given by the Guest and converts it to
 * an actual, physical page number.  It can fail for several reasons: the
 * virtual address might not be mapped by the Launcher, the write flag is set
 * and the page is read-only, or the write flag was set and the page was
 * shared so had to be copied, but we ran out of memory.
 *
 * This holds a reference to the page, so release_pte() is careful to
 * put that back. */
R
Rusty Russell 已提交
119 120 121
static unsigned long get_pfn(unsigned long virtpfn, int write)
{
	struct page *pinned;
	unsigned long pfn;
	int got;

	/* get_user_pages() needs the task's mmap_sem held for reading.  We ask
	 * for exactly one page at the Launcher virtual address of virtpfn and
	 * it reports how many pages it actually got. */
	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
			     1, write, 1, &pinned, NULL);
	/* Anything other than "one page" is a failure, signalled by -1UL. */
	pfn = (got == 1) ? page_to_pfn(pinned) : -1UL;
	up_read(&current->mm->mmap_sem);

	return pfn;
}

R
Rusty Russell 已提交
137 138 139 140
/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table
 * entry can be a little tricky.  The flags are (almost) the same, but the
 * Guest PTE contains a virtual page number: the CPU needs the real page
 * number. */
141
static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
{
	/* Shadow flags are the Guest's flags minus the global bit, which we
	 * never use in the real page tables. */
	unsigned long shadow_flags = pte_flags(gpte) & ~_PAGE_GLOBAL;
	/* Guest pages sit at an offset inside the Launcher's address space. */
	unsigned long launcher_base = (unsigned long)lg->mem_base / PAGE_SIZE;
	/* Kept in a plain unsigned long: the -1UL failure value from get_pfn()
	 * must be tested before it is packed into a PTE. */
	unsigned long host_pfn = get_pfn(launcher_base + pte_pfn(gpte), write);

	if (host_pfn == -1UL) {
		kill_guest(lg, "failed to get page %lu", pte_pfn(gpte));
		/* No page reference was taken, so the resulting entry must not
		 * look present or release_pte() would later drop a reference
		 * we never held. */
		shadow_flags = 0;
	}

	/* Combine the real page number with the (possibly cleared) flags. */
	return pfn_pte(host_pfn, __pgprot(shadow_flags));
}

R
Rusty Russell 已提交
170
/*H:460 And to complete the chain, release_pte() looks like this: */
171
static void release_pte(pte_t pte)
{
	/* A non-present entry holds no page reference: nothing to do. */
	if (!(pte_flags(pte) & _PAGE_PRESENT))
		return;

	/* Drop the reference get_user_pages() took in get_pfn(). */
	put_page(pfn_to_page(pte_pfn(pte)));
}
R
Rusty Russell 已提交
178
/*:*/
R
Rusty Russell 已提交
179

180
static void check_gpte(struct lguest *lg, pte_t gpte)
{
	/* A Guest PTE may not set the PWT or PSE flags... */
	int bad_flags = (pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) != 0;
	/* ...nor name a page at or beyond the Guest's pfn_limit. */
	int out_of_range = pte_pfn(gpte) >= lg->pfn_limit;

	if (bad_flags || out_of_range)
		kill_guest(lg, "bad page table entry");
}

187
static void check_gpgd(struct lguest *lg, pgd_t gpgd)
{
	/* Only the flags contained in _PAGE_TABLE are allowed on a Guest PGD
	 * entry... */
	int bad_flags = (pgd_flags(gpgd) & ~_PAGE_TABLE) != 0;
	/* ...and the PTE page it names must lie below the Guest's pfn_limit. */
	int out_of_range = pgd_pfn(gpgd) >= lg->pfn_limit;

	if (bad_flags || out_of_range)
		kill_guest(lg, "bad page directory entry");
}

R
Rusty Russell 已提交
193 194 195 196 197 198 199 200 201 202
/*H:330
 * (i) Setting up a page table entry for the Guest when it faults
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here.  That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true. */
R
Rusty Russell 已提交
203 204
int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
{
205 206
	pgd_t gpgd;
	pgd_t *spgd;
R
Rusty Russell 已提交
207
	unsigned long gpte_ptr;
208 209
	pte_t gpte;
	pte_t *spte;
R
Rusty Russell 已提交
210

R
Rusty Russell 已提交
211
	/* First step: get the top-level Guest page table entry. */
R
Rusty Russell 已提交
212
	gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
R
Rusty Russell 已提交
213
	/* Toplevel not present?  We can't map it in. */
214
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
R
Rusty Russell 已提交
215 216
		return 0;

R
Rusty Russell 已提交
217
	/* Now look at the matching shadow entry. */
R
Rusty Russell 已提交
218
	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
219
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
R
Rusty Russell 已提交
220
		/* No shadow entry: allocate a new shadow PTE page. */
R
Rusty Russell 已提交
221
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
R
Rusty Russell 已提交
222 223
		/* This is not really the Guest's fault, but killing it is
		 * simple for this corner case. */
R
Rusty Russell 已提交
224 225 226 227
		if (!ptepage) {
			kill_guest(lg, "out of memory allocating pte page");
			return 0;
		}
R
Rusty Russell 已提交
228
		/* We check that the Guest pgd is OK. */
R
Rusty Russell 已提交
229
		check_gpgd(lg, gpgd);
R
Rusty Russell 已提交
230 231
		/* And we copy the flags to the shadow PGD entry.  The page
		 * number in the shadow PGD is the page we just allocated. */
232
		*spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
R
Rusty Russell 已提交
233 234
	}

R
Rusty Russell 已提交
235 236
	/* OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later. */
R
Rusty Russell 已提交
237
	gpte_ptr = gpte_addr(lg, gpgd, vaddr);
R
Rusty Russell 已提交
238
	gpte = lgread(lg, gpte_ptr, pte_t);
R
Rusty Russell 已提交
239

R
Rusty Russell 已提交
240
	/* If this page isn't in the Guest page tables, we can't page it in. */
241
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
R
Rusty Russell 已提交
242 243
		return 0;

R
Rusty Russell 已提交
244 245
	/* Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write). */
246
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
R
Rusty Russell 已提交
247 248
		return 0;

R
Rusty Russell 已提交
249
	/* User access to a kernel page? (bit 3 == user access) */
250
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
R
Rusty Russell 已提交
251 252
		return 0;

R
Rusty Russell 已提交
253 254
	/* Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary). */
R
Rusty Russell 已提交
255
	check_gpte(lg, gpte);
R
Rusty Russell 已提交
256
	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
257 258
	gpte = pte_mkyoung(gpte);

R
Rusty Russell 已提交
259
	if (errcode & 2)
260
		gpte = pte_mkdirty(gpte);
R
Rusty Russell 已提交
261

R
Rusty Russell 已提交
262
	/* Get the pointer to the shadow PTE entry we're going to set. */
R
Rusty Russell 已提交
263
	spte = spte_addr(lg, *spgd, vaddr);
R
Rusty Russell 已提交
264 265
	/* If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry. */
R
Rusty Russell 已提交
266 267
	release_pte(*spte);

R
Rusty Russell 已提交
268 269
	/* If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()). */
270
	if (pte_dirty(gpte))
R
Rusty Russell 已提交
271
		*spte = gpte_to_spte(lg, gpte, 1);
272
	else
R
Rusty Russell 已提交
273 274 275 276
		/* If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we come back here when a write does actually ocur, so we can
		 * update the Guest's _PAGE_DIRTY flag. */
277
		*spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
R
Rusty Russell 已提交
278

R
Rusty Russell 已提交
279 280
	/* Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
R
Rusty Russell 已提交
281
	lgwrite(lg, gpte_ptr, pte_t, gpte);
R
Rusty Russell 已提交
282 283

	/* We succeeded in mapping the page! */
R
Rusty Russell 已提交
284 285 286
	return 1;
}

R
Rusty Russell 已提交
287 288 289 290 291 292 293 294
/*H:360 (ii) Setting up the page table entry for the Guest stack.
 *
 * Remember pin_stack_pages() which makes sure the stack is mapped?  It could
 * simply call demand_page(), but as we've seen that logic is quite long, and
 * usually the stack pages are already mapped anyway, so it's not required.
 *
 * This is a quick version which answers the question: is this virtual address
 * mapped by the shadow page tables, and is it writable? */
R
Rusty Russell 已提交
295 296
static int page_writable(struct lguest *lg, unsigned long vaddr)
{
297
	pgd_t *spgd;
R
Rusty Russell 已提交
298 299
	unsigned long flags;

R
Rusty Russell 已提交
300
	/* Look at the top level entry: is it present? */
R
Rusty Russell 已提交
301
	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
302
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
R
Rusty Russell 已提交
303 304
		return 0;

R
Rusty Russell 已提交
305 306
	/* Check the flags on the pte entry itself: it must be present and
	 * writable. */
307 308
	flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));

R
Rusty Russell 已提交
309 310 311
	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}

R
Rusty Russell 已提交
312 313 314
/* So, when pin_stack_pages() asks us to pin a page, we check if it's already
 * in the page tables, and if not, we call demand_page() with error code 2
 * (meaning "write"). */
R
Rusty Russell 已提交
315 316 317 318 319 320
void pin_page(struct lguest *lg, unsigned long vaddr)
{
	/* Already mapped writable in the shadow page tables: nothing to do. */
	if (page_writable(lg, vaddr))
		return;

	/* Otherwise fault it in as if the Guest had written to it (error
	 * code 2 means "write"). */
	if (demand_page(lg, vaddr, 2))
		return;

	kill_guest(lg, "bad stack page %#lx", vaddr);
}

R
Rusty Russell 已提交
321
/*H:450 If we chase down the release_pgd() code, it looks like this: */
322
static void release_pgd(struct lguest *lg, pgd_t *spgd)
{
	pte_t *table;
	unsigned int slot;

	/* A non-present entry has no PTE page behind it. */
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
		return;

	/* Frame number -> physical address -> kernel virtual address of the
	 * PTE page. */
	table = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

	/* Drop the page reference held by every present entry in it... */
	for (slot = 0; slot < PTRS_PER_PTE; slot++)
		release_pte(table[slot]);

	/* ...free the PTE page itself... */
	free_page((unsigned long)table);

	/* ...and clear the PGD entry so it can never be released twice. */
	*spgd = __pgd(0);
}

R
Rusty Russell 已提交
341 342 343 344
/*H:440 (v) Flushing (throwing away) page tables,
 *
 * We saw flush_user_mappings() called when we re-used a top-level pgdir page.
 * It simply releases every PTE page from 0 up to the kernel address. */
R
Rusty Russell 已提交
345 346 347
static void flush_user_mappings(struct lguest *lg, int idx)
{
	/* Top-level slots below this index map addresses below the Guest
	 * kernel's start address. */
	const unsigned int user_slots = pgd_index(lg->kernel_address);
	unsigned int slot = 0;

	while (slot < user_slots) {
		release_pgd(lg, &lg->pgdirs[idx].pgdir[slot]);
		slot++;
	}
}

R
Rusty Russell 已提交
353 354
/* The Guest also has a hypercall to do this manually: it's used when a large
 * number of mappings have been changed. */
R
Rusty Russell 已提交
355 356
void guest_pagetable_flush_user(struct lguest *lg)
{
R
Rusty Russell 已提交
357
	/* Drop the userspace part of the current page table. */
R
Rusty Russell 已提交
358 359
	flush_user_mappings(lg, lg->pgdidx);
}
R
Rusty Russell 已提交
360
/*:*/
R
Rusty Russell 已提交
361

362 363 364 365 366 367 368
/* We walk down the guest page tables to get a guest-physical address */
unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
{
	pgd_t gpgd;
	pte_t gpte;

	/* First step: get the top-level Guest page table entry. */
R
Rusty Russell 已提交
369
	gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
370 371 372 373
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
		kill_guest(lg, "Bad address %#lx", vaddr);

R
Rusty Russell 已提交
374
	gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
375 376 377 378 379 380
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		kill_guest(lg, "Bad address %#lx", vaddr);

	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}

R
Rusty Russell 已提交
381 382 383
/* We keep several page tables.  This is a simple routine to find the page
 * table (if any) corresponding to this top-level address the Guest has given
 * us. */
R
Rusty Russell 已提交
384 385 386 387
/* Returns the index of the shadow page table which shadows the Guest
 * top-level "pgtable", or ARRAY_SIZE(lg->pgdirs) if there is none.
 *
 * Only slots with an allocated shadow pgdir are considered.  Matching on
 * gpgdir alone could return a slot which was never allocated (for example
 * when the Guest passes a value equal to an unused slot's gpgdir), and callers
 * such as guest_new_pagetable() and guest_set_pmd() use the returned slot's
 * pgdir without checking it. */
static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
			break;
	return i;
}

R
Rusty Russell 已提交
393 394 395
/*H:435 And this is us, creating the new page directory.  If we really do
 * allocate a new one (and so the kernel parts are not there), we set
 * blank_pgdir. */
R
Rusty Russell 已提交
396
static unsigned int new_pgdir(struct lguest *lg,
397
			      unsigned long gpgdir,
R
Rusty Russell 已提交
398 399 400 401
			      int *blank_pgdir)
{
	unsigned int next;

R
Rusty Russell 已提交
402 403
	/* We pick one entry at random to throw out.  Choosing the Least
	 * Recently Used might be better, but this is easy. */
R
Rusty Russell 已提交
404
	next = random32() % ARRAY_SIZE(lg->pgdirs);
R
Rusty Russell 已提交
405
	/* If it's never been allocated at all before, try now. */
R
Rusty Russell 已提交
406
	if (!lg->pgdirs[next].pgdir) {
407
		lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
R
Rusty Russell 已提交
408
		/* If the allocation fails, just keep using the one we have */
R
Rusty Russell 已提交
409 410 411
		if (!lg->pgdirs[next].pgdir)
			next = lg->pgdidx;
		else
R
Rusty Russell 已提交
412 413
			/* This is a blank page, so there are no kernel
			 * mappings: caller must map the stack! */
R
Rusty Russell 已提交
414 415
			*blank_pgdir = 1;
	}
R
Rusty Russell 已提交
416
	/* Record which Guest toplevel this shadows. */
417
	lg->pgdirs[next].gpgdir = gpgdir;
R
Rusty Russell 已提交
418 419 420 421 422 423
	/* Release all the non-kernel mappings. */
	flush_user_mappings(lg, next);

	return next;
}

R
Rusty Russell 已提交
424 425 426 427
/*H:430 (iv) Switching page tables
 *
 * This is what happens when the Guest changes page tables (ie. changes the
 * top-level pgdir).  This happens on almost every context switch. */
R
Rusty Russell 已提交
428 429 430 431
void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
{
	int newpgdir, repin = 0;

R
Rusty Russell 已提交
432
	/* Look to see if we have this one already. */
R
Rusty Russell 已提交
433
	newpgdir = find_pgdir(lg, pgtable);
R
Rusty Russell 已提交
434 435
	/* If not, we allocate or mug an existing one: if it's a fresh one,
	 * repin gets set to 1. */
R
Rusty Russell 已提交
436 437
	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
		newpgdir = new_pgdir(lg, pgtable, &repin);
R
Rusty Russell 已提交
438
	/* Change the current pgd index to the new one. */
R
Rusty Russell 已提交
439
	lg->pgdidx = newpgdir;
R
Rusty Russell 已提交
440
	/* If it was completely blank, we map in the Guest kernel stack */
R
Rusty Russell 已提交
441 442 443 444
	if (repin)
		pin_stack_pages(lg);
}

R
Rusty Russell 已提交
445 446
/*H:470 Finally, a routine which throws away everything: all PGD entries in all
 * the shadow page tables.  This is used when we destroy the Guest. */
R
Rusty Russell 已提交
447 448 449 450
static void release_all_pagetables(struct lguest *lg)
{
	unsigned int i, j;

R
Rusty Russell 已提交
451
	/* Every shadow pagetable this Guest has */
R
Rusty Russell 已提交
452 453
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir)
R
Rusty Russell 已提交
454
			/* Every PGD entry except the Switcher at the top */
R
Rusty Russell 已提交
455 456 457 458
			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
				release_pgd(lg, lg->pgdirs[i].pgdir + j);
}

R
Rusty Russell 已提交
459 460 461
/* We also throw away everything when a Guest tells us it's changed a kernel
 * mapping.  Since kernel mappings are in every page table, it's easiest to
 * throw them all away.  This is amazingly slow, but thankfully rare. */
R
Rusty Russell 已提交
462 463 464
void guest_pagetable_clear_all(struct lguest *lg)
{
	/* Drop every mapping in every shadow page table... */
	release_all_pagetables(lg);

	/* ...which also unmapped the Guest kernel stack, so pin it again. */
	pin_stack_pages(lg);
}

R
Rusty Russell 已提交
469 470 471 472 473 474 475 476 477 478 479 480 481 482
/*H:420 This is the routine which actually sets the page table entry for the
 * "idx"'th shadow page table.
 *
 * Normally, we can just throw out the old entry and replace it with 0: if they
 * use it demand_page() will put the new entry in.  We need to do this anyway:
 * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
 * is read from, and _PAGE_DIRTY when it's written to.
 *
 * But Avi Kivity pointed out that most Operating Systems (Linux included) set
 * these bits on PTEs immediately anyway.  This is done to save the CPU from
 * having to update them, but it helps us the same way: if they set
 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
 */
R
Rusty Russell 已提交
483
static void do_set_pte(struct lguest *lg, int idx,
		       unsigned long vaddr, pte_t gpte)
{
	/* Shadow page directory entry covering vaddr in table "idx". */
	pgd_t *shadow_top = spgd_addr(lg, idx, vaddr);
	pte_t *shadow_pte;

	/* Without a shadow PTE page there is no entry to update. */
	if (!(pgd_flags(*shadow_top) & _PAGE_PRESENT))
		return;

	/* Whatever was mapped here before goes away. */
	shadow_pte = spte_addr(lg, *shadow_top, vaddr);
	release_pte(*shadow_pte);

	/* Neither accessed nor dirty set: leave the entry empty and let
	 * demand_page() fill it in when the Guest touches the address. */
	if (!(pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))) {
		*shadow_pte = __pte(0);
		return;
	}

	/* The Guest pre-set accessed/dirty, so install the mapping right
	 * away and save a fault; ask for write access only if it's dirty. */
	check_gpte(lg, gpte);
	*shadow_pte = gpte_to_spte(lg, gpte,
				   pte_flags(gpte) & _PAGE_DIRTY);
}

R
Rusty Russell 已提交
508 509 510 511 512 513 514 515 516 517
/*H:410 Updating a PTE entry is a little trickier.
 *
 * We keep track of several different page tables (the Guest uses one for each
 * process, so it makes sense to cache at least a few).  Each of these have
 * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
 * all processes.  So when the page table above that address changes, we update
 * all the page tables, not just the current one.  This is rare.
 *
 * The benefit is that when we have to track a new page table, we can keep
 * all the kernel mappings.  This speeds up context switch immensely. */
R
Rusty Russell 已提交
518
void guest_set_pte(struct lguest *lg,
		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
	unsigned int slot;

	/* A user address only matters to the one page table it belongs to,
	 * and only if we are shadowing that page table. */
	if (vaddr < lg->kernel_address) {
		int shadow = find_pgdir(lg, gpgdir);

		if (shadow != ARRAY_SIZE(lg->pgdirs))
			do_set_pte(lg, shadow, vaddr, gpte);
		return;
	}

	/* A kernel address is shared by every page table, so every allocated
	 * shadow gets the update.  Slow, but rare. */
	for (slot = 0; slot < ARRAY_SIZE(lg->pgdirs); slot++) {
		if (lg->pgdirs[slot].pgdir)
			do_set_pte(lg, slot, vaddr, gpte);
	}
}

R
Rusty Russell 已提交
537 538 539 540 541 542 543 544 545 546 547 548 549 550
/*H:400
 * (iii) Setting up a page table entry when the Guest tells us it has changed.
 *
 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
 * with the other side of page tables while we're here: what happens when the
 * Guest asks for a page table to be updated?
 *
 * We already saw that demand_page() will fill in the shadow page tables when
 * needed, so we can simply remove shadow page table entries whenever the Guest
 * tells us they've changed.  When the Guest tries to use the new entry it will
 * fault and demand_page() will fix it up.
 *
 * So with that in mind here's our code to update a (top-level) PGD entry:
 */
551
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
	int shadow;

	/* Attempts to change the Switcher's slot are silently ignored: the
	 * Guest kernel seems to try this during early initialization. */
	if (idx >= SWITCHER_PGD_INDEX)
		return;

	/* Nothing to do unless we shadow the page table in question. */
	shadow = find_pgdir(lg, gpgdir);
	if (shadow >= ARRAY_SIZE(lg->pgdirs))
		return;

	/* Discard the stale shadow entry and everything beneath it. */
	release_pgd(lg, &lg->pgdirs[shadow].pgdir[idx]);
}

R
Rusty Russell 已提交
567 568 569 570
/*H:500 (vii) Setting up the page tables initially.
 *
 * When a Guest is first created, the Launcher tells us where the toplevel of
 * its first page table is.  We set some things up here: */
R
Rusty Russell 已提交
571 572
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
{
R
Rusty Russell 已提交
573 574
	/* We start on the first shadow page table, and give it a blank PGD
	 * page. */
R
Rusty Russell 已提交
575
	lg->pgdidx = 0;
576
	lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
577
	lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL);
R
Rusty Russell 已提交
578 579 580 581 582
	if (!lg->pgdirs[lg->pgdidx].pgdir)
		return -ENOMEM;
	return 0;
}

583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600
/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lguest *lg)
{
	int failed;

	/* Learn where the Guest kernel starts: everything above this address
	 * is kernel memory. */
	failed = get_user(lg->kernel_address, &lg->lguest_data->kernel_address);

	/* Tell the Guest to keep out of the top 4MB of virtual addresses,
	 * which belong to the Switcher. */
	if (!failed)
		failed = put_user(4U*1024*1024, &lg->lguest_data->reserve_mem);

	/* And tell it which Guest top level we are currently shadowing. */
	if (!failed)
		failed = put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir);

	if (failed)
		kill_guest(lg, "bad guest page %p", lg->lguest_data);

	/* flush_user_mappings() walks slots 0 up to
	 * pgd_index(lg->kernel_address) and relies on never reaching the
	 * Switcher's slot, so reject a kernel address which would. */
	if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
		kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
}

R
Rusty Russell 已提交
601
/* When a Guest dies, our cleanup is fairly simple. */
R
Rusty Russell 已提交
602 603 604 605
void free_guest_pagetable(struct lguest *lg)
{
	unsigned int slot = 0;

	/* First release everything hanging off the shadow top levels. */
	release_all_pagetables(lg);

	/* Then the top-level pages themselves.  Unused slots hold 0, which
	 * free_page() accepts. */
	while (slot < ARRAY_SIZE(lg->pgdirs)) {
		free_page((unsigned long)lg->pgdirs[slot].pgdir);
		slot++;
	}
}

R
Rusty Russell 已提交
613 614 615 616 617
/*H:480 (vi) Mapping the Switcher when the Guest is about to run.
 *
 * The Switcher and the two pages for this CPU need to be available to the
 * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
 * for each CPU already set up, we just need to hook them in. */
R
Rusty Russell 已提交
618 619
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
{
620 621 622
	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
	pgd_t switcher_pgd;
	pte_t regs_pte;
R
Rusty Russell 已提交
623

R
Rusty Russell 已提交
624 625
	/* Make the last PGD entry for this Guest point to the Switcher's PTE
	 * page for this CPU (with appropriate flags). */
626 627
	switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);

R
Rusty Russell 已提交
628 629
	lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

R
Rusty Russell 已提交
630 631 632 633 634 635 636
	/* We also change the Switcher PTE page.  When we're running the Guest,
	 * we want the Guest's "regs" page to appear where the first Switcher
	 * page for this CPU is.  This is an optimization: when the Switcher
	 * saves the Guest registers, it saves them into the first page of this
	 * CPU's "struct lguest_pages": if we make sure the Guest's register
	 * page is already mapped there, we don't have to copy them out
	 * again. */
637 638
	regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL));
	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
R
Rusty Russell 已提交
639
}
R
Rusty Russell 已提交
640
/*:*/
R
Rusty Russell 已提交
641 642 643 644 645 646 647 648 649

static void free_switcher_pte_pages(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		free_page((long)switcher_pte_page(i));
}

R
Rusty Russell 已提交
650 651 652 653
/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given
 * the CPU number and the "struct page"s for the Switcher code itself.
 *
 * Currently the Switcher is less than a page long, so "pages" is always 1. */
R
Rusty Russell 已提交
654 655 656 657 658
static __init void populate_switcher_pte_page(unsigned int cpu,
					      struct page *switcher_page[],
					      unsigned int pages)
{
	unsigned int i;
659
	pte_t *pte = switcher_pte_page(cpu);
R
Rusty Russell 已提交
660

R
Rusty Russell 已提交
661
	/* The first entries are easy: they map the Switcher code. */
R
Rusty Russell 已提交
662
	for (i = 0; i < pages; i++) {
663 664
		pte[i] = mk_pte(switcher_page[i],
				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
R
Rusty Russell 已提交
665 666
	}

R
Rusty Russell 已提交
667
	/* The only other thing we map is this CPU's pair of pages. */
R
Rusty Russell 已提交
668 669
	i = pages + cpu*2;

R
Rusty Russell 已提交
670
	/* First page (Guest registers) is writable from the Guest */
671 672 673
	pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));

R
Rusty Russell 已提交
674 675
	/* The second page contains the "struct lguest_ro_state", and is
	 * read-only. */
676 677
	pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
R
Rusty Russell 已提交
678 679
}

R
Rusty Russell 已提交
680 681
/*H:510 At boot or module load time, init_pagetables() allocates and populates
 * the Switcher PTE page for each CPU. */
R
Rusty Russell 已提交
682 683 684 685 686
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		pte_t *fresh = (pte_t *)get_zeroed_page(GFP_KERNEL);

		switcher_pte_page(cpu) = fresh;
		if (!fresh) {
			/* Undo the allocations made for earlier CPUs. */
			free_switcher_pte_pages();
			return -ENOMEM;
		}
		/* Fill in the Switcher mappings for this CPU. */
		populate_switcher_pte_page(cpu, switcher_page, pages);
	}
	return 0;
}
R
Rusty Russell 已提交
696
/*:*/
R
Rusty Russell 已提交
697

R
Rusty Russell 已提交
698
/* Cleaning up simply involves freeing the PTE page for each CPU. */
R
Rusty Russell 已提交
699 700 701 702
void free_pagetables(void)
{
	/* The per-CPU Switcher PTE pages are all this file allocates globally
	 * (see init_pagetables()). */
	free_switcher_pte_pages();
}