/*P:700 The pagetable code, on the other hand, still shows the scars of
 * previous encounters.  It's functional, and as neat as it can be in the
 * circumstances, but be wary, for these things are subtle and break easily.
 * The Guest provides a virtual to physical mapping, but we can neither trust
 * it nor use it: we verify and convert it here to point the hardware to the
 * actual Guest pages when running the Guest. :*/

/* Copyright (C) Rusty Russell IBM Corporation 2006.
 * GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include "lg.h"

/*M:008 We hold a reference to pages, which prevents them from being swapped.
 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
 * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
 * could probably consider launching Guests as non-root. :*/

/*H:300
 * The Page Table Code
 *
 * We use two-level page tables for the Guest.  If you're not entirely
 * comfortable with virtual addresses, physical addresses and page tables then
 * I recommend you review lguest.c's "Page Table Handling" (with diagrams!).
 *
 * The Guest keeps page tables, but we maintain the actual ones here: these are
 * called "shadow" page tables.  Which is a very Guest-centric name: these are
 * the real page tables the CPU uses, although we keep them up to date to
 * reflect the Guest's.  (See what I mean about weird naming?  Since when do
 * shadows reflect anything?)
 *
 * Anyway, this is the most complicated part of the Host code.  There are seven
 * parts to this:
 *  (i) Setting up a page table entry for the Guest when it faults,
 *  (ii) Setting up the page table entry for the Guest stack,
 *  (iii) Setting up a page table entry when the Guest tells us it has changed,
 *  (iv) Switching page tables,
 *  (v) Flushing (throwing away) page tables,
 *  (vi) Mapping the Switcher when the Guest is about to run,
 *  (vii) Setting up the page tables initially.
 :*/


/* 1024 entries in a page table page maps 1024 pages: 4MB.  The Switcher is
 * conveniently placed at the top 4MB, so it uses a separate, complete PTE
 * page.  */
#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
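
/* (Added sanity check, assuming the classic 32-bit two-level layout lguest
 * uses: 1024 PTEs * 4K pages = 4MB per PGD slot, and with PTRS_PER_PGD == 1024
 * the index 1023 slot covers 0xFFC00000 and up -- the top 4MB where the
 * Switcher sits.) */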

/* We actually need a separate PTE page for each CPU.  Remember that after the
 * Switcher code itself comes two pages for each CPU, and we don't want this
 * CPU's guest to see the pages of any other CPU. */
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)

/*H:320 With our shadow and Guest types established, we need to deal with
 * them: the page table code is curly enough to need helper functions to keep
 * it clear and clean.
 *
 * There are two functions which return pointers to the shadow (aka "real")
 * page tables.
 *
 * spgd_addr() takes the virtual address and returns a pointer to the top-level
 * page directory entry for that address.  Since we keep track of several page
 * tables, the "i" argument tells us which one we're interested in (it's
 * usually the current one). */
static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
{
	unsigned int index = pgd_index(vaddr);

	/* We kill any Guest trying to touch the Switcher addresses. */
	if (index >= SWITCHER_PGD_INDEX) {
		kill_guest(lg, "attempt to access switcher pages");
		index = 0;
	}
	/* Return a pointer to the index'th pgd entry for the i'th page table. */
	return &lg->pgdirs[i].pgdir[index];
}

/* This routine then takes the PGD entry given above, which contains the
 * address of the PTE page.  It then returns a pointer to the PTE entry for the
 * given address. */
static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
{
	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
	/* You should never call this if the PGD entry wasn't valid */
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
	return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE];
}

/* These two functions are just like the above two, except they access the Guest
 * page tables.  Hence they return a Guest address. */
static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
{
	unsigned int index = vaddr >> (PGDIR_SHIFT);
	return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t);
}

static unsigned long gpte_addr(struct lguest *lg,
			       pgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
	return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
}

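/* (Added for orientation, not part of the original file: on the 32-bit
 * two-level layout these helpers assume, a virtual address splits as
 *   pgd index = vaddr >> 22            (top 10 bits)
 *   pte index = (vaddr >> 12) & 0x3ff  (next 10 bits)
 *   offset    = vaddr & 0xfff          (bottom 12 bits)
 * so, for example, vaddr 0xC0101234 has pgd index 0x300, pte index 0x101 and
 * offset 0x234.  spgd_addr()/spte_addr() walk the shadow tables this way, and
 * gpgd_addr()/gpte_addr() do the same walk over the Guest's own tables.) */
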
/*H:350 This routine takes a page number given by the Guest and converts it to
 * an actual, physical page number.  It can fail for several reasons: the
 * virtual address might not be mapped by the Launcher, the write flag is set
 * and the page is read-only, or the write flag was set and the page was
 * shared so had to be copied, but we ran out of memory.
 *
 * This holds a reference to the page, so release_pte() is careful to
 * put that back. */
static unsigned long get_pfn(unsigned long virtpfn, int write)
{
	struct page *page;
	/* This value indicates failure. */
	unsigned long ret = -1UL;

	/* get_user_pages() is a complex interface: it gets the "struct
	 * vm_area_struct" and "struct page" associated with a range of pages.
	 * It also needs the task's mmap_sem held, and is not very quick.
	 * It returns the number of pages it got. */
	down_read(&current->mm->mmap_sem);
	if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
			   1, write, 1, &page, NULL) == 1)
		ret = page_to_pfn(page);
	up_read(&current->mm->mmap_sem);
	return ret;
}

/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table
 * entry can be a little tricky.  The flags are (almost) the same, but the
 * Guest PTE contains a virtual page number: the CPU needs the real page
 * number. */
static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
{
	unsigned long pfn, base, flags;

	/* The Guest sets the global flag, because it thinks that it is using
	 * PGE.  We only told it to use PGE so it would tell us whether it was
	 * flushing a kernel mapping or a userspace mapping.  We don't actually
	 * use the global bit, so throw it away. */
	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);

	/* The Guest's pages are offset inside the Launcher. */
	base = (unsigned long)lg->mem_base / PAGE_SIZE;

	/* We need a temporary "unsigned long" variable to hold the answer from
	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
	 * fit in spte.pfn.  get_pfn() finds the real physical number of the
	 * page, given the virtual number. */
	pfn = get_pfn(base + pte_pfn(gpte), write);
	if (pfn == -1UL) {
		kill_guest(lg, "failed to get page %lu", pte_pfn(gpte));
		/* When we destroy the Guest, we'll go through the shadow page
		 * tables and release_pte() them.  Make sure we don't think
		 * this one is valid! */
		flags = 0;
	}
	/* Now we assemble our shadow PTE from the page number and flags. */
	return pfn_pte(pfn, __pgprot(flags));
}
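
/* (Illustrative walk-through with made-up numbers: if the Guest's PTE names
 * guest page 5 with _PAGE_PRESENT|_PAGE_RW and this is a write fault, the
 * shadow PTE we hand the CPU holds whatever host page
 * get_pfn(lg->mem_base/PAGE_SIZE + 5, 1) pinned, with the same flags minus
 * _PAGE_GLOBAL.) */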

/*H:460 And to complete the chain, release_pte() looks like this: */
static void release_pte(pte_t pte)
{
	/* Remember that get_user_pages() took a reference to the page, in
	 * get_pfn()?  We have to put it back now. */
	if (pte_flags(pte) & _PAGE_PRESENT)
		put_page(pfn_to_page(pte_pfn(pte)));
}
/*:*/

static void check_gpte(struct lguest *lg, pte_t gpte)
{
	if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
	    || pte_pfn(gpte) >= lg->pfn_limit)
		kill_guest(lg, "bad page table entry");
}

static void check_gpgd(struct lguest *lg, pgd_t gpgd)
{
	if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit)
		kill_guest(lg, "bad page directory entry");
}

/*H:330
 * (i) Setting up a page table entry for the Guest when it faults
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here.  That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true. */
int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
{
	pgd_t gpgd;
	pgd_t *spgd;
	unsigned long gpte_ptr;
	pte_t gpte;
	pte_t *spte;

	/* First step: get the top-level Guest page table entry. */
	gpgd = __pgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
		return 0;

	/* Now look at the matching shadow entry. */
	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
		/* This is not really the Guest's fault, but killing it is
		 * simple for this corner case. */
		if (!ptepage) {
			kill_guest(lg, "out of memory allocating pte page");
			return 0;
		}
		/* We check that the Guest pgd is OK. */
		check_gpgd(lg, gpgd);
		/* And we copy the flags to the shadow PGD entry.  The page
		 * number in the shadow PGD is the page we just allocated. */
		*spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
	}

	/* OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later. */
	gpte_ptr = gpte_addr(lg, gpgd, vaddr);
	gpte = __pte(lgread_u32(lg, gpte_ptr));

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		return 0;

	/* Check they're not trying to write to a page the Guest wants
	 * read-only (the error code's write bit, value 2, is set). */
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
		return 0;

	/* User access to a kernel page? (the error code's user bit,
	 * value 4, is set) */
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
		return 0;

	/* Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary). */
	check_gpte(lg, gpte);
	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
	gpte = pte_mkyoung(gpte);

	if (errcode & 2)
		gpte = pte_mkdirty(gpte);

	/* Get the pointer to the shadow PTE entry we're going to set. */
	spte = spte_addr(lg, *spgd, vaddr);
	/* If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry. */
	release_pte(*spte);

	/* If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()). */
	if (pte_dirty(gpte))
		*spte = gpte_to_spte(lg, gpte, 1);
	else
		/* If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we come back here when a write does actually occur, so we can
		 * update the Guest's _PAGE_DIRTY flag. */
		*spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);

	/* Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
	lgwrite_u32(lg, gpte_ptr, pte_val(gpte));

	/* We succeeded in mapping the page! */
	return 1;
}

/*H:360 (ii) Setting up the page table entry for the Guest stack.
 *
 * Remember pin_stack_pages() which makes sure the stack is mapped?  It could
 * simply call demand_page(), but as we've seen that logic is quite long, and
 * usually the stack pages are already mapped anyway, so it's not required.
 *
 * This is a quick version which answers the question: is this virtual address
 * mapped by the shadow page tables, and is it writable? */
static int page_writable(struct lguest *lg, unsigned long vaddr)
{
	pgd_t *spgd;
	unsigned long flags;

	/* Look at the top level entry: is it present? */
	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
		return 0;

	/* Check the flags on the pte entry itself: it must be present and
	 * writable. */
	flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));

	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}

/* So, when pin_stack_pages() asks us to pin a page, we check if it's already
 * in the page tables, and if not, we call demand_page() with error code 2
 * (meaning "write"). */
void pin_page(struct lguest *lg, unsigned long vaddr)
{
	if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
		kill_guest(lg, "bad stack page %#lx", vaddr);
}

/*H:450 If we chase down the release_pgd() code, it looks like this: */
static void release_pgd(struct lguest *lg, pgd_t *spgd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		unsigned int i;
		/* Converting the pfn to find the actual PTE page is easy: turn
		 * the page number into a physical address, then convert to a
		 * virtual address (easy for kernel pages like this one). */
		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
		/* For each entry in the page, we might need to release it. */
		for (i = 0; i < PTRS_PER_PTE; i++)
			release_pte(ptepage[i]);
		/* Now we can free the page of PTEs */
		free_page((long)ptepage);
		/* And zero out the PGD entry so we never release it twice. */
		*spgd = __pgd(0);
	}
}

/*H:440 (v) Flushing (throwing away) page tables,
 *
 * We saw flush_user_mappings() called when we re-used a top-level pgdir page.
 * It simply releases every PTE page from 0 up to the kernel address. */
static void flush_user_mappings(struct lguest *lg, int idx)
{
	unsigned int i;
	/* Release every pgd entry up to the kernel's address. */
	for (i = 0; i < pgd_index(lg->page_offset); i++)
		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
}

/* The Guest also has a hypercall to do this manually: it's used when a large
 * number of mappings have been changed. */
void guest_pagetable_flush_user(struct lguest *lg)
{
	/* Drop the userspace part of the current page table. */
	flush_user_mappings(lg, lg->pgdidx);
}
/*:*/

/* We keep several page tables.  This is a simple routine to find the page
 * table (if any) corresponding to this top-level address the Guest has given
 * us. */
static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
	unsigned int i;
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].gpgdir == pgtable)
			break;
	return i;
}

/*H:435 And this is us, creating the new page directory.  If we really do
 * allocate a new one (and so the kernel parts are not there), we set
 * blank_pgdir. */
static unsigned int new_pgdir(struct lguest *lg,
			      unsigned long gpgdir,
			      int *blank_pgdir)
{
	unsigned int next;

	/* We pick one entry at random to throw out.  Choosing the Least
	 * Recently Used might be better, but this is easy. */
	next = random32() % ARRAY_SIZE(lg->pgdirs);
	/* If it's never been allocated at all before, try now. */
	if (!lg->pgdirs[next].pgdir) {
		lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
		/* If the allocation fails, just keep using the one we have */
		if (!lg->pgdirs[next].pgdir)
			next = lg->pgdidx;
		else
			/* This is a blank page, so there are no kernel
			 * mappings: caller must map the stack! */
			*blank_pgdir = 1;
	}
	/* Record which Guest toplevel this shadows. */
	lg->pgdirs[next].gpgdir = gpgdir;
	/* Release all the non-kernel mappings. */
	flush_user_mappings(lg, next);

	return next;
}

/*H:430 (iv) Switching page tables
 *
 * This is what happens when the Guest changes page tables (ie. changes the
 * top-level pgdir).  This happens on almost every context switch. */
void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
{
	int newpgdir, repin = 0;

	/* Look to see if we have this one already. */
	newpgdir = find_pgdir(lg, pgtable);
	/* If not, we allocate or mug an existing one: if it's a fresh one,
	 * repin gets set to 1. */
	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
		newpgdir = new_pgdir(lg, pgtable, &repin);
	/* Change the current pgd index to the new one. */
	lg->pgdidx = newpgdir;
	/* If it was completely blank, we map in the Guest kernel stack */
	if (repin)
		pin_stack_pages(lg);
}

/*H:470 Finally, a routine which throws away everything: all PGD entries in all
 * the shadow page tables.  This is used when we destroy the Guest. */
static void release_all_pagetables(struct lguest *lg)
{
	unsigned int i, j;

	/* Every shadow pagetable this Guest has */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir)
			/* Every PGD entry except the Switcher at the top */
			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
				release_pgd(lg, lg->pgdirs[i].pgdir + j);
}

/* We also throw away everything when a Guest tells us it's changed a kernel
 * mapping.  Since kernel mappings are in every page table, it's easiest to
 * throw them all away.  This is amazingly slow, but thankfully rare. */
void guest_pagetable_clear_all(struct lguest *lg)
{
	release_all_pagetables(lg);
	/* We need the Guest kernel stack mapped again. */
	pin_stack_pages(lg);
}

/*H:420 This is the routine which actually sets the page table entry for the
 * "idx"'th shadow page table.
 *
 * Normally, we can just throw out the old entry and replace it with 0: if they
 * use it, demand_page() will put the new entry in.  We need to do this anyway:
 * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
 * is read from, and _PAGE_DIRTY when it's written to.
 *
 * But Avi Kivity pointed out that most Operating Systems (Linux included) set
 * these bits on PTEs immediately anyway.  This is done to save the CPU from
 * having to update them, but it helps us the same way: if they set
 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
 */
static void do_set_pte(struct lguest *lg, int idx,
		       unsigned long vaddr, pte_t gpte)
{
	/* Look up the matching shadow page directory entry. */
	pgd_t *spgd = spgd_addr(lg, idx, vaddr);

	/* If the top level isn't present, there's no entry to update. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		/* Otherwise, we start by releasing the existing entry. */
		pte_t *spte = spte_addr(lg, *spgd, vaddr);
		release_pte(*spte);

		/* If they're setting this entry as dirty or accessed, we might
		 * as well put that entry they've given us in now.  This shaves
		 * 10% off a copy-on-write micro-benchmark. */
		if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
			check_gpte(lg, gpte);
			*spte = gpte_to_spte(lg, gpte,
					     pte_flags(gpte) & _PAGE_DIRTY);
		} else
			/* Otherwise we can demand_page() it in later. */
			*spte = __pte(0);
	}
}

/*H:410 Updating a PTE entry is a little trickier.
 *
 * We keep track of several different page tables (the Guest uses one for each
 * process, so it makes sense to cache at least a few).  Each of these has
 * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
 * all processes.  So when the page table above that address changes, we update
 * all the page tables, not just the current one.  This is rare.
 *
 * The benefit is that when we have to track a new page table, we can keep all
 * the kernel mappings.  This speeds up context switches immensely. */
void guest_set_pte(struct lguest *lg,
		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
	/* Kernel mappings must be changed on all top levels.  Slow, but
	 * doesn't happen often. */
	if (vaddr >= lg->page_offset) {
		unsigned int i;
		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
			if (lg->pgdirs[i].pgdir)
				do_set_pte(lg, i, vaddr, gpte);
	} else {
		/* Is this page table one we have a shadow for? */
		int pgdir = find_pgdir(lg, gpgdir);
		if (pgdir != ARRAY_SIZE(lg->pgdirs))
			/* If so, do the update. */
			do_set_pte(lg, pgdir, vaddr, gpte);
	}
}

/*H:400
 * (iii) Setting up a page table entry when the Guest tells us it has changed.
 *
 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
 * with the other side of page tables while we're here: what happens when the
 * Guest asks for a page table to be updated?
 *
 * We already saw that demand_page() will fill in the shadow page tables when
 * needed, so we can simply remove shadow page table entries whenever the Guest
 * tells us they've changed.  When the Guest tries to use the new entry it will
 * fault and demand_page() will fix it up.
 *
 * So with that in mind here's our code to update a (top-level) PGD entry:
 */
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
	int pgdir;

	/* The kernel seems to try to initialize this early on: we ignore its
	 * attempts to map over the Switcher. */
	if (idx >= SWITCHER_PGD_INDEX)
		return;

	/* If they're talking about a page table we have a shadow for... */
	pgdir = find_pgdir(lg, gpgdir);
	if (pgdir < ARRAY_SIZE(lg->pgdirs))
		/* ... throw it away. */
		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
}

/*H:500 (vii) Setting up the page tables initially.
 *
 * When a Guest is first created, the Launcher tells us where the toplevel of
 * its first page table is.  We set some things up here: */
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
{
	/* In flush_user_mappings() we loop from 0 to
	 * "pgd_index(lg->page_offset)".  This assumes it won't hit
	 * the Switcher mappings, so check that now. */
	if (pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
		return -EINVAL;
	/* We start on the first shadow page table, and give it a blank PGD
	 * page. */
	lg->pgdidx = 0;
	lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
	lg->pgdirs[lg->pgdidx].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
	if (!lg->pgdirs[lg->pgdidx].pgdir)
		return -ENOMEM;
	return 0;
}

/* When a Guest dies, our cleanup is fairly simple. */
void free_guest_pagetable(struct lguest *lg)
{
	unsigned int i;

	/* Throw away all page table pages. */
	release_all_pagetables(lg);
	/* Now free the top levels: free_page() can handle 0 just fine. */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		free_page((long)lg->pgdirs[i].pgdir);
}

/*H:480 (vi) Mapping the Switcher when the Guest is about to run.
 *
 * The Switcher and the two pages for this CPU need to be available to the
 * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
 * for each CPU already set up, we just need to hook them in. */
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
{
	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
	pgd_t switcher_pgd;
	pte_t regs_pte;

	/* Make the last PGD entry for this Guest point to the Switcher's PTE
	 * page for this CPU (with appropriate flags). */
	switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);

	lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

	/* We also change the Switcher PTE page.  When we're running the Guest,
	 * we want the Guest's "regs" page to appear where the first Switcher
	 * page for this CPU is.  This is an optimization: when the Switcher
	 * saves the Guest registers, it saves them into the first page of this
	 * CPU's "struct lguest_pages": if we make sure the Guest's register
	 * page is already mapped there, we don't have to copy them out
	 * again. */
	regs_pte = pfn_pte(__pa(lg->regs_page) >> PAGE_SHIFT,
			   __pgprot(_PAGE_KERNEL));
	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
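	/* (Added note: "pages" is the virtual address of this CPU's
	 * "struct lguest_pages" up in the Switcher mapping, so dividing by
	 * PAGE_SIZE and taking the remainder modulo PTRS_PER_PTE picks its
	 * slot in the Switcher's PTE page -- the same "pages + cpu*2" slot
	 * that populate_switcher_pte_page() fills in below.) */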
}
/*:*/

static void free_switcher_pte_pages(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		free_page((long)switcher_pte_page(i));
}

/*H:520 Setting up the Switcher PTE page for a given CPU is fairly easy, given
 * the CPU number and the "struct page"s for the Switcher code itself.
 *
 * Currently the Switcher is less than a page long, so "pages" is always 1. */
static __init void populate_switcher_pte_page(unsigned int cpu,
					      struct page *switcher_page[],
					      unsigned int pages)
{
	unsigned int i;
	pte_t *pte = switcher_pte_page(cpu);

	/* The first entries are easy: they map the Switcher code. */
	for (i = 0; i < pages; i++) {
		pte[i] = mk_pte(switcher_page[i],
				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
	}

	/* The only other thing we map is this CPU's pair of pages. */
	i = pages + cpu*2;

	/* First page (Guest registers) is writable from the Guest */
	pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));

	/* The second page contains the "struct lguest_ro_state", and is
	 * read-only. */
	pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
}

/*H:510 At boot or module load time, init_pagetables() allocates and populates
 * the Switcher PTE page for each CPU. */
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!switcher_pte_page(i)) {
			free_switcher_pte_pages();
			return -ENOMEM;
		}
		populate_switcher_pte_page(i, switcher_page, pages);
	}
	return 0;
}
/*:*/

/* Cleaning up simply involves freeing the PTE page for each CPU. */
void free_pagetables(void)
{
	free_switcher_pte_pages();
}