/*P:700 The pagetable code, on the other hand, still shows the scars of
 * previous encounters.  It's functional, and as neat as it can be in the
 * circumstances, but be wary, for these things are subtle and break easily.
 * The Guest provides a virtual to physical mapping, but we can neither trust
 * it nor use it: we verify and convert it here then point the CPU to the
 * converted Guest pages when running the Guest. :*/

/* Copyright (C) Rusty Russell IBM Corporation 2006.
 * GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/bootparam.h>
#include "lg.h"

/*M:008 We hold reference to pages, which prevents them from being swapped.
 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
 * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
 * could probably consider launching Guests as non-root. :*/
/*H:300
 * The Page Table Code
 *
 * We use two-level page tables for the Guest.  If you're not entirely
 * comfortable with virtual addresses, physical addresses and page tables then
 * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with
 * diagrams!).
 *
 * The Guest keeps page tables, but we maintain the actual ones here: these are
 * called "shadow" page tables.  Which is a very Guest-centric name: these are
 * the real page tables the CPU uses, although we keep them up to date to
 * reflect the Guest's.  (See what I mean about weird naming?  Since when do
 * shadows reflect anything?)
 *
 * Anyway, this is the most complicated part of the Host code.  There are seven
 * parts to this:
 *  (i) Looking up a page table entry when the Guest faults,
 *  (ii) Making sure the Guest stack is mapped,
 *  (iii) Setting up a page table entry when the Guest tells us one has changed,
 *  (iv) Switching page tables,
 *  (v) Flushing (throwing away) page tables,
 *  (vi) Mapping the Switcher when the Guest is about to run,
 *  (vii) Setting up the page tables initially.
 :*/


/* A page of PTEs maps 4MB (1024 entries covering 1024 pages).  The Switcher
 * sits in the topmost 4MB of the address space, so it gets a separate,
 * complete PTE page: the last top-level slot. */
#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)

/* Under PAE we also need a PMD index: the Switcher occupies the last 2MB,
 * i.e. the final pmd entry of the final pmd page. */
#ifdef CONFIG_X86_PAE
#define SWITCHER_PMD_INDEX 	(PTRS_PER_PMD - 1)
#define RESERVE_MEM 		2U
#define CHECK_GPGD_MASK		_PAGE_PRESENT
#else
#define RESERVE_MEM 		4U
#define CHECK_GPGD_MASK		_PAGE_TABLE
#endif

R
Rusty Russell 已提交
67 68 69
/* We actually need a separate PTE page for each CPU.  Remember that after the
 * Switcher code itself comes two pages for each CPU, and we don't want this
 * CPU's guest to see the pages of any other CPU. */
70
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
R
Rusty Russell 已提交
71 72
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)

R
Rusty Russell 已提交
73 74
/*H:320 The page table code is curly enough to need helper functions to keep it
 * clear and clean.
R
Rusty Russell 已提交
75
 *
76
 * There are two functions which return pointers to the shadow (aka "real")
R
Rusty Russell 已提交
77 78 79
 * page tables.
 *
 * spgd_addr() takes the virtual address and returns a pointer to the top-level
R
Rusty Russell 已提交
80 81
 * page directory entry (PGD) for that address.  Since we keep track of several
 * page tables, the "i" argument tells us which one we're interested in (it's
R
Rusty Russell 已提交
82
 * usually the current one). */
83
static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
R
Rusty Russell 已提交
84
{
85
	unsigned int index = pgd_index(vaddr);
R
Rusty Russell 已提交
86

M
Matias Zabaljauregui 已提交
87
#ifndef CONFIG_X86_PAE
R
Rusty Russell 已提交
88
	/* We kill any Guest trying to touch the Switcher addresses. */
R
Rusty Russell 已提交
89
	if (index >= SWITCHER_PGD_INDEX) {
90
		kill_guest(cpu, "attempt to access switcher pages");
R
Rusty Russell 已提交
91 92
		index = 0;
	}
M
Matias Zabaljauregui 已提交
93
#endif
R
Rusty Russell 已提交
94
	/* Return a pointer index'th pgd entry for the i'th page table. */
95
	return &cpu->lg->pgdirs[i].pgdir[index];
R
Rusty Russell 已提交
96 97
}

M
Matias Zabaljauregui 已提交
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
#ifdef CONFIG_X86_PAE
/* This routine takes the PGD entry given above, which contains the address of
 * the PMD page, and returns a pointer to the PMD entry covering the given
 * virtual address. */
static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
{
	unsigned int pmd_idx = pmd_index(vaddr);
	pmd_t *pmd_table;

	/* Touching the Switcher's PMD slot is fatal for the Guest; reset the
	 * index so we still return a sane pointer. */
	if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
					pmd_idx >= SWITCHER_PMD_INDEX) {
		kill_guest(cpu, "attempt to access switcher pages");
		pmd_idx = 0;
	}

	/* Callers must only hand us a present PGD entry. */
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
	pmd_table = __va(pgd_pfn(spgd) << PAGE_SHIFT);

	return &pmd_table[pmd_idx];
}
#endif

R
Rusty Russell 已提交
122 123 124
/* This routine then takes the page directory entry returned above, which
 * contains the address of the page table entry (PTE) page.  It then returns a
 * pointer to the PTE entry for the given address. */
M
Matias Zabaljauregui 已提交
125
static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
R
Rusty Russell 已提交
126
{
M
Matias Zabaljauregui 已提交
127 128 129 130 131 132 133
#ifdef CONFIG_X86_PAE
	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);

	/* You should never call this if the PMD entry wasn't valid */
	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
#else
134
	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
R
Rusty Russell 已提交
135
	/* You should never call this if the PGD entry wasn't valid */
136
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
M
Matias Zabaljauregui 已提交
137 138
#endif

139
	return &page[pte_index(vaddr)];
R
Rusty Russell 已提交
140 141
}

R
Rusty Russell 已提交
142 143
/* These two functions just like the above two, except they access the Guest
 * page tables.  Hence they return a Guest address. */
144
static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
R
Rusty Russell 已提交
145
{
146
	unsigned int index = vaddr >> (PGDIR_SHIFT);
147
	return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
R
Rusty Russell 已提交
148 149
}

M
Matias Zabaljauregui 已提交
150 151
#ifdef CONFIG_X86_PAE
static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
R
Rusty Russell 已提交
152
{
153 154
	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
M
Matias Zabaljauregui 已提交
155 156 157 158
	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
}

static unsigned long gpte_addr(struct lg_cpu *cpu,
R
Rusty Russell 已提交
159
			       pmd_t gpmd, unsigned long vaddr)
M
Matias Zabaljauregui 已提交
160
{
R
Rusty Russell 已提交
161
	unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
M
Matias Zabaljauregui 已提交
162 163

	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
R
Rusty Russell 已提交
164 165
	return gpage + pte_index(vaddr) * sizeof(pte_t);
}
M
Matias Zabaljauregui 已提交
166
#else
R
Rusty Russell 已提交
167 168 169 170 171 172
static unsigned long gpte_addr(struct lg_cpu *cpu,
				pgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;

	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
173
	return gpage + pte_index(vaddr) * sizeof(pte_t);
R
Rusty Russell 已提交
174
}
R
Rusty Russell 已提交
175
#endif
176 177
/*:*/

178 179
/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as
 * an optimization (ie. pre-faulting). :*/
R
Rusty Russell 已提交
180

R
Rusty Russell 已提交
181 182 183 184 185 186
/*H:350 This routine takes a page number given by the Guest and converts it to
 * an actual, physical page number.  It can fail for several reasons: the
 * virtual address might not be mapped by the Launcher, the write flag is set
 * and the page is read-only, or the write flag was set and the page was
 * shared so had to be copied, but we ran out of memory.
 *
187 188
 * This holds a reference to the page, so release_pte() is careful to put that
 * back. */
R
Rusty Russell 已提交
189 190 191
static unsigned long get_pfn(unsigned long virtpfn, int write)
{
	struct page *page;
192 193 194 195 196

	/* gup me one page at this address please! */
	if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1)
		return page_to_pfn(page);

R
Rusty Russell 已提交
197
	/* This value indicates failure. */
198
	return -1UL;
R
Rusty Russell 已提交
199 200
}

R
Rusty Russell 已提交
201 202 203 204
/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table
 * entry can be a little tricky.  The flags are (almost) the same, but the
 * Guest PTE contains a virtual page number: the CPU needs the real page
 * number. */
205
static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
R
Rusty Russell 已提交
206
{
207
	unsigned long pfn, base, flags;
R
Rusty Russell 已提交
208

R
Rusty Russell 已提交
209 210 211 212
	/* The Guest sets the global flag, because it thinks that it is using
	 * PGE.  We only told it to use PGE so it would tell us whether it was
	 * flushing a kernel mapping or a userspace mapping.  We don't actually
	 * use the global bit, so throw it away. */
213
	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
R
Rusty Russell 已提交
214

215
	/* The Guest's pages are offset inside the Launcher. */
216
	base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
217

R
Rusty Russell 已提交
218 219 220 221
	/* We need a temporary "unsigned long" variable to hold the answer from
	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
	 * fit in spte.pfn.  get_pfn() finds the real physical number of the
	 * page, given the virtual number. */
222
	pfn = get_pfn(base + pte_pfn(gpte), write);
R
Rusty Russell 已提交
223
	if (pfn == -1UL) {
224
		kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
R
Rusty Russell 已提交
225 226 227
		/* When we destroy the Guest, we'll go through the shadow page
		 * tables and release_pte() them.  Make sure we don't think
		 * this one is valid! */
228
		flags = 0;
R
Rusty Russell 已提交
229
	}
230 231
	/* Now we assemble our shadow PTE from the page number and flags. */
	return pfn_pte(pfn, __pgprot(flags));
R
Rusty Russell 已提交
232 233
}

R
Rusty Russell 已提交
234
/*H:460 And to complete the chain, release_pte() looks like this: */
235
static void release_pte(pte_t pte)
R
Rusty Russell 已提交
236
{
237
	/* Remember that get_user_pages_fast() took a reference to the page, in
R
Rusty Russell 已提交
238
	 * get_pfn()?  We have to put it back now. */
239
	if (pte_flags(pte) & _PAGE_PRESENT)
240
		put_page(pte_page(pte));
R
Rusty Russell 已提交
241
}
R
Rusty Russell 已提交
242
/*:*/
R
Rusty Russell 已提交
243

244
/* Sanity-check a Guest PTE: no huge-page flag, and the page number must stay
 * below pfn_limit (so the Guest can't map the Launcher binary). */
static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
	if ((pte_flags(gpte) & _PAGE_PSE) ||
	    pte_pfn(gpte) >= cpu->lg->pfn_limit)
		kill_guest(cpu, "bad page table entry");
}

251
/* Sanity-check a Guest PGD: only flags allowed by CHECK_GPGD_MASK, and the
 * page number must stay below pfn_limit. */
static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
	   (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
		kill_guest(cpu, "bad page directory entry");
}

M
Matias Zabaljauregui 已提交
258 259 260 261 262 263 264 265 266
#ifdef CONFIG_X86_PAE
/* Sanity-check a Guest PMD: only _PAGE_TABLE flags, and the page number must
 * stay below pfn_limit. */
static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{
	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
	   (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
		kill_guest(cpu, "bad page middle directory entry");
}
#endif

R
Rusty Russell 已提交
267
/*H:330
R
Rusty Russell 已提交
268
 * (i) Looking up a page table entry when the Guest faults.
R
Rusty Russell 已提交
269 270 271 272 273 274 275
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here.  That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
R
Rusty Russell 已提交
276
 * true.  Otherwise, it was a real fault and we need to tell the Guest. */
277
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
R
Rusty Russell 已提交
278
{
279 280
	pgd_t gpgd;
	pgd_t *spgd;
R
Rusty Russell 已提交
281
	unsigned long gpte_ptr;
282 283
	pte_t gpte;
	pte_t *spte;
R
Rusty Russell 已提交
284

M
Matias Zabaljauregui 已提交
285 286 287 288 289
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
	pmd_t gpmd;
#endif

R
Rusty Russell 已提交
290
	/* First step: get the top-level Guest page table entry. */
291
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
R
Rusty Russell 已提交
292
	/* Toplevel not present?  We can't map it in. */
293
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
294
		return false;
R
Rusty Russell 已提交
295

R
Rusty Russell 已提交
296
	/* Now look at the matching shadow entry. */
297
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
298
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
R
Rusty Russell 已提交
299
		/* No shadow entry: allocate a new shadow PTE page. */
R
Rusty Russell 已提交
300
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
R
Rusty Russell 已提交
301 302
		/* This is not really the Guest's fault, but killing it is
		 * simple for this corner case. */
R
Rusty Russell 已提交
303
		if (!ptepage) {
304
			kill_guest(cpu, "out of memory allocating pte page");
305
			return false;
R
Rusty Russell 已提交
306
		}
R
Rusty Russell 已提交
307
		/* We check that the Guest pgd is OK. */
308
		check_gpgd(cpu, gpgd);
R
Rusty Russell 已提交
309 310
		/* And we copy the flags to the shadow PGD entry.  The page
		 * number in the shadow PGD is the page we just allocated. */
M
Matias Zabaljauregui 已提交
311
		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
R
Rusty Russell 已提交
312 313
	}

M
Matias Zabaljauregui 已提交
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
#ifdef CONFIG_X86_PAE
	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	/* middle level not present?  We can't map it in. */
	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
		return false;

	/* Now look at the matching shadow entry. */
	spmd = spmd_addr(cpu, *spgd, vaddr);

	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);

		/* This is not really the Guest's fault, but killing it is
		* simple for this corner case. */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}

		/* We check that the Guest pmd is OK. */
		check_gpmd(cpu, gpmd);

		/* And we copy the flags to the shadow PMD entry.  The page
		 * number in the shadow PMD is the page we just allocated. */
		native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
	}
R
Rusty Russell 已提交
341 342 343 344 345

	/* OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later. */
	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
#else
R
Rusty Russell 已提交
346 347
	/* OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later. */
M
Matias Zabaljauregui 已提交
348
	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
R
Rusty Russell 已提交
349
#endif
350
	gpte = lgread(cpu, gpte_ptr, pte_t);
R
Rusty Russell 已提交
351

R
Rusty Russell 已提交
352
	/* If this page isn't in the Guest page tables, we can't page it in. */
353
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
354
		return false;
R
Rusty Russell 已提交
355

R
Rusty Russell 已提交
356 357
	/* Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write). */
358
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
359
		return false;
R
Rusty Russell 已提交
360

R
Rusty Russell 已提交
361
	/* User access to a kernel-only page? (bit 3 == user access) */
362
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
363
		return false;
R
Rusty Russell 已提交
364

R
Rusty Russell 已提交
365 366
	/* Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary). */
367
	check_gpte(cpu, gpte);
R
Rusty Russell 已提交
368

R
Rusty Russell 已提交
369
	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
370
	gpte = pte_mkyoung(gpte);
R
Rusty Russell 已提交
371
	if (errcode & 2)
372
		gpte = pte_mkdirty(gpte);
R
Rusty Russell 已提交
373

R
Rusty Russell 已提交
374
	/* Get the pointer to the shadow PTE entry we're going to set. */
M
Matias Zabaljauregui 已提交
375
	spte = spte_addr(cpu, *spgd, vaddr);
R
Rusty Russell 已提交
376 377
	/* If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry. */
R
Rusty Russell 已提交
378 379
	release_pte(*spte);

R
Rusty Russell 已提交
380 381
	/* If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()). */
382
	if (pte_dirty(gpte))
383
		*spte = gpte_to_spte(cpu, gpte, 1);
384
	else
R
Rusty Russell 已提交
385 386
		/* If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
R
Rusty Russell 已提交
387 388
		 * we will come back here when a write does actually occur, so
		 * we can update the Guest's _PAGE_DIRTY flag. */
389
		native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
R
Rusty Russell 已提交
390

R
Rusty Russell 已提交
391 392
	/* Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
393
	lgwrite(cpu, gpte_ptr, pte_t, gpte);
R
Rusty Russell 已提交
394

R
Rusty Russell 已提交
395 396 397 398
	/* The fault is fixed, the page table is populated, the mapping
	 * manipulated, the result returned and the code complete.  A small
	 * delay and a trace of alliteration are the only indications the Guest
	 * has that a page fault occurred at all. */
399
	return true;
R
Rusty Russell 已提交
400 401
}

R
Rusty Russell 已提交
402 403
/*H:360
 * (ii) Making sure the Guest stack is mapped.
R
Rusty Russell 已提交
404
 *
R
Rusty Russell 已提交
405 406 407 408
 * Remember that direct traps into the Guest need a mapped Guest kernel stack.
 * pin_stack_pages() calls us here: we could simply call demand_page(), but as
 * we've seen that logic is quite long, and usually the stack pages are already
 * mapped, so it's overkill.
R
Rusty Russell 已提交
409 410 411
 *
 * This is a quick version which answers the question: is this virtual address
 * mapped by the shadow page tables, and is it writable? */
412
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
R
Rusty Russell 已提交
413
{
414
	pgd_t *spgd;
R
Rusty Russell 已提交
415 416
	unsigned long flags;

M
Matias Zabaljauregui 已提交
417 418 419
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
#endif
R
Rusty Russell 已提交
420
	/* Look at the current top level entry: is it present? */
421
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
422
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
423
		return false;
R
Rusty Russell 已提交
424

M
Matias Zabaljauregui 已提交
425 426 427 428 429 430
#ifdef CONFIG_X86_PAE
	spmd = spmd_addr(cpu, *spgd, vaddr);
	if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
		return false;
#endif

R
Rusty Russell 已提交
431 432
	/* Check the flags on the pte entry itself: it must be present and
	 * writable. */
M
Matias Zabaljauregui 已提交
433
	flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
434

R
Rusty Russell 已提交
435 436 437
	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}

R
Rusty Russell 已提交
438 439 440
/* So, when pin_stack_pages() asks us to pin a page, we check if it's already
 * in the page tables, and if not, we call demand_page() with error code 2
 * (meaning "write"). */
void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
{
	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
		kill_guest(cpu, "bad stack page %#lx", vaddr);
}

M
Matias Zabaljauregui 已提交
447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481
#ifdef CONFIG_X86_PAE
static void release_pmd(pmd_t *spmd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
		unsigned int i;
		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
		/* For each entry in the page, we might need to release it. */
		for (i = 0; i < PTRS_PER_PTE; i++)
			release_pte(ptepage[i]);
		/* Now we can free the page of PTEs */
		free_page((long)ptepage);
		/* And zero out the PMD entry so we never release it twice. */
		native_set_pmd(spmd, __pmd(0));
	}
}

static void release_pgd(pgd_t *spgd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		unsigned int i;
		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

		for (i = 0; i < PTRS_PER_PMD; i++)
			release_pmd(&pmdpage[i]);

		/* Now we can free the page of PMDs */
		free_page((long)pmdpage);
		/* And zero out the PGD entry so we never release it twice. */
		set_pgd(spgd, __pgd(0));
	}
}

#else /* !CONFIG_X86_PAE */
R
Rusty Russell 已提交
482
/*H:450 If we chase down the release_pgd() code, it looks like this: */
483
static void release_pgd(pgd_t *spgd)
R
Rusty Russell 已提交
484
{
R
Rusty Russell 已提交
485
	/* If the entry's not present, there's nothing to release. */
486
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
R
Rusty Russell 已提交
487
		unsigned int i;
R
Rusty Russell 已提交
488 489 490
		/* Converting the pfn to find the actual PTE page is easy: turn
		 * the page number into a physical address, then convert to a
		 * virtual address (easy for kernel pages like this one). */
491
		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
R
Rusty Russell 已提交
492
		/* For each entry in the page, we might need to release it. */
493
		for (i = 0; i < PTRS_PER_PTE; i++)
R
Rusty Russell 已提交
494
			release_pte(ptepage[i]);
R
Rusty Russell 已提交
495
		/* Now we can free the page of PTEs */
R
Rusty Russell 已提交
496
		free_page((long)ptepage);
R
Rusty Russell 已提交
497
		/* And zero out the PGD entry so we never release it twice. */
498
		*spgd = __pgd(0);
R
Rusty Russell 已提交
499 500
	}
}
M
Matias Zabaljauregui 已提交
501
#endif
R
Rusty Russell 已提交
502 503 504
/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
 * It simply releases every PTE page from 0 up to the Guest's kernel address. */
R
Rusty Russell 已提交
505 506 507
static void flush_user_mappings(struct lguest *lg, int idx)
{
	unsigned int i;
R
Rusty Russell 已提交
508
	/* Release every pgd entry up to the kernel's address. */
509
	for (i = 0; i < pgd_index(lg->kernel_address); i++)
510
		release_pgd(lg->pgdirs[idx].pgdir + i);
R
Rusty Russell 已提交
511 512
}

R
Rusty Russell 已提交
513 514 515 516
/*H:440 (v) Flushing (throwing away) page tables,
 *
 * The Guest has a hypercall to throw away the page tables: it's used when a
 * large number of mappings have been changed. */
517
void guest_pagetable_flush_user(struct lg_cpu *cpu)
R
Rusty Russell 已提交
518
{
R
Rusty Russell 已提交
519
	/* Drop the userspace part of the current page table. */
520
	flush_user_mappings(cpu->lg, cpu->cpu_pgd);
R
Rusty Russell 已提交
521
}
R
Rusty Russell 已提交
522
/*:*/
R
Rusty Russell 已提交
523

524
/* We walk down the guest page tables to get a guest-physical address */
525
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
526 527 528
{
	pgd_t gpgd;
	pte_t gpte;
M
Matias Zabaljauregui 已提交
529 530 531
#ifdef CONFIG_X86_PAE
	pmd_t gpmd;
#endif
532
	/* First step: get the top-level Guest page table entry. */
533
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
534
	/* Toplevel not present?  We can't map it in. */
535
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
536
		kill_guest(cpu, "Bad address %#lx", vaddr);
537 538
		return -1UL;
	}
539

M
Matias Zabaljauregui 已提交
540 541 542 543
#ifdef CONFIG_X86_PAE
	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
		kill_guest(cpu, "Bad address %#lx", vaddr);
R
Rusty Russell 已提交
544 545
	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
M
Matias Zabaljauregui 已提交
546
	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
R
Rusty Russell 已提交
547
#endif
548
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
549
		kill_guest(cpu, "Bad address %#lx", vaddr);
550 551 552 553

	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}

R
Rusty Russell 已提交
554 555 556
/* We keep several page tables.  This is a simple routine to find the page
 * table (if any) corresponding to this top-level address the Guest has given
 * us. */
R
Rusty Russell 已提交
557 558 559 560
static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
	unsigned int i;
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
561
		if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
R
Rusty Russell 已提交
562 563 564 565
			break;
	return i;
}

R
Rusty Russell 已提交
566 567 568
/*H:435 And this is us, creating the new page directory.  If we really do
 * allocate a new one (and so the kernel parts are not there), we set
 * blank_pgdir. */
569
static unsigned int new_pgdir(struct lg_cpu *cpu,
570
			      unsigned long gpgdir,
R
Rusty Russell 已提交
571 572 573
			      int *blank_pgdir)
{
	unsigned int next;
M
Matias Zabaljauregui 已提交
574 575 576
#ifdef CONFIG_X86_PAE
	pmd_t *pmd_table;
#endif
R
Rusty Russell 已提交
577

R
Rusty Russell 已提交
578 579
	/* We pick one entry at random to throw out.  Choosing the Least
	 * Recently Used might be better, but this is easy. */
580
	next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
R
Rusty Russell 已提交
581
	/* If it's never been allocated at all before, try now. */
582 583 584
	if (!cpu->lg->pgdirs[next].pgdir) {
		cpu->lg->pgdirs[next].pgdir =
					(pgd_t *)get_zeroed_page(GFP_KERNEL);
R
Rusty Russell 已提交
585
		/* If the allocation fails, just keep using the one we have */
586
		if (!cpu->lg->pgdirs[next].pgdir)
587
			next = cpu->cpu_pgd;
M
Matias Zabaljauregui 已提交
588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605
		else {
#ifdef CONFIG_X86_PAE
			/* In PAE mode, allocate a pmd page and populate the
			 * last pgd entry. */
			pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
			if (!pmd_table) {
				free_page((long)cpu->lg->pgdirs[next].pgdir);
				set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
				next = cpu->cpu_pgd;
			} else {
				set_pgd(cpu->lg->pgdirs[next].pgdir +
					SWITCHER_PGD_INDEX,
					__pgd(__pa(pmd_table) | _PAGE_PRESENT));
				/* This is a blank page, so there are no kernel
				 * mappings: caller must map the stack! */
				*blank_pgdir = 1;
			}
#else
R
Rusty Russell 已提交
606
			*blank_pgdir = 1;
M
Matias Zabaljauregui 已提交
607 608
#endif
		}
R
Rusty Russell 已提交
609
	}
R
Rusty Russell 已提交
610
	/* Record which Guest toplevel this shadows. */
611
	cpu->lg->pgdirs[next].gpgdir = gpgdir;
R
Rusty Russell 已提交
612
	/* Release all the non-kernel mappings. */
613
	flush_user_mappings(cpu->lg, next);
R
Rusty Russell 已提交
614 615 616 617

	return next;
}

R
Rusty Russell 已提交
618 619
/*H:430 (iv) Switching page tables
 *
620
 * Now we've seen all the page table setting and manipulation, let's see
R
Rusty Russell 已提交
621 622
 * what happens when the Guest changes page tables (ie. changes the top-level
 * pgdir).  This occurs on almost every context switch. */
623
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
R
Rusty Russell 已提交
624 625 626
{
	int newpgdir, repin = 0;

R
Rusty Russell 已提交
627
	/* Look to see if we have this one already. */
628
	newpgdir = find_pgdir(cpu->lg, pgtable);
R
Rusty Russell 已提交
629 630
	/* If not, we allocate or mug an existing one: if it's a fresh one,
	 * repin gets set to 1. */
631
	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
632
		newpgdir = new_pgdir(cpu, pgtable, &repin);
R
Rusty Russell 已提交
633
	/* Change the current pgd index to the new one. */
634
	cpu->cpu_pgd = newpgdir;
R
Rusty Russell 已提交
635
	/* If it was completely blank, we map in the Guest kernel stack */
R
Rusty Russell 已提交
636
	if (repin)
637
		pin_stack_pages(cpu);
R
Rusty Russell 已提交
638 639
}

R
Rusty Russell 已提交
640
/*H:470 Finally, a routine which throws away everything: all PGD entries in all
R
Rusty Russell 已提交
641 642
 * the shadow page tables, including the Guest's kernel mappings.  This is used
 * when we destroy the Guest. */
R
Rusty Russell 已提交
643 644 645 646
static void release_all_pagetables(struct lguest *lg)
{
	unsigned int i, j;

R
Rusty Russell 已提交
647
	/* Every shadow pagetable this Guest has */
R
Rusty Russell 已提交
648
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
M
Matias Zabaljauregui 已提交
649 650 651 652 653 654 655 656 657 658 659 660 661 662 663
		if (lg->pgdirs[i].pgdir) {
#ifdef CONFIG_X86_PAE
			pgd_t *spgd;
			pmd_t *pmdpage;
			unsigned int k;

			/* Get the last pmd page. */
			spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
			pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

			/* And release the pmd entries of that pmd page,
			 * except for the switcher pmd. */
			for (k = 0; k < SWITCHER_PMD_INDEX; k++)
				release_pmd(&pmdpage[k]);
#endif
R
Rusty Russell 已提交
664
			/* Every PGD entry except the Switcher at the top */
R
Rusty Russell 已提交
665
			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
666
				release_pgd(lg->pgdirs[i].pgdir + j);
M
Matias Zabaljauregui 已提交
667
		}
R
Rusty Russell 已提交
668 669
}

R
Rusty Russell 已提交
670 671
/* We also throw away everything when a Guest tells us it's changed a kernel
 * mapping.  Since kernel mappings are in every page table, it's easiest to
R
Rusty Russell 已提交
672 673
 * throw them all away.  This traps the Guest in amber for a while as
 * everything faults back in, but it's rare. */
674
void guest_pagetable_clear_all(struct lg_cpu *cpu)
R
Rusty Russell 已提交
675
{
676
	release_all_pagetables(cpu->lg);
R
Rusty Russell 已提交
677
	/* We need the Guest kernel stack mapped again. */
678
	pin_stack_pages(cpu);
R
Rusty Russell 已提交
679
}
R
Rusty Russell 已提交
680 681 682 683 684 685 686 687
/*:*/
/*M:009 Since we throw away all mappings when a kernel mapping changes, our
 * performance sucks for guests using highmem.  In fact, a guest with
 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
 * usually slower than a Guest with less memory.
 *
 * This, of course, cannot be fixed.  It would take some kind of... well, I
 * don't know, but the term "puissant code-fu" comes to mind. :*/
R
Rusty Russell 已提交
688

R
Rusty Russell 已提交
689 690 691 692 693 694 695 696 697 698 699 700 701 702
/*H:420 This is the routine which actually sets the page table entry for then
 * "idx"'th shadow page table.
 *
 * Normally, we can just throw out the old entry and replace it with 0: if they
 * use it demand_page() will put the new entry in.  We need to do this anyway:
 * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
 * is read from, and _PAGE_DIRTY when it's written to.
 *
 * But Avi Kivity pointed out that most Operating Systems (Linux included) set
 * these bits on PTEs immediately anyway.  This is done to save the CPU from
 * having to update them, but it helps us the same way: if they set
 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
 */
703
static void do_set_pte(struct lg_cpu *cpu, int idx,
		       unsigned long vaddr, pte_t gpte)
{
	/* Look up the matching shadow page directory entry. */
	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
#endif

	/* If the top level isn't present, there's no entry to update:
	 * demand_page() will build the whole chain when it's next used. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
#ifdef CONFIG_X86_PAE
		/* With PAE there's a middle (PMD) level to check as well. */
		spmd = spmd_addr(cpu, *spgd, vaddr);
		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
#endif
			/* Otherwise, we start by releasing
			 * the existing entry. */
			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
			release_pte(*spte);

			/* If they're setting this entry as dirty or accessed,
			 * we might as well put that entry they've given us
			 * in now.  This shaves 10% off a
			 * copy-on-write micro-benchmark. */
			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
				/* Validate the Guest's PTE before trusting it. */
				check_gpte(cpu, gpte);
				native_set_pte(spte,
						gpte_to_spte(cpu, gpte,
						pte_flags(gpte) & _PAGE_DIRTY));
			} else
				/* Otherwise kill it and we can demand_page()
				 * it in later. */
				native_set_pte(spte, __pte(0));
#ifdef CONFIG_X86_PAE
		}
#endif
	}
}

R
Rusty Russell 已提交
742 743 744 745 746 747 748 749
/*H:410 Updating a PTE entry is a little trickier.
 *
 * We keep track of several different page tables (the Guest uses one for each
 * process, so it makes sense to cache at least a few).  Each of these have
 * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
 * all processes.  So when the page table above that address changes, we update
 * all the page tables, not just the current one.  This is rare.
 *
750 751
 * The benefit is that when we have to track a new page table, we can keep all
 * the kernel mappings.  This speeds up context switch immensely. */
752
void guest_set_pte(struct lg_cpu *cpu,
		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
	if (vaddr < cpu->lg->kernel_address) {
		/* A userspace mapping: only the shadow of this particular
		 * Guest page table (if we're tracking it) needs the update. */
		int pgdir = find_pgdir(cpu->lg, gpgdir);

		if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
			do_set_pte(cpu, pgdir, vaddr, gpte);
	} else {
		/* A kernel mapping: it appears in every shadow page table,
		 * so update them all.  Slow, but this doesn't happen often. */
		unsigned int i;

		for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
			if (cpu->lg->pgdirs[i].pgdir)
				do_set_pte(cpu, i, vaddr, gpte);
	}
}

R
Rusty Russell 已提交
771
/*H:400
R
Rusty Russell 已提交
772
 * (iii) Setting up a page table entry when the Guest tells us one has changed.
R
Rusty Russell 已提交
773 774 775 776 777 778 779 780 781 782 783 784
 *
 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
 * with the other side of page tables while we're here: what happens when the
 * Guest asks for a page table to be updated?
 *
 * We already saw that demand_page() will fill in the shadow page tables when
 * needed, so we can simply remove shadow page table entries whenever the Guest
 * tells us they've changed.  When the Guest tries to use the new entry it will
 * fault and demand_page() will fix it up.
 *
 * So with that in mind here's our code to update a (top-level) PGD entry:
 */
785
void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
	int shadow;

	/* The entries at and above SWITCHER_PGD_INDEX belong to the
	 * Switcher, not the Guest: never let the Guest touch them. */
	if (idx < SWITCHER_PGD_INDEX) {
		/* If we keep a shadow of this Guest page table, throw the
		 * stale entry away; demand_page() rebuilds it on next use. */
		shadow = find_pgdir(lg, gpgdir);
		if (shadow < ARRAY_SIZE(lg->pgdirs))
			release_pgd(lg->pgdirs[shadow].pgdir + idx);
	}
}
M
Matias Zabaljauregui 已提交
798 799 800 801 802 803
#ifdef CONFIG_X86_PAE
/* Under PAE we don't track individual Guest PMD pages: when the Guest changes
 * a PMD we simply throw away every shadow page table (which also re-pins the
 * Guest's kernel stack).  Brutal, but simple and correct.
 * NOTE(review): only cpus[0] is passed here — presumably lguest Guests are
 * uniprocessor; confirm before relying on this for multiple Guest CPUs. */
void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
{
	guest_pagetable_clear_all(&lg->cpus[0]);
}
#endif
R
Rusty Russell 已提交
804

805 806 807 808 809 810 811 812 813 814 815 816 817
/* Once we know how much memory we have we can construct simple identity
 * (which set virtual == physical) and linear mappings
 * which will get the Guest far enough into the boot to create its own.
 *
 * We lay them out of the way, just below the initrd (which is why we need to
 * know its size here).
 *
 * Returns the Guest-physical address of the new top-level page directory,
 * or -EFAULT (as an unsigned long) if writing into Guest memory fails. */
static unsigned long setup_pagetables(struct lguest *lg,
				      unsigned long mem,
				      unsigned long initrd_size)
{
	pgd_t __user *pgdir;
	pte_t __user *linear;
	unsigned long mem_base = (unsigned long)lg->mem_base;
	unsigned int mapped_pages, i, linear_pages;
#ifdef CONFIG_X86_PAE
	pmd_t __user *pmds;
	unsigned int j;
	pgd_t pgd;
	pmd_t pmd;
#else
	unsigned int phys_linear;
#endif

	/* We have mapped_pages frames to map, so we need
	 * linear_pages page tables to map them. */
	mapped_pages = mem / PAGE_SIZE;
	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;

	/* We put the toplevel page directory page at the top of memory.
	 * Note: pgdir and linear are Host-virtual pointers into the Guest's
	 * memory region (mem_base is the Host address of Guest address 0). */
	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);

	/* Now we use the next linear_pages pages as pte pages */
	linear = (void *)pgdir - linear_pages * PAGE_SIZE;

#ifdef CONFIG_X86_PAE
	/* PAE needs a middle (PMD) page as well: place it below the PTEs. */
	pmds = (void *)linear - PAGE_SIZE;
#endif
	/* Linear mapping is easy: put every page's address into the
	 * mapping in order. */
	for (i = 0; i < mapped_pages; i++) {
		pte_t pte;
		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
			return -EFAULT;
	}

	/* The top level points to the linear page table pages above.
	 * We setup the identity and linear mappings here. */
#ifdef CONFIG_X86_PAE
	/* Fill the PMD with pointers to the linear PTE pages (addresses are
	 * converted back to Guest-physical by subtracting mem_base). */
	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
	     i += PTRS_PER_PTE, j++) {
		native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
		- mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));

		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
			return -EFAULT;
	}

	/* Same PMD serves both the identity map (PGD slot 0) and the linear
	 * map at PAGE_OFFSET 0xC0000000, which is PGD slot 3 under PAE. */
	set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
		return -EFAULT;
	if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
		return -EFAULT;
#else
	phys_linear = (unsigned long)linear - mem_base;
	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
		pgd_t pgd;
		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));

		/* Each PTE page is entered twice: once for the identity
		 * mapping at 0, once for the linear map at PAGE_OFFSET. */
		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
					   + i / PTRS_PER_PTE],
				    &pgd, sizeof(pgd)))
			return -EFAULT;
	}
#endif

	/* We return the top level (guest-physical) address: remember where
	 * this is. */
	return (unsigned long)pgdir - mem_base;
}

R
Rusty Russell 已提交
888 889 890 891
/*H:500 (vii) Setting up the page tables initially.
 *
 * When a Guest is first created, the Launcher tells us where the toplevel of
 * its first page table is.  We set some things up here: */
892
int init_guest_pagetable(struct lguest *lg)
R
Rusty Russell 已提交
893
{
894 895 896
	u64 mem;
	u32 initrd_size;
	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
M
Matias Zabaljauregui 已提交
897 898 899 900
#ifdef CONFIG_X86_PAE
	pgd_t *pgd;
	pmd_t *pmd_table;
#endif
901 902 903 904 905 906
	/* Get the Guest memory size and the ramdisk size from the boot header
	 * located at lg->mem_base (Guest address 0). */
	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
	    || get_user(initrd_size, &boot->hdr.ramdisk_size))
		return -EFAULT;

R
Rusty Russell 已提交
907 908
	/* We start on the first shadow page table, and give it a blank PGD
	 * page. */
909 910 911
	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
		return lg->pgdirs[0].gpgdir;
912 913
	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
	if (!lg->pgdirs[0].pgdir)
R
Rusty Russell 已提交
914
		return -ENOMEM;
M
Matias Zabaljauregui 已提交
915 916 917 918 919 920 921 922 923
#ifdef CONFIG_X86_PAE
	pgd = lg->pgdirs[0].pgdir;
	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
	if (!pmd_table)
		return -ENOMEM;

	set_pgd(pgd + SWITCHER_PGD_INDEX,
		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
#endif
924
	lg->cpus[0].cpu_pgd = 0;
R
Rusty Russell 已提交
925 926 927
	return 0;
}

928
/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu)
{
	/* We get the kernel address: above this is all kernel memory.
	 * If any of these Guest-memory accesses fault, the Guest's
	 * lguest_data page is bogus and the Guest must die. */
	if (get_user(cpu->lg->kernel_address,
		&cpu->lg->lguest_data->kernel_address)
		/* We tell the Guest that it can't use the top 2 or 4 MB
		 * of virtual addresses used by the Switcher. */
		|| put_user(RESERVE_MEM * 1024 * 1024,
			&cpu->lg->lguest_data->reserve_mem)
		|| put_user(cpu->lg->pgdirs[0].gpgdir,
			&cpu->lg->lguest_data->pgdir))
		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);

	/* In flush_user_mappings() we loop from 0 to
	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
	 * Switcher mappings, so check that now. */
#ifdef CONFIG_X86_PAE
	if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
		pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
#else
	if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
#endif
		kill_guest(cpu, "bad kernel address %#lx",
				 cpu->lg->kernel_address);
}

R
Rusty Russell 已提交
955
/* When a Guest dies, our cleanup is fairly simple. */
R
Rusty Russell 已提交
956 957 958 959
void free_guest_pagetable(struct lguest *lg)
{
	unsigned int idx;

	/* Drop every shadow mapping first... */
	release_all_pagetables(lg);

	/* ...then the top-level pages themselves.  free_page() treats a
	 * zero (never-allocated) entry as a no-op, so no check is needed. */
	for (idx = 0; idx < ARRAY_SIZE(lg->pgdirs); idx++)
		free_page((long)lg->pgdirs[idx].pgdir);
}

R
Rusty Russell 已提交
967 968
/*H:480 (vi) Mapping the Switcher when the Guest is about to run.
 *
R
Rusty Russell 已提交
969
 * The Switcher and the two pages for this CPU need to be visible in the
R
Rusty Russell 已提交
970
 * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
R
Rusty Russell 已提交
971 972
 * for each CPU already set up, we just need to hook them in now we know which
 * Guest is about to run on this CPU. */
973
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
	/* This CPU's dedicated Switcher PTE page (a per-cpu variable). */
	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
	pte_t regs_pte;
	unsigned long pfn;

#ifdef CONFIG_X86_PAE
	pmd_t switcher_pmd;
	pmd_t *pmd_table;

	/* With PAE the Switcher hangs off a PMD: build an entry pointing at
	 * this CPU's Switcher PTE page... */
	native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
		       PAGE_SHIFT, PAGE_KERNEL_EXEC));

	/* ...and install it in the PMD page hanging off the shadow PGD's
	 * SWITCHER_PGD_INDEX slot (set up in init_guest_pagetable()). */
	pmd_table = __va(pgd_pfn(cpu->lg->
			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
								<< PAGE_SHIFT);
	native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
#else
	pgd_t switcher_pgd;

	/* Make the last PGD entry for this Guest point to the Switcher's PTE
	 * page for this CPU (with appropriate flags). */
	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);

	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

#endif
	/* We also change the Switcher PTE page.  When we're running the Guest,
	 * we want the Guest's "regs" page to appear where the first Switcher
	 * page for this CPU is.  This is an optimization: when the Switcher
	 * saves the Guest registers, it saves them into the first page of this
	 * CPU's "struct lguest_pages": if we make sure the Guest's register
	 * page is already mapped there, we don't have to copy them out
	 * again. */
	pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
	native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL));
	native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
			regs_pte);
}
R
Rusty Russell 已提交
1012
/*:*/
R
Rusty Russell 已提交
1013 1014 1015 1016 1017 1018 1019 1020 1021

static void free_switcher_pte_pages(void)
{
	unsigned int cpu_num;

	/* free_page() ignores a zero entry, so CPUs whose page was never
	 * allocated are harmless here. */
	for_each_possible_cpu(cpu_num)
		free_page((long)switcher_pte_page(cpu_num));
}

R
Rusty Russell 已提交
1022 1023 1024 1025
/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given
 * the CPU number and the "struct page"s for the Switcher code itself.
 *
 * Currently the Switcher is less than a page long, so "pages" is always 1. */
R
Rusty Russell 已提交
1026 1027 1028 1029 1030
static __init void populate_switcher_pte_page(unsigned int cpu,
					      struct page *switcher_page[],
					      unsigned int pages)
{
	unsigned int i;
1031
	pte_t *pte = switcher_pte_page(cpu);
R
Rusty Russell 已提交
1032

R
Rusty Russell 已提交
1033
	/* The first entries are easy: they map the Switcher code. */
R
Rusty Russell 已提交
1034
	for (i = 0; i < pages; i++) {
1035 1036
		native_set_pte(&pte[i], mk_pte(switcher_page[i],
				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
R
Rusty Russell 已提交
1037 1038
	}

R
Rusty Russell 已提交
1039
	/* The only other thing we map is this CPU's pair of pages. */
R
Rusty Russell 已提交
1040 1041
	i = pages + cpu*2;

R
Rusty Russell 已提交
1042
	/* First page (Guest registers) is writable from the Guest */
1043 1044
	native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
1045

R
Rusty Russell 已提交
1046 1047
	/* The second page contains the "struct lguest_ro_state", and is
	 * read-only. */
1048 1049
	native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
R
Rusty Russell 已提交
1050 1051
}

R
Rusty Russell 已提交
1052 1053 1054
/* We've made it through the page table code.  Perhaps our tired brains are
 * still processing the details, or perhaps we're simply glad it's over.
 *
1055 1056 1057 1058 1059
 * If nothing else, note that all this complexity in juggling shadow page tables
 * in sync with the Guest's page tables is for one reason: for most Guests this
 * page table dance determines how bad performance will be.  This is why Xen
 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
 * have implemented shadow page table support directly into hardware.
R
Rusty Russell 已提交
1060 1061 1062
 *
 * There is just one file remaining in the Host. */

R
Rusty Russell 已提交
1063 1064
/*H:510 At boot or module load time, init_pagetables() allocates and populates
 * the Switcher PTE page for each CPU. */
R
Rusty Russell 已提交
1065 1066 1067 1068 1069
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
	unsigned int cpu_num;

	for_each_possible_cpu(cpu_num) {
		/* One zeroed PTE page per possible CPU. */
		pte_t *page = (pte_t *)get_zeroed_page(GFP_KERNEL);

		if (!page) {
			/* Undo the CPUs we already set up before failing. */
			free_switcher_pte_pages();
			return -ENOMEM;
		}
		switcher_pte_page(cpu_num) = page;
		populate_switcher_pte_page(cpu_num, switcher_page, pages);
	}
	return 0;
}
R
Rusty Russell 已提交
1079
/*:*/
R
Rusty Russell 已提交
1080

R
Rusty Russell 已提交
1081
/* Cleaning up simply involves freeing the PTE page for each CPU. */
R
Rusty Russell 已提交
1082 1083 1084 1085
/* Module-teardown counterpart of init_pagetables(): just drop the per-CPU
 * Switcher PTE pages. */
void free_pagetables(void)
{
	free_switcher_pte_pages();
}