/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

struct rcu_table_freelist {
	struct rcu_head rcu;
	struct mm_struct *mm;
	unsigned int pgt_index;
	unsigned int crst_index;
	unsigned long *table[0];
};
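
/*
 * Per-cpu batch used to free page tables after an RCU grace period.
 * Page table fragments are stored from the front of table[] (pgt_index
 * counts up), crst tables from the back (crst_index counts down); the
 * batch is handed to call_rcu once the two indices meet.
 */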

#define RCU_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
	  / sizeof(unsigned long))

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);

static void __page_table_free(struct mm_struct *mm, unsigned long *table);
static void __crst_table_free(struct mm_struct *mm, unsigned long *table);

static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
{
	struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
	struct rcu_table_freelist *batch = *batchp;

	if (batch)
		return batch;
	batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
	if (batch) {
		batch->mm = mm;
		batch->pgt_index = 0;
		batch->crst_index = RCU_FREELIST_SIZE;
		*batchp = batch;
	}
	return batch;
}

static void rcu_table_freelist_callback(struct rcu_head *head)
{
	struct rcu_table_freelist *batch =
		container_of(head, struct rcu_table_freelist, rcu);

	while (batch->pgt_index > 0)
		__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
	while (batch->crst_index < RCU_FREELIST_SIZE)
		__crst_table_free(batch->mm, batch->table[batch->crst_index++]);
	free_page((unsigned long) batch);
}

void rcu_table_freelist_finish(void)
{
	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);

	if (!batch)
		return;
	call_rcu(&batch->rcu, rcu_table_freelist_callback);
	__get_cpu_var(rcu_table_freelist) = NULL;
}
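
/*
 * smp_sync() does nothing by itself; running it on all cpus via
 * smp_call_function() with wait=1 forces an IPI round trip and serves as
 * a fallback synchronization when no RCU batch page is available.
 */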

static void smp_sync(void *arg)
{
}

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define TABLES_PER_PAGE	4
#define FRAG_MASK	15UL
#define SECOND_HALVES	10UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 256, 0, PAGE_SIZE/4);
	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER	2
#define TABLES_PER_PAGE	2
#define FRAG_MASK	3UL
#define SECOND_HALVES	2UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	memset(table + 256, 0, PAGE_SIZE/2);
}

#endif
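
/*
 * VMALLOC_START defaults to VMALLOC_END - VMALLOC_SIZE; the "vmalloc="
 * early parameter overrides the size of the vmalloc area.
 */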

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	page->index = 0;
	if (noexec) {
		struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
		if (!shadow) {
			__free_pages(page, ALLOC_ORDER);
			return NULL;
		}
		page->index = page_to_phys(shadow);
	}
	spin_lock_bh(&mm->context.list_lock);
	list_add(&page->lru, &mm->context.crst_list);
	spin_unlock_bh(&mm->context.list_lock);
	return (unsigned long *) page_to_phys(page);
}

static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned long *shadow = get_shadow_table(table);

	if (shadow)
		free_pages((unsigned long) shadow, ALLOC_ORDER);
	free_pages((unsigned long) table, ALLOC_ORDER);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page = virt_to_page(table);

	spin_lock_bh(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	__crst_table_free(mm, table);
}
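
/*
 * Free a crst table with deferral: if the mm is only in use on this cpu
 * the table is freed immediately, otherwise it is queued on the per-cpu
 * RCU freelist batch (or, if no batch page can be allocated, all cpus
 * are synchronized with an IPI before the immediate free).
 */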

void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;
	struct page *page = virt_to_page(table);

	spin_lock_bh(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		__crst_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		__crst_table_free(mm, table);
		return;
	}
	batch->table[--batch->crst_index] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}

#ifdef CONFIG_64BIT
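/*
 * Grow the user address space by adding a higher region table level:
 * 2 GB (segment table) -> 4 TB (region-third table) -> 8 PB (region-second
 * table). Each pass through the repeat loop adds one level until
 * asce_limit reaches the requested limit.
 */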
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm, mm->context.noexec);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}
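
/*
 * Shrink the user address space by removing region table levels until
 * asce_limit is no larger than the requested limit.
 */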

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

/*
 * page table entry allocation/free routines.
 */
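/*
 * A 4K page holds TABLES_PER_PAGE page table fragments; the low bits of
 * page->flags record which fragments are allocated. With noexec or pgste
 * an allocation claims two adjacent fragments, the second one holding the
 * shadow table or the pgste entries.
 */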
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	spin_lock_bh(&mm->context.list_lock);
	page = NULL;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
			page = NULL;
	}
	if (!page) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		page->flags &= ~FRAG_MASK;
		table = (unsigned long *) page_to_phys(page);
		if (mm->context.has_pgste)
			clear_table_pgstes(table);
		else
			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	}
	table = (unsigned long *) page_to_phys(page);
	while (page->flags & bits) {
		table += 256;
		bits <<= 1;
	}
	page->flags |= bits;
	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
		list_move_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

static void __page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = ((unsigned long) table) & 15;
	table = (unsigned long *)(((unsigned long) table) ^ bits);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	page->flags ^= bits;
	if (!(page->flags & FRAG_MASK)) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	page->flags ^= bits;
	if (page->flags & FRAG_MASK) {
		/* Page now has some free pgtable fragments. */
		if (!list_empty(&page->lru))
			list_move(&page->lru, &mm->context.pgtable_list);
		page = NULL;
	} else
		/* All fragments of the 4K page have been freed. */
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	if (page) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}
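
/*
 * Free a page table fragment after an RCU grace period, using the same
 * immediate/batched logic as crst_table_free_rcu. The backing page is
 * taken off the pgtable_list first so the fragment cannot be reused
 * before the grace period has passed.
 */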

void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;
	struct page *page;
	unsigned long bits;

	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		page_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		page_table_free(mm, table);
		return;
	}
	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	/* Delayed freeing with rcu prevents reuse of pgtable fragments */
	list_del_init(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *)(((unsigned long) table) | bits);
	batch->table[batch->pgt_index++] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}
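
/*
 * Turn off the noexec feature for this mm: free all shadow region and
 * segment tables and mark the second page table halves, which held the
 * shadow page tables, as free again.
 */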

void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
	struct page *page;

	spin_lock_bh(&mm->context.list_lock);
	/* Free shadow region and segment tables. */
	list_for_each_entry(page, &mm->context.crst_list, lru)
		if (page->index) {
			free_pages((unsigned long) page->index, ALLOC_ORDER);
			page->index = 0;
		}
	/* "Free" second halves of page tables. */
	list_for_each_entry(page, &mm->context.pgtable_list, lru)
		page->flags &= ~SECOND_HALVES;
	spin_unlock_bh(&mm->context.list_lock);
	mm->context.noexec = 0;
	update_mm(mm, tsk);
}

/*
 * Switch on pgstes for the current userspace process (needed for KVM).
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have a switched amode? If not, we cannot do SIE */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (tsk->mm->context.has_pgste)
		return 0;

	/* Let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* We copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* OK, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
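/*
 * Check whether a kernel page is currently mapped by probing it with the
 * lra (load real address) instruction; condition code 0 means the
 * translation succeeded.
 */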
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */