/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

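/*
 * Per-cpu batch of tables queued for freeing after an RCU grace period.
 * Page table fragments fill table[] from the front (pgt_index counts up),
 * crst tables fill it from the back (crst_index counts down); the batch
 * is flushed once the two indices meet.
 */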
struct rcu_table_freelist {
	struct rcu_head rcu;
	struct mm_struct *mm;
	unsigned int pgt_index;
	unsigned int crst_index;
	unsigned long *table[0];
};

#define RCU_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
	  / sizeof(unsigned long))

static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);

static void __page_table_free(struct mm_struct *mm, unsigned long *table);

static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
{
	struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
	struct rcu_table_freelist *batch = *batchp;

	if (batch)
		return batch;
	batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
	if (batch) {
		batch->mm = mm;
		batch->pgt_index = 0;
		batch->crst_index = RCU_FREELIST_SIZE;
		*batchp = batch;
	}
	return batch;
}

static void rcu_table_freelist_callback(struct rcu_head *head)
{
	struct rcu_table_freelist *batch =
		container_of(head, struct rcu_table_freelist, rcu);

	while (batch->pgt_index > 0)
		__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
	while (batch->crst_index < RCU_FREELIST_SIZE)
		crst_table_free(batch->mm, batch->table[batch->crst_index++]);
	free_page((unsigned long) batch);
}

void rcu_table_freelist_finish(void)
{
	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);

	if (!batch)
		return;
	call_rcu(&batch->rcu, rcu_table_freelist_callback);
	__get_cpu_var(rcu_table_freelist) = NULL;
}

static void smp_sync(void *arg)
{
}

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define TABLES_PER_PAGE	4
#define FRAG_MASK	15UL
#define SECOND_HALVES	10UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 256, 0, PAGE_SIZE/4);
	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER	2
#define TABLES_PER_PAGE	2
#define FRAG_MASK	3UL
#define SECOND_HALVES	2UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

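/* Allocate the 2^ALLOC_ORDER page block used for a region or segment table. */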
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;

	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		crst_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		crst_table_free(mm, table);
		return;
	}
	batch->table[--batch->crst_index] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}

#ifdef CONFIG_64BIT
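/*
 * Upgrade the address space of an mm by stacking a higher region table
 * on top of the current top-level table (2 GB -> 4 TB -> 8 PB) and
 * reloading the new asce via update_mm().
 */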
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

/*
 * page table entry allocation/free routines.
 */
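/*
 * A 4K page holds several page tables: four 1K tables on 31 bit, two 2K
 * tables on 64 bit (twice that size when pgstes are used).  The low bits
 * of page->flags (FRAG_MASK) track which fragments are in use; pages with
 * free fragments are kept on mm->context.pgtable_list.
 */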
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned long bits;

	bits = (mm->context.has_pgste) ? 3UL : 1UL;
	spin_lock_bh(&mm->context.list_lock);
	page = NULL;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
			page = NULL;
	}
	if (!page) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		page->flags &= ~FRAG_MASK;
		table = (unsigned long *) page_to_phys(page);
		if (mm->context.has_pgste)
			clear_table_pgstes(table);
		else
			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	}
	table = (unsigned long *) page_to_phys(page);
	while (page->flags & bits) {
		table += 256;
		bits <<= 1;
	}
	page->flags |= bits;
	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
		list_move_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

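/*
 * Free page table fragments whose positions are encoded in the low bits
 * of the table pointer (as set up by page_table_free_rcu).  The page is
 * released once none of its fragments is in use any more.
 */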
static void __page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = ((unsigned long) table) & 15;
	table = (unsigned long *)(((unsigned long) table) ^ bits);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	page->flags ^= bits;
	if (!(page->flags & FRAG_MASK)) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

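/*
 * Return a page table fragment to its page; the page stays on
 * mm->context.pgtable_list while other fragments remain in use and is
 * freed once the last fragment is released.
 */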
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = (mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	page->flags ^= bits;
	if (page->flags & FRAG_MASK) {
		/* Page now has some free pgtable fragments. */
		if (!list_empty(&page->lru))
			list_move(&page->lru, &mm->context.pgtable_list);
		page = NULL;
	} else
		/* All fragments of the 4K page have been freed. */
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	if (page) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

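/*
 * RCU variant of page_table_free(): the fragment is queued on the per-cpu
 * freelist and only released after a grace period, so it cannot be reused
 * while another CPU may still be walking it.
 */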
void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;
	struct page *page;
	unsigned long bits;

	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		page_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		page_table_free(mm, table);
		return;
	}
	bits = (mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	/* Delayed freeing with rcu prevents reuse of pgtable fragments */
	list_del_init(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *)(((unsigned long) table) | bits);
	batch->table[batch->pgt_index++] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}

/*
 * switch on pgstes for the current userspace process (needed by kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have a switched amode? If not, we cannot do sie */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (tsk->mm->context.has_pgste)
		return 0;

	/* let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
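	/* lra sets condition code 0 only if the address is mapped */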
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */