/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif

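/*
 * The vmalloc area sits directly below the fixed VMALLOC_END. Its
 * default size (VMALLOC_SIZE) can be overridden with the "vmalloc="
 * kernel parameter, which simply moves VMALLOC_START further down,
 * page aligned.
 */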
unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

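/*
 * Region and segment tables (crst) are allocated as whole blocks of
 * 1 << ALLOC_ORDER pages (16K on 64-bit, 8K on 31-bit), unlike the
 * pte tables below, which are handed out as page fragments.
 */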
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
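/*
 * crst_table_upgrade() grows the user address space by stacking
 * additional region-table levels on top of the existing page table:
 * from a 2 GB limit (segment table) to 4 TB (region-third table) and
 * further to 8 PB (region-second table). The new top-level table is
 * allocated outside of page_table_lock; if another thread upgraded
 * the mm in the meantime the table is freed and the upgrade retried.
 * crst_table_downgrade() is the inverse and strips region-table
 * levels until the requested limit is reached. Both reload the mm
 * context via update_mm() when done.
 */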
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

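/*
 * Atomically toggle the given bits in *v and return the new value.
 * Used on page->_mapcount, which for page table pages serves as a
 * bitmask of allocated 1K/2K fragments rather than a mapcount.
 */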
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
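/*
 * With CONFIG_PGSTE a pte table gets a full 4K page of its own: the
 * lower 2K hold the 256 pte entries, the upper 2K hold the matching
 * page status table entries (pgstes) required for running KVM guests.
 * _mapcount is set to 3 to mark both halves of the page as used.
 */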
#ifdef CONFIG_PGSTE
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	pgtable_page_ctor(page);
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}
#endif

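/*
 * A pte table only occupies a fraction of a 4K page (2K on 64-bit,
 * 1K on 31-bit), so pages are handed out fragment by fragment. The
 * low bits of page->_mapcount (FRAG_MASK) track which fragments are
 * allocated, the high bits mark fragments still waiting for an RCU
 * grace period (see page_table_free_rcu). Pages with free fragments
 * are kept on mm->context.pgtable_list.
 */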
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned int mask, bit;

#ifdef CONFIG_PGSTE
	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
#endif
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

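/*
 * Return a 1K/2K fragment to its page; once the last fragment of a
 * page is freed, the page itself is released.
 */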
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

#ifdef CONFIG_PGSTE
	if (mm_has_pgste(mm))
		return page_table_free_pgste(table);
#endif
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

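/*
 * With CONFIG_HAVE_RCU_TABLE_FREE page tables are only freed after
 * concurrent lockless walkers are guaranteed to be done with them.
 * page_table_free_rcu() encodes what has to be freed into the unused
 * low bits of the table address before handing it to
 * tlb_remove_table(): FRAG_MASK for a full pgste page, the fragment
 * bit shifted into the high nibble for a 1K/2K fragment, or zero for
 * a region/segment table. __tlb_remove_table() decodes this again
 * once it is safe to free the memory.
 */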
#ifdef CONFIG_HAVE_RCU_TABLE_FREE

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

#ifdef CONFIG_PGSTE
	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
#endif
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
#ifdef CONFIG_PGSTE
	if (mm_has_pgste(mm)) {
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
#endif
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long) _table & PAGE_MASK);
	unsigned type = (unsigned long) _table & ~PAGE_MASK;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

#endif
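/*
 * KVM needs page tables with pgstes, but whether pgstes are allocated
 * is decided when a page table is allocated. s390_enable_sie() below
 * therefore duplicates the mm with context.alloc_pgste set and swaps
 * the copy in, which is only safe while the task is single threaded,
 * has no outstanding AIO contexts and owns its active_mm.
 */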

/*
 * switch on pgstes for the userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have a switched amode? If not, we cannot do sie */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* Let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* Copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* Ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
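/*
 * kernel_page_present() reports whether a page is mapped in the
 * kernel address space. It uses the lra (load real address)
 * instruction and checks the condition code: cc 0 means the address
 * translated successfully, i.e. the page is present. This matters
 * with CONFIG_DEBUG_PAGEALLOC, where unused pages may be unmapped.
 */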
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */