/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

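/*
 * A 31-bit kernel uses 1K page tables (256 four-byte entries), four of
 * which fit into a 4K page; a 64-bit kernel uses 2K page tables (256
 * eight-byte entries), two per page.  FRAG_MASK has one bit per table
 * fragment of a page.  With noexec or pgstes each table occupies a pair
 * of fragments; the SECOND_HALVES bits mark the upper fragment of each
 * pair, which holds the shadow table or the pgstes.
 */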
#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define TABLES_PER_PAGE	4
#define FRAG_MASK	15UL
#define SECOND_HALVES	10UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 256, 0, PAGE_SIZE/4);
	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER	2
#define TABLES_PER_PAGE	2
#define FRAG_MASK	3UL
#define SECOND_HALVES	2UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

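/*
 * The size of the vmalloc area can be set on the kernel command line
 * with "vmalloc=<size>"; the area always ends at VMALLOC_END and grows
 * downward from there.
 */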
static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

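/*
 * Allocate a region or segment table (2^ALLOC_ORDER pages).  If the
 * noexec protection is active, a shadow table is allocated as well and
 * its physical address is kept in page->index.  The backing page is
 * linked into the mm's crst_list.
 */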
unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	page->index = 0;
	if (noexec) {
		struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
		if (!shadow) {
			__free_pages(page, ALLOC_ORDER);
			return NULL;
		}
		page->index = page_to_phys(shadow);
	}
	spin_lock(&mm->context.list_lock);
	list_add(&page->lru, &mm->context.crst_list);
	spin_unlock(&mm->context.list_lock);
	return (unsigned long *) page_to_phys(page);
}

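/*
 * Free a region or segment table together with its shadow table, if
 * one was allocated, and unlink the backing page from the crst_list.
 */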
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned long *shadow = get_shadow_table(table);
	struct page *page = virt_to_page(table);

	spin_lock(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock(&mm->context.list_lock);
	if (shadow)
		free_pages((unsigned long) shadow, ALLOC_ORDER);
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
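/*
 * Upgrade the address space of an mm to the next larger translation
 * format, e.g. from a 2GB segment table limit to a 4TB region-third
 * table.  A new top-level table is allocated, initialized with empty
 * entries and made to point to the old top level; the switch is done
 * under mm->page_table_lock and repeated until the requested limit is
 * reached.
 */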
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm, mm->context.noexec);
	if (!table)
		return -ENOMEM;
	spin_lock(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

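/*
 * Downgrade the address space of an mm by removing top-level region
 * tables until the address space fits the requested limit again,
 * e.g. when a 31-bit compat program is executed.
 */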
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

/*
 * page table entry allocation/free routines.
 */
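/*
 * page_table_alloc hands out 1K/2K fragments of a 4K page.  The lower
 * bits of page->flags record which fragments of the page are in use;
 * pages with free fragments are kept at the front of pgtable_list,
 * fully used pages are moved to its tail.  With noexec or pgstes two
 * adjacent fragments are taken at once (bits = 3UL).
 */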
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	spin_lock(&mm->context.list_lock);
	page = NULL;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
			page = NULL;
	}
	if (!page) {
		spin_unlock(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		page->flags &= ~FRAG_MASK;
		table = (unsigned long *) page_to_phys(page);
		if (mm->context.has_pgste)
			clear_table_pgstes(table);
		else
			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	}
	table = (unsigned long *) page_to_phys(page);
	while (page->flags & bits) {
		table += 256;
		bits <<= 1;
	}
	page->flags |= bits;
	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
		list_move_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock(&mm->context.list_lock);
	return table;
}

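/*
 * Return a page table fragment to its page.  The fragment bits are
 * computed from the offset of the table within the page; if the last
 * fragment is freed, the page is removed from pgtable_list and freed.
 */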
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock(&mm->context.list_lock);
	page->flags ^= bits;
	if (page->flags & FRAG_MASK) {
		/* Page now has some free pgtable fragments. */
		list_move(&page->lru, &mm->context.pgtable_list);
		page = NULL;
	} else
		/* All fragments of the 4K page have been freed. */
		list_del(&page->lru);
	spin_unlock(&mm->context.list_lock);
	if (page) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

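/*
 * Turn off the noexec protection for an mm: free all shadow region and
 * segment tables and mark the second halves of the page tables as
 * available again.
 */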
void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
	struct page *page;

	spin_lock(&mm->context.list_lock);
	/* Free shadow region and segment tables. */
	list_for_each_entry(page, &mm->context.crst_list, lru)
		if (page->index) {
			free_pages((unsigned long) page->index, ALLOC_ORDER);
			page->index = 0;
		}
	/* "Free" second halves of page tables. */
	list_for_each_entry(page, &mm->context.pgtable_list, lru)
		page->flags &= ~SECOND_HALVES;
	spin_unlock(&mm->context.list_lock);
	mm->context.noexec = 0;
	update_mm(mm, tsk);
}

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If not, we cannot do sie */
	if (!switch_amode)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (tsk->mm->context.has_pgste)
		return 0;

	/* let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
	    tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
	    tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
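/*
 * Test whether a page is currently mapped in the kernel address space
 * by probing it with the "load real address" (lra) instruction, which
 * sets condition code 0 only if the address translation succeeds.
 */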
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_DEBUG_PAGEALLOC && CONFIG_HIBERNATION */