/*
 * arch/arm64/mm/hugetlbpage.c
 *
 * Copyright (C) 2013 Linaro Ltd.
 *
 * Based on arch/x86/mm/hugetlbpage.c.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>

/*
 * A huge (block) pmd is a non-empty entry with the table bit clear,
 * i.e. it maps memory directly instead of pointing to a pte table.
 */
int pmd_huge(pmd_t pmd)
{
	pmdval_t val = pmd_val(pmd);

	return val && !(val & PMD_TABLE_BIT);
}

/*
 * A huge (block) pud is a non-empty entry with the table bit clear.
 * With the pmd level folded there is no pud block mapping to detect.
 */
int pud_huge(pud_t pud)
{
#ifndef __PAGETABLE_PMD_FOLDED
	pudval_t val = pud_val(pud);

	return val && !(val & PUD_TABLE_BIT);
#else
	return 0;
#endif
}

44 45 46 47 48 49 50 51 52 53
/*
 * Select all bits except the pfn
 */
static inline pgprot_t pte_pgprot(pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
}

54
static int find_num_contig(struct mm_struct *mm, unsigned long addr,
55
			   pte_t *ptep, size_t *pgsize)
56
{
57 58 59
	pgd_t *pgdp = pgd_offset(mm, addr);
	pud_t *pudp;
	pmd_t *pmdp;
60 61

	*pgsize = PAGE_SIZE;
62 63 64
	pudp = pud_offset(pgdp, addr);
	pmdp = pmd_offset(pudp, addr);
	if ((pte_t *)pmdp == ptep) {
65 66 67 68 69 70
		*pgsize = PMD_SIZE;
		return CONT_PMDS;
	}
	return CONT_PTES;
}

71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
{
	int contig_ptes = 0;

	*pgsize = size;

	switch (size) {
#ifdef CONFIG_ARM64_4K_PAGES
	case PUD_SIZE:
#endif
	case PMD_SIZE:
		contig_ptes = 1;
		break;
	case CONT_PMD_SIZE:
		*pgsize = PMD_SIZE;
		contig_ptes = CONT_PMDS;
		break;
	case CONT_PTE_SIZE:
		*pgsize = PAGE_SIZE;
		contig_ptes = CONT_PTES;
		break;
	}

	return contig_ptes;
}

97 98 99 100 101 102 103 104 105 106 107 108 109 110
/*
 * Changing some bits of contiguous entries requires us to follow a
 * Break-Before-Make approach, breaking the whole contiguous set
 * before we can change any entries. See ARM DDI 0487A.k_iss10775,
 * "Misprogramming of the Contiguous bit", page D4-1762.
 *
 * This helper performs the break step.
 */
static pte_t get_clear_flush(struct mm_struct *mm,
			     unsigned long addr,
			     pte_t *ptep,
			     unsigned long pgsize,
			     unsigned long ncontig)
{
111
	struct vm_area_struct vma;
112 113 114 115
	pte_t orig_pte = huge_ptep_get(ptep);
	bool valid = pte_valid(orig_pte);
	unsigned long i, saddr = addr;

116 117
	vma_init(&vma, mm);

118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
		pte_t pte = ptep_get_and_clear(mm, addr, ptep);

		/*
		 * If HW_AFDBM is enabled, then the HW could turn on
		 * the dirty bit for any page in the set, so check
		 * them all.  All hugetlb entries are already young.
		 */
		if (pte_dirty(pte))
			orig_pte = pte_mkdirty(orig_pte);
	}

	if (valid)
		flush_tlb_range(&vma, saddr, addr);
	return orig_pte;
}

/*
 * Changing some bits of contiguous entries requires us to follow a
 * Break-Before-Make approach, breaking the whole contiguous set
 * before we can change any entries. See ARM DDI 0487A.k_iss10775,
 * "Misprogramming of the Contiguous bit", page D4-1762.
 *
 * This helper performs the break step for use cases where the
 * original pte is not needed.
 */
static void clear_flush(struct mm_struct *mm,
			     unsigned long addr,
			     pte_t *ptep,
			     unsigned long pgsize,
			     unsigned long ncontig)
{
	struct vm_area_struct vma;
	unsigned long va, end = addr + ncontig * pgsize;

	/* flush_tlb_range() wants a vma; fabricate one for this mm. */
	vma_init(&vma, mm);

	for (va = addr; va != end; va += pgsize, ptep++)
		pte_clear(mm, va, ptep);

	flush_tlb_range(&vma, addr, end);
}

160 161 162 163 164
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep, pte_t pte)
{
	size_t pgsize;
	int i;
165
	int ncontig;
166
	unsigned long pfn, dpfn;
167 168
	pgprot_t hugeprot;

169 170 171 172 173 174
	/*
	 * Code needs to be expanded to handle huge swap and migration
	 * entries. Needed for HUGETLB and MEMORY_FAILURE.
	 */
	WARN_ON(!pte_present(pte));

175
	if (!pte_cont(pte)) {
176 177 178 179
		set_pte_at(mm, addr, ptep, pte);
		return;
	}

180
	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
181
	pfn = pte_pfn(pte);
182
	dpfn = pgsize >> PAGE_SHIFT;
183
	hugeprot = pte_pgprot(pte);
184

185 186
	clear_flush(mm, addr, ptep, pgsize, ncontig);

187
	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
188 189 190
		set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
}

191 192 193 194 195 196 197 198 199 200 201 202
void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
			  pte_t *ptep, pte_t pte, unsigned long sz)
{
	int i, ncontig;
	size_t pgsize;

	ncontig = num_contig_ptes(sz, &pgsize);

	for (i = 0; i < ncontig; i++, ptep++)
		set_pte(ptep, pte);
}

203 204 205
pte_t *huge_pte_alloc(struct mm_struct *mm,
		      unsigned long addr, unsigned long sz)
{
206 207 208 209 210 211 212 213
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep = NULL;

	pgdp = pgd_offset(mm, addr);
	pudp = pud_alloc(mm, pgdp, addr);
	if (!pudp)
214 215 216
		return NULL;

	if (sz == PUD_SIZE) {
217
		ptep = (pte_t *)pudp;
218
	} else if (sz == (PAGE_SIZE * CONT_PTES)) {
219
		pmdp = pmd_alloc(mm, pudp, addr);
220 221 222 223 224 225 226 227 228

		WARN_ON(addr & (sz - 1));
		/*
		 * Note that if this code were ever ported to the
		 * 32-bit arm platform then it will cause trouble in
		 * the case where CONFIG_HIGHPTE is set, since there
		 * will be no pte_unmap() to correspond with this
		 * pte_alloc_map().
		 */
229
		ptep = pte_alloc_map(mm, pmdp, addr);
230 231
	} else if (sz == PMD_SIZE) {
		if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
232 233
		    pud_none(READ_ONCE(*pudp)))
			ptep = huge_pmd_share(mm, addr, pudp);
234
		else
235
			ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
236
	} else if (sz == (PMD_SIZE * CONT_PMDS)) {
237
		pmdp = pmd_alloc(mm, pudp, addr);
238
		WARN_ON(addr & (sz - 1));
239
		return (pte_t *)pmdp;
240 241
	}

242
	return ptep;
243 244
}

245 246
pte_t *huge_pte_offset(struct mm_struct *mm,
		       unsigned long addr, unsigned long sz)
247
{
248 249 250
	pgd_t *pgdp;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
251

252 253
	pgdp = pgd_offset(mm, addr);
	if (!pgd_present(READ_ONCE(*pgdp)))
254
		return NULL;
255

256 257 258
	pudp = pud_offset(pgdp, addr);
	pud = READ_ONCE(*pudp);
	if (sz != PUD_SIZE && pud_none(pud))
259
		return NULL;
260
	/* hugepage or swap? */
261 262
	if (pud_huge(pud) || !pud_present(pud))
		return (pte_t *)pudp;
263 264
	/* table; check the next level */

265 266 267
	if (sz == CONT_PMD_SIZE)
		addr &= CONT_PMD_MASK;

268 269
	pmdp = pmd_offset(pudp, addr);
	pmd = READ_ONCE(*pmdp);
270
	if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) &&
271
	    pmd_none(pmd))
272
		return NULL;
273 274
	if (pmd_huge(pmd) || !pmd_present(pmd))
		return (pte_t *)pmdp;
275

276 277
	if (sz == CONT_PTE_SIZE)
		return pte_offset_kernel(pmdp, (addr & CONT_PTE_MASK));
278

279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
	return NULL;
}

/*
 * Decorate a huge pte for its page size: contiguous sizes get the
 * contiguous bit set; block sizes pass through unchanged; anything
 * else is reported and passed through.
 */
pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
			 struct page *page, int writable)
{
	size_t sz = huge_page_size(hstate_vma(vma));

	if (sz == CONT_PTE_SIZE)
		return pte_mkcont(entry);

	if (sz == CONT_PMD_SIZE)
		return pmd_pte(pmd_mkcont(pte_pmd(entry)));

	if (sz != PUD_SIZE && sz != PMD_SIZE)
		pr_warn("%s: unrecognized huge page size 0x%lx\n",
			__func__, sz);

	return entry;
}

298 299 300 301 302 303 304 305 306 307 308 309
void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, unsigned long sz)
{
	int i, ncontig;
	size_t pgsize;

	ncontig = num_contig_ptes(sz, &pgsize);

	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
		pte_clear(mm, addr, ptep);
}

310 311 312
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
			      unsigned long addr, pte_t *ptep)
{
313
	int ncontig;
314 315 316 317
	size_t pgsize;
	pte_t orig_pte = huge_ptep_get(ptep);

	if (!pte_cont(orig_pte))
318
		return ptep_get_and_clear(mm, addr, ptep);
319 320 321

	ncontig = find_num_contig(mm, addr, ptep, &pgsize);

322
	return get_clear_flush(mm, addr, ptep, pgsize, ncontig);
323 324 325 326 327 328
}

/*
 * Return 1 iff the access flags carried by pte (write, dirty, young)
 * differ from those in the live contiguous set starting at ptep, so
 * callers can skip the expensive Break-Before-Make sequence when
 * nothing would change.
 */
static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
{
	int i;

	if (pte_write(pte) != pte_write(huge_ptep_get(ptep)))
		return 1;

	for (i = 0; i < ncontig; i++) {
		pte_t orig_pte = huge_ptep_get(ptep + i);

		if (pte_dirty(pte) != pte_dirty(orig_pte))
			return 1;

		if (pte_young(pte) != pte_young(orig_pte))
			return 1;
	}

	return 0;
}

/*
 * Update the access flags of a huge pte. For contiguous sets this
 * requires a Break-Before-Make clear of the whole set followed by a
 * rewrite; we avoid that entirely when no flag actually changes, and
 * we fold the dirty/young state of the cleared entries back in so
 * hardware-set bits are not lost across the update.
 * Returns 1 if the entries were changed, 0 otherwise.
 */
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
			       unsigned long addr, pte_t *ptep,
			       pte_t pte, int dirty)
{
	int ncontig, i;
	size_t pgsize = 0;
	unsigned long pfn = pte_pfn(pte), dpfn;
	pgprot_t hugeprot;
	pte_t orig_pte;

	if (!pte_cont(pte))
		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);

	ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize);
	dpfn = pgsize >> PAGE_SHIFT;

	if (!__cont_access_flags_changed(ptep, pte, ncontig))
		return 0;

	orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);

	/* Make sure we don't lose the dirty or young state */
	if (pte_dirty(orig_pte))
		pte = pte_mkdirty(pte);

	if (pte_young(orig_pte))
		pte = pte_mkyoung(pte);

	hugeprot = pte_pgprot(pte);
	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
		set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot));

	return 1;
}

/*
 * Write-protect a huge pte. A contiguous set is broken (cleared and
 * flushed) as a whole, then rewritten with the write bit removed.
 */
void huge_ptep_set_wrprotect(struct mm_struct *mm,
			     unsigned long addr, pte_t *ptep)
{
	unsigned long pfn, dpfn;
	pgprot_t hugeprot;
	int ncontig, i;
	size_t pgsize;
	pte_t pte;

	/* Single entry: the generic helper is sufficient. */
	if (!pte_cont(READ_ONCE(*ptep))) {
		ptep_set_wrprotect(mm, addr, ptep);
		return;
	}

	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
	dpfn = pgsize >> PAGE_SHIFT;

	/* Break step, then strip the write permission from the result. */
	pte = pte_wrprotect(get_clear_flush(mm, addr, ptep, pgsize, ncontig));
	pfn = pte_pfn(pte);
	hugeprot = pte_pgprot(pte);

	/* Make step: rewrite the whole set with the new attributes. */
	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
		set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
}

/*
 * Clear a huge pte and flush the TLB. Contiguous sets are cleared as
 * a whole via clear_flush(); a single entry uses the generic helper.
 */
void huge_ptep_clear_flush(struct vm_area_struct *vma,
			   unsigned long addr, pte_t *ptep)
{
	size_t pgsize;
	int ncontig;

	if (!pte_cont(READ_ONCE(*ptep))) {
		ptep_clear_flush(vma, addr, ptep);
		return;
	}

	ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize);

	clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
}

S
Steve Capper 已提交
398 399 400
static __init int setup_hugepagesz(char *opt)
{
	unsigned long ps = memparse(opt, &opt);
401

402 403 404 405 406 407 408 409 410
	switch (ps) {
#ifdef CONFIG_ARM64_4K_PAGES
	case PUD_SIZE:
#endif
	case PMD_SIZE * CONT_PMDS:
	case PMD_SIZE:
	case PAGE_SIZE * CONT_PTES:
		hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT);
		return 1;
S
Steve Capper 已提交
411
	}
412 413 414 415

	hugetlb_bad_size();
	pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
	return 0;
S
Steve Capper 已提交
416 417
}
__setup("hugepagesz=", setup_hugepagesz);

#ifdef CONFIG_ARM64_64K_PAGES
/*
 * With 64K pages, make sure the contiguous-pte huge page size is
 * registered even if the user did not ask for it on the command line.
 */
static __init int add_default_hugepagesz(void)
{
	if (!size_to_hstate(CONT_PTES * PAGE_SIZE))
		hugetlb_add_hstate(CONT_PTE_SHIFT);
	return 0;
}
arch_initcall(add_default_hugepagesz);
#endif