inode.c 25.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3
/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
4
 * Nadia Yvette Chambers, 2002
L
Linus Torvalds 已提交
5 6 7 8
 *
 * Copyright (C) 2002 Linus Torvalds.
 */

9 10
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

L
Linus Torvalds 已提交
11 12 13 14 15 16 17
#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h>		/* remove ASAP */
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
18
#include <linux/kernel.h>
L
Linus Torvalds 已提交
19 20 21 22 23
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
24
#include <linux/capability.h>
25
#include <linux/ctype.h>
L
Linus Torvalds 已提交
26 27 28
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
29
#include <linux/parser.h>
30
#include <linux/mman.h>
L
Linus Torvalds 已提交
31 32 33 34
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
N
Nick Black 已提交
35
#include <linux/magic.h>
N
Naoya Horiguchi 已提交
36
#include <linux/migrate.h>
A
Al Viro 已提交
37
#include <linux/uio.h>
L
Linus Torvalds 已提交
38 39 40

#include <asm/uaccess.h>

41
static const struct super_operations hugetlbfs_ops;
42
static const struct address_space_operations hugetlbfs_aops;
43
const struct file_operations hugetlbfs_file_operations;
44 45
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
L
Linus Torvalds 已提交
46

D
David Gibson 已提交
47
struct hugetlbfs_config {
48 49
	kuid_t   uid;
	kgid_t   gid;
D
David Gibson 已提交
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
	umode_t mode;
	long	nr_blocks;
	long	nr_inodes;
	struct hstate *hstate;
};

/*
 * Per-inode hugetlbfs state.  The VFS inode is embedded in this structure,
 * so HUGETLBFS_I() can get from the inode back to its container.
 */
struct hugetlbfs_inode_info {
	/* set up with mpol_shared_policy_init() when the inode is created */
	struct shared_policy policy;
	/* the embedded VFS inode handed out by hugetlbfs_alloc_inode() */
	struct inode vfs_inode;
};

/* Return the hugetlbfs_inode_info that embeds the given VFS inode. */
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
	struct hugetlbfs_inode_info *info;

	info = container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
	return info;
}

L
Linus Torvalds 已提交
66 67
int sysctl_hugetlb_shm_group;

68 69 70
/* Mount option identifiers; Opt_err marks an unrecognised option. */
enum {
	Opt_size,
	Opt_nr_inodes,
	Opt_mode,
	Opt_uid,
	Opt_gid,
	Opt_pagesize,
	Opt_err,
};

75
static const match_table_t tokens = {
76 77 78 79 80
	{Opt_size,	"size=%s"},
	{Opt_nr_inodes,	"nr_inodes=%s"},
	{Opt_mode,	"mode=%o"},
	{Opt_uid,	"uid=%u"},
	{Opt_gid,	"gid=%u"},
81
	{Opt_pagesize,	"pagesize=%s"},
82 83 84
	{Opt_err,	NULL},
};

85 86 87 88 89 90 91 92 93 94
/*
 * Drop one reference on every page held in @pvec, then reset the
 * pagevec so it can be reused for the next lookup.
 */
static void huge_pagevec_release(struct pagevec *pvec)
{
	int idx = 0;

	while (idx < pagevec_count(pvec)) {
		put_page(pvec->pages[idx]);
		++idx;
	}

	pagevec_reinit(pvec);
}

L
Linus Torvalds 已提交
95 96
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
A
Al Viro 已提交
97
	struct inode *inode = file_inode(file);
L
Linus Torvalds 已提交
98 99
	loff_t len, vma_len;
	int ret;
100
	struct hstate *h = hstate_file(file);
L
Linus Torvalds 已提交
101

102
	/*
103 104 105 106 107 108
	 * vma address alignment (but not the pgoff alignment) has
	 * already been checked by prepare_hugepage_range.  If you add
	 * any error returns here, do so after setting VM_HUGETLB, so
	 * is_vm_hugetlb_page tests below unmap_region go the right
	 * way when do_mmap_pgoff unwinds (may be important on powerpc
	 * and ia64).
109
	 */
110
	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
111
	vma->vm_ops = &hugetlb_vm_ops;
L
Linus Torvalds 已提交
112

113
	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
114 115
		return -EINVAL;

L
Linus Torvalds 已提交
116 117
	vma_len = (loff_t)(vma->vm_end - vma->vm_start);

118
	mutex_lock(&inode->i_mutex);
L
Linus Torvalds 已提交
119 120 121 122 123
	file_accessed(file);

	ret = -ENOMEM;
	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

124
	if (hugetlb_reserve_pages(inode,
125
				vma->vm_pgoff >> huge_page_order(h),
126 127
				len >> huge_page_shift(h), vma,
				vma->vm_flags))
128
		goto out;
129

A
Adam Litke 已提交
130 131
	ret = 0;
	hugetlb_prefault_arch_hook(vma->vm_mm);
132
	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
L
Linus Torvalds 已提交
133 134
		inode->i_size = len;
out:
135
	mutex_unlock(&inode->i_mutex);
L
Linus Torvalds 已提交
136 137 138 139 140

	return ret;
}

/*
141
 * Called under down_write(mmap_sem).
L
Linus Torvalds 已提交
142 143
 */

144
#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
L
Linus Torvalds 已提交
145 146 147 148 149 150
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
151
	struct hstate *h = hstate_file(file);
152
	struct vm_unmapped_area_info info;
L
Linus Torvalds 已提交
153

154
	if (len & ~huge_page_mask(h))
L
Linus Torvalds 已提交
155 156 157 158
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

159
	if (flags & MAP_FIXED) {
160
		if (prepare_hugepage_range(file, addr, len))
161 162 163 164
			return -EINVAL;
		return addr;
	}

L
Linus Torvalds 已提交
165
	if (addr) {
166
		addr = ALIGN(addr, huge_page_size(h));
L
Linus Torvalds 已提交
167 168 169 170 171 172
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}

173 174 175 176 177 178 179
	info.flags = 0;
	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	return vm_unmapped_area(&info);
L
Linus Torvalds 已提交
180 181 182
}
#endif

A
Al Viro 已提交
183
static size_t
B
Badari Pulavarty 已提交
184
hugetlbfs_read_actor(struct page *page, unsigned long offset,
A
Al Viro 已提交
185
			struct iov_iter *to, unsigned long size)
B
Badari Pulavarty 已提交
186
{
A
Al Viro 已提交
187
	size_t copied = 0;
B
Badari Pulavarty 已提交
188 189 190 191 192 193 194
	int i, chunksize;

	/* Find which 4k chunk and offset with in that chunk */
	i = offset >> PAGE_CACHE_SHIFT;
	offset = offset & ~PAGE_CACHE_MASK;

	while (size) {
A
Al Viro 已提交
195
		size_t n;
B
Badari Pulavarty 已提交
196 197 198 199 200
		chunksize = PAGE_CACHE_SIZE;
		if (offset)
			chunksize -= offset;
		if (chunksize > size)
			chunksize = size;
A
Al Viro 已提交
201 202 203 204
		n = copy_page_to_iter(&page[i], offset, chunksize, to);
		copied += n;
		if (n != chunksize)
			return copied;
B
Badari Pulavarty 已提交
205 206 207 208
		offset = 0;
		size -= chunksize;
		i++;
	}
A
Al Viro 已提交
209
	return copied;
B
Badari Pulavarty 已提交
210 211 212 213 214 215 216
}

/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data. Its *very* similar to do_generic_mapping_read(), we can't use that
 * since it has PAGE_CACHE_SIZE assumptions.
 */
A
Al Viro 已提交
217
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
B
Badari Pulavarty 已提交
218
{
A
Al Viro 已提交
219 220 221
	struct file *file = iocb->ki_filp;
	struct hstate *h = hstate_file(file);
	struct address_space *mapping = file->f_mapping;
B
Badari Pulavarty 已提交
222
	struct inode *inode = mapping->host;
A
Al Viro 已提交
223 224
	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
B
Badari Pulavarty 已提交
225 226 227 228
	unsigned long end_index;
	loff_t isize;
	ssize_t retval = 0;

A
Al Viro 已提交
229
	while (iov_iter_count(to)) {
B
Badari Pulavarty 已提交
230
		struct page *page;
A
Al Viro 已提交
231
		size_t nr, copied;
B
Badari Pulavarty 已提交
232 233

		/* nr is the maximum number of bytes to copy from this page */
234
		nr = huge_page_size(h);
235 236
		isize = i_size_read(inode);
		if (!isize)
A
Al Viro 已提交
237
			break;
238
		end_index = (isize - 1) >> huge_page_shift(h);
A
Al Viro 已提交
239 240 241
		if (index > end_index)
			break;
		if (index == end_index) {
242
			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
243
			if (nr <= offset)
A
Al Viro 已提交
244
				break;
B
Badari Pulavarty 已提交
245 246 247 248
		}
		nr = nr - offset;

		/* Find the page */
249
		page = find_lock_page(mapping, index);
B
Badari Pulavarty 已提交
250 251 252 253 254
		if (unlikely(page == NULL)) {
			/*
			 * We have a HOLE, zero out the user-buffer for the
			 * length of the hole or request.
			 */
A
Al Viro 已提交
255
			copied = iov_iter_zero(nr, to);
B
Badari Pulavarty 已提交
256
		} else {
257 258
			unlock_page(page);

B
Badari Pulavarty 已提交
259 260 261
			/*
			 * We have the page, copy it to user space buffer.
			 */
A
Al Viro 已提交
262
			copied = hugetlbfs_read_actor(page, offset, to, nr);
263
			page_cache_release(page);
B
Badari Pulavarty 已提交
264
		}
A
Al Viro 已提交
265 266 267 268 269 270
		offset += copied;
		retval += copied;
		if (copied != nr && iov_iter_count(to)) {
			if (!retval)
				retval = -EFAULT;
			break;
B
Badari Pulavarty 已提交
271
		}
272 273
		index += offset >> huge_page_shift(h);
		offset &= ~huge_page_mask(h);
B
Badari Pulavarty 已提交
274
	}
A
Al Viro 已提交
275
	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
B
Badari Pulavarty 已提交
276 277 278
	return retval;
}

N
Nick Piggin 已提交
279 280 281 282
/* write_begin address space operation: always fails with -EINVAL. */
static int hugetlbfs_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	return -EINVAL;
}

N
Nick Piggin 已提交
287 288 289
/*
 * write_end address space operation.  hugetlbfs_write_begin() always
 * fails, so reaching this function is treated as a bug.
 */
static int hugetlbfs_write_end(struct file *file,
			       struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned copied,
			       struct page *page, void *fsdata)
{
	BUG();
	return -EINVAL;
}

/*
 * Remove one huge page from the page cache: cancel its dirty state,
 * clear Uptodate and delete it from its mapping.
 */
static void truncate_huge_page(struct page *page)
{
	/* No IO accounting for huge pages? */
	cancel_dirty_page(page, 0);
	ClearPageUptodate(page);
	delete_from_page_cache(page);
}

302
/*
 * Remove every huge page at or beyond byte offset @lstart from the
 * inode's page cache and hand the number of freed pages to
 * hugetlb_unreserve_pages().
 */
static void truncate_hugepages(struct inode *inode, loff_t lstart)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
	const pgoff_t start = lstart >> huge_page_shift(h);
	struct pagevec pvec;
	pgoff_t next = start;
	int freed = 0;
	int i;

	pagevec_init(&pvec, 0);

	for (;;) {
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			/*
			 * An empty lookup from the start index means we are
			 * done; otherwise rescan from the start.
			 */
			if (next == start)
				break;
			next = start;
			continue;
		}

		for (i = 0; i < pagevec_count(&pvec); ++i) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			/* resume the next lookup just past this page */
			if (page->index > next)
				next = page->index;
			++next;
			truncate_huge_page(page);
			unlock_page(page);
			freed++;
		}

		huge_pagevec_release(&pvec);
	}

	/* a truncate to zero must leave the mapping empty */
	BUG_ON(!lstart && mapping->nrpages);
	hugetlb_unreserve_pages(inode, start, freed);
}

A
Al Viro 已提交
338
static void hugetlbfs_evict_inode(struct inode *inode)
L
Linus Torvalds 已提交
339
{
340 341
	struct resv_map *resv_map;

342
	truncate_hugepages(inode, 0);
343 344 345 346
	resv_map = (struct resv_map *)inode->i_mapping->private_data;
	/* root inode doesn't have the resv_map, so we should check it */
	if (resv_map)
		resv_map_release(&resv_map->refs);
347
	clear_inode(inode);
348 349
}

L
Linus Torvalds 已提交
350
static inline void
351
hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
L
Linus Torvalds 已提交
352 353 354
{
	struct vm_area_struct *vma;

355
	vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
L
Linus Torvalds 已提交
356 357 358
		unsigned long v_offset;

		/*
H
Hugh Dickins 已提交
359
		 * Can the expression below overflow on 32-bit arches?
360
		 * No, because the interval tree returns us only those vmas
H
Hugh Dickins 已提交
361 362
		 * which overlap the truncated area starting at pgoff,
		 * and no vma on a 32-bit arch can span beyond the 4GB.
L
Linus Torvalds 已提交
363
		 */
H
Hugh Dickins 已提交
364 365 366
		if (vma->vm_pgoff < pgoff)
			v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
		else
L
Linus Torvalds 已提交
367 368
			v_offset = 0;

369 370
		unmap_hugepage_range(vma, vma->vm_start + v_offset,
				     vma->vm_end, NULL);
L
Linus Torvalds 已提交
371 372 373 374 375
	}
}

/*
 * Truncate the file to @offset bytes (which must be huge page aligned):
 * publish the new size, unmap the truncated range from all mappings and
 * drop the huge pages beyond it.  Always returns 0.
 */
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t first_pgoff;

	BUG_ON(offset & ~huge_page_mask(h));
	first_pgoff = offset >> PAGE_SHIFT;

	i_size_write(inode, offset);

	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
		hugetlb_vmtruncate_list(&mapping->i_mmap, first_pgoff);
	i_mmap_unlock_write(mapping);

	truncate_hugepages(inode, offset);

	return 0;
}

static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
394
	struct inode *inode = d_inode(dentry);
395
	struct hstate *h = hstate_inode(inode);
L
Linus Torvalds 已提交
396 397 398 399 400 401 402
	int error;
	unsigned int ia_valid = attr->ia_valid;

	BUG_ON(!inode);

	error = inode_change_ok(inode, attr);
	if (error)
C
Christoph Hellwig 已提交
403
		return error;
L
Linus Torvalds 已提交
404 405 406

	if (ia_valid & ATTR_SIZE) {
		error = -EINVAL;
C
Christoph Hellwig 已提交
407 408 409
		if (attr->ia_size & ~huge_page_mask(h))
			return -EINVAL;
		error = hugetlb_vmtruncate(inode, attr->ia_size);
L
Linus Torvalds 已提交
410
		if (error)
C
Christoph Hellwig 已提交
411
			return error;
L
Linus Torvalds 已提交
412
	}
C
Christoph Hellwig 已提交
413 414 415 416

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
L
Linus Torvalds 已提交
417 418
}

419 420
static struct inode *hugetlbfs_get_root(struct super_block *sb,
					struct hugetlbfs_config *config)
L
Linus Torvalds 已提交
421 422 423 424 425 426
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
427
		inode->i_ino = get_next_ino();
428 429 430 431 432 433 434 435 436 437
		inode->i_mode = S_IFDIR | config->mode;
		inode->i_uid = config->uid;
		inode->i_gid = config->gid;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		info = HUGETLBFS_I(inode);
		mpol_shared_policy_init(&info->policy, NULL);
		inode->i_op = &hugetlbfs_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		/* directory inodes start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
438
		lockdep_annotate_inode_mutex_key(inode);
439 440 441 442
	}
	return inode;
}

443
/*
444
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
445 446
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under
447
 * i_mmap_rwsem.
448
 */
449
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
450

451 452
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
					struct inode *dir,
A
Al Viro 已提交
453
					umode_t mode, dev_t dev)
454 455
{
	struct inode *inode;
456 457 458 459 460
	struct resv_map *resv_map;

	resv_map = resv_map_alloc();
	if (!resv_map)
		return NULL;
461 462 463 464 465 466

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
467 468
		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
				&hugetlbfs_i_mmap_rwsem_key);
L
Linus Torvalds 已提交
469 470
		inode->i_mapping->a_ops = &hugetlbfs_aops;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
471
		inode->i_mapping->private_data = resv_map;
L
Linus Torvalds 已提交
472
		info = HUGETLBFS_I(inode);
473 474 475 476 477 478 479
		/*
		 * The policy is initialized here even if we are creating a
		 * private inode because initialization simply creates an
		 * an empty rb tree and calls spin_lock_init(), later when we
		 * call mpol_free_shared_policy() it will just return because
		 * the rb tree will still be empty.
		 */
480
		mpol_shared_policy_init(&info->policy, NULL);
L
Linus Torvalds 已提交
481 482 483 484 485 486 487 488 489 490 491 492 493
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &hugetlbfs_inode_operations;
			inode->i_fop = &hugetlbfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &hugetlbfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
494
			inc_nlink(inode);
L
Linus Torvalds 已提交
495 496 497 498 499
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			break;
		}
500
		lockdep_annotate_inode_mutex_key(inode);
501 502 503
	} else
		kref_put(&resv_map->refs, resv_map_release);

L
Linus Torvalds 已提交
504 505 506 507 508 509 510
	return inode;
}

/*
 * File creation. Allocate an inode, and we're done..
 */
/*
 * Create an inode of type @mode for @dentry in directory @dir.
 * Returns 0 on success or -ENOSPC when no inode could be obtained.
 */
static int hugetlbfs_mknod(struct inode *dir,
			struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode;

	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
	if (!inode)
		return -ENOSPC;

	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */

	return 0;
}

526
/* mkdir inode operation: create the directory, then bump the parent's link count. */
static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int err;

	err = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
	if (err)
		return err;

	inc_nlink(dir);
	return 0;
}

A
Al Viro 已提交
534
/* create inode operation: make a regular file through hugetlbfs_mknod(). */
static int hugetlbfs_create(struct inode *dir, struct dentry *dentry,
			    umode_t mode, bool excl)
{
	umode_t file_mode = mode | S_IFREG;

	return hugetlbfs_mknod(dir, dentry, file_mode, 0);
}

static int hugetlbfs_symlink(struct inode *dir,
			struct dentry *dentry, const char *symname)
{
	struct inode *inode;
	int error = -ENOSPC;

545
	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
L
Linus Torvalds 已提交
546 547 548 549 550 551 552 553 554 555 556 557 558 559 560
	if (inode) {
		int l = strlen(symname)+1;
		error = page_symlink(inode, symname, l);
		if (!error) {
			d_instantiate(dentry, inode);
			dget(dentry);
		} else
			iput(inode);
	}
	dir->i_ctime = dir->i_mtime = CURRENT_TIME;

	return error;
}

/*
561
 * mark the head page dirty
L
Linus Torvalds 已提交
562 563 564
 */
/* set_page_dirty address space operation: the dirty bit lives on the head page. */
static int hugetlbfs_set_page_dirty(struct page *page)
{
	struct page *head;

	head = compound_head(page);
	SetPageDirty(head);

	return 0;
}

N
Naoya Horiguchi 已提交
571
static int hugetlbfs_migrate_page(struct address_space *mapping,
572
				struct page *newpage, struct page *page,
573
				enum migrate_mode mode)
N
Naoya Horiguchi 已提交
574 575 576 577
{
	int rc;

	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
578
	if (rc != MIGRATEPAGE_SUCCESS)
N
Naoya Horiguchi 已提交
579 580 581
		return rc;
	migrate_page_copy(newpage, page);

582
	return MIGRATEPAGE_SUCCESS;
N
Naoya Horiguchi 已提交
583 584
}

585
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
L
Linus Torvalds 已提交
586
{
587
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
588
	struct hstate *h = hstate_inode(d_inode(dentry));
L
Linus Torvalds 已提交
589 590

	buf->f_type = HUGETLBFS_MAGIC;
591
	buf->f_bsize = huge_page_size(h);
L
Linus Torvalds 已提交
592 593
	if (sbinfo) {
		spin_lock(&sbinfo->stat_lock);
594 595
		/* If no limits set, just report 0 for max/free/used
		 * blocks, like simple_statfs() */
596 597 598 599 600 601 602 603 604
		if (sbinfo->spool) {
			long free_pages;

			spin_lock(&sbinfo->spool->lock);
			buf->f_blocks = sbinfo->spool->max_hpages;
			free_pages = sbinfo->spool->max_hpages
				- sbinfo->spool->used_hpages;
			buf->f_bavail = buf->f_bfree = free_pages;
			spin_unlock(&sbinfo->spool->lock);
605 606 607
			buf->f_files = sbinfo->max_inodes;
			buf->f_ffree = sbinfo->free_inodes;
		}
L
Linus Torvalds 已提交
608 609 610 611 612 613 614 615 616 617 618 619
		spin_unlock(&sbinfo->stat_lock);
	}
	buf->f_namelen = NAME_MAX;
	return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

	if (sbi) {
		sb->s_fs_info = NULL;
620 621 622 623

		if (sbi->spool)
			hugepage_put_subpool(sbi->spool);

L
Linus Torvalds 已提交
624 625 626 627
		kfree(sbi);
	}
}

628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652
/*
 * Take one inode out of the superblock's free count.  A negative
 * free_inodes means no limit is enforced.
 *
 * Returns 1 if an inode may be allocated, 0 if the limit is exhausted.
 */
static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	int granted = 1;

	if (sbinfo->free_inodes < 0)
		return granted;

	spin_lock(&sbinfo->stat_lock);
	if (unlikely(!sbinfo->free_inodes))
		granted = 0;
	else
		sbinfo->free_inodes--;
	spin_unlock(&sbinfo->stat_lock);

	return granted;
}

/* Give one inode back to the free count; a no-op when free_inodes is negative. */
static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes < 0)
		return;

	spin_lock(&sbinfo->stat_lock);
	sbinfo->free_inodes++;
	spin_unlock(&sbinfo->stat_lock);
}


653
static struct kmem_cache *hugetlbfs_inode_cachep;
L
Linus Torvalds 已提交
654 655 656

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
657
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
L
Linus Torvalds 已提交
658 659
	struct hugetlbfs_inode_info *p;

660 661
	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
		return NULL;
662
	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
663 664
	if (unlikely(!p)) {
		hugetlbfs_inc_free_inodes(sbinfo);
L
Linus Torvalds 已提交
665
		return NULL;
666
	}
L
Linus Torvalds 已提交
667 668 669
	return &p->vfs_inode;
}

N
Nick Piggin 已提交
670 671 672 673 674 675
static void hugetlbfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

L
Linus Torvalds 已提交
676 677
static void hugetlbfs_destroy_inode(struct inode *inode)
{
678
	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
L
Linus Torvalds 已提交
679
	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
N
Nick Piggin 已提交
680
	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
L
Linus Torvalds 已提交
681 682
}

683
static const struct address_space_operations hugetlbfs_aops = {
N
Nick Piggin 已提交
684 685
	.write_begin	= hugetlbfs_write_begin,
	.write_end	= hugetlbfs_write_end,
L
Linus Torvalds 已提交
686
	.set_page_dirty	= hugetlbfs_set_page_dirty,
N
Naoya Horiguchi 已提交
687
	.migratepage    = hugetlbfs_migrate_page,
L
Linus Torvalds 已提交
688 689
};

690

691
static void init_once(void *foo)
692 693 694
{
	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;

C
Christoph Lameter 已提交
695
	inode_init_once(&ei->vfs_inode);
696 697
}

698
const struct file_operations hugetlbfs_file_operations = {
A
Al Viro 已提交
699
	.read_iter		= hugetlbfs_read_iter,
L
Linus Torvalds 已提交
700
	.mmap			= hugetlbfs_file_mmap,
701
	.fsync			= noop_fsync,
L
Linus Torvalds 已提交
702
	.get_unmapped_area	= hugetlb_get_unmapped_area,
703
	.llseek		= default_llseek,
L
Linus Torvalds 已提交
704 705
};

706
static const struct inode_operations hugetlbfs_dir_inode_operations = {
L
Linus Torvalds 已提交
707 708 709 710 711 712 713 714 715 716 717 718
	.create		= hugetlbfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= hugetlbfs_symlink,
	.mkdir		= hugetlbfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= hugetlbfs_mknod,
	.rename		= simple_rename,
	.setattr	= hugetlbfs_setattr,
};

719
static const struct inode_operations hugetlbfs_inode_operations = {
L
Linus Torvalds 已提交
720 721 722
	.setattr	= hugetlbfs_setattr,
};

723
static const struct super_operations hugetlbfs_ops = {
L
Linus Torvalds 已提交
724 725
	.alloc_inode    = hugetlbfs_alloc_inode,
	.destroy_inode  = hugetlbfs_destroy_inode,
A
Al Viro 已提交
726
	.evict_inode	= hugetlbfs_evict_inode,
L
Linus Torvalds 已提交
727 728
	.statfs		= hugetlbfs_statfs,
	.put_super	= hugetlbfs_put_super,
M
Miklos Szeredi 已提交
729
	.show_options	= generic_show_options,
L
Linus Torvalds 已提交
730 731 732 733 734
};

static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
735 736 737
	char *p, *rest;
	substring_t args[MAX_OPT_ARGS];
	int option;
738 739
	unsigned long long size = 0;
	enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
L
Linus Torvalds 已提交
740 741 742 743

	if (!options)
		return 0;

744 745
	while ((p = strsep(&options, ",")) != NULL) {
		int token;
746 747
		if (!*p)
			continue;
748 749 750 751 752 753

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_uid:
			if (match_int(&args[0], &option))
 				goto bad_val;
754 755 756
			pconfig->uid = make_kuid(current_user_ns(), option);
			if (!uid_valid(pconfig->uid))
				goto bad_val;
757 758 759 760 761
			break;

		case Opt_gid:
			if (match_int(&args[0], &option))
 				goto bad_val;
762 763 764
			pconfig->gid = make_kgid(current_user_ns(), option);
			if (!gid_valid(pconfig->gid))
				goto bad_val;
765 766 767 768 769
			break;

		case Opt_mode:
			if (match_octal(&args[0], &option))
 				goto bad_val;
770
			pconfig->mode = option & 01777U;
771 772 773 774 775 776 777
			break;

		case Opt_size: {
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			size = memparse(args[0].from, &rest);
778 779 780
			setsize = SIZE_STD;
			if (*rest == '%')
				setsize = SIZE_PERCENT;
781 782
			break;
		}
L
Linus Torvalds 已提交
783

784 785 786 787 788 789 790
		case Opt_nr_inodes:
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			pconfig->nr_inodes = memparse(args[0].from, &rest);
			break;

791 792 793 794 795
		case Opt_pagesize: {
			unsigned long ps;
			ps = memparse(args[0].from, &rest);
			pconfig->hstate = size_to_hstate(ps);
			if (!pconfig->hstate) {
796
				pr_err("Unsupported page size %lu MB\n",
797 798 799 800 801 802
					ps >> 20);
				return -EINVAL;
			}
			break;
		}

803
		default:
804
			pr_err("Bad mount option: \"%s\"\n", p);
805
			return -EINVAL;
806 807
			break;
		}
L
Linus Torvalds 已提交
808
	}
809 810 811 812 813 814 815 816 817 818 819 820

	/* Do size after hstate is set up */
	if (setsize > NO_SIZE) {
		struct hstate *h = pconfig->hstate;
		if (setsize == SIZE_PERCENT) {
			size <<= huge_page_shift(h);
			size *= h->max_huge_pages;
			do_div(size, 100);
		}
		pconfig->nr_blocks = (size >> huge_page_shift(h));
	}

L
Linus Torvalds 已提交
821
	return 0;
822 823

bad_val:
824
	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
825
 	return -EINVAL;
L
Linus Torvalds 已提交
826 827 828 829 830 831 832 833 834
}

static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
	int ret;
	struct hugetlbfs_config config;
	struct hugetlbfs_sb_info *sbinfo;

M
Miklos Szeredi 已提交
835 836
	save_mount_options(sb, data);

L
Linus Torvalds 已提交
837 838
	config.nr_blocks = -1; /* No limit on size by default */
	config.nr_inodes = -1; /* No limit on number of inodes by default */
839 840
	config.uid = current_fsuid();
	config.gid = current_fsgid();
L
Linus Torvalds 已提交
841
	config.mode = 0755;
842
	config.hstate = &default_hstate;
L
Linus Torvalds 已提交
843 844 845 846 847 848 849 850
	ret = hugetlbfs_parse_options(data, &config);
	if (ret)
		return ret;

	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;
	sb->s_fs_info = sbinfo;
851
	sbinfo->hstate = config.hstate;
L
Linus Torvalds 已提交
852 853 854
	spin_lock_init(&sbinfo->stat_lock);
	sbinfo->max_inodes = config.nr_inodes;
	sbinfo->free_inodes = config.nr_inodes;
855 856 857 858 859 860
	sbinfo->spool = NULL;
	if (config.nr_blocks != -1) {
		sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
		if (!sbinfo->spool)
			goto out_free;
	}
L
Linus Torvalds 已提交
861
	sb->s_maxbytes = MAX_LFS_FILESIZE;
862 863
	sb->s_blocksize = huge_page_size(config.hstate);
	sb->s_blocksize_bits = huge_page_shift(config.hstate);
L
Linus Torvalds 已提交
864 865 866
	sb->s_magic = HUGETLBFS_MAGIC;
	sb->s_op = &hugetlbfs_ops;
	sb->s_time_gran = 1;
867 868
	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
	if (!sb->s_root)
L
Linus Torvalds 已提交
869 870 871
		goto out_free;
	return 0;
out_free:
872
	kfree(sbinfo->spool);
L
Linus Torvalds 已提交
873 874 875 876
	kfree(sbinfo);
	return -ENOMEM;
}

A
Al Viro 已提交
877 878
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
L
Linus Torvalds 已提交
879
{
A
Al Viro 已提交
880
	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
L
Linus Torvalds 已提交
881 882 883 884
}

static struct file_system_type hugetlbfs_fs_type = {
	.name		= "hugetlbfs",
A
Al Viro 已提交
885
	.mount		= hugetlbfs_mount,
L
Linus Torvalds 已提交
886 887
	.kill_sb	= kill_litter_super,
};
888
MODULE_ALIAS_FS("hugetlbfs");
L
Linus Torvalds 已提交
889

890
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
L
Linus Torvalds 已提交
891

892
static int can_do_hugetlb_shm(void)
L
Linus Torvalds 已提交
893
{
894 895 896
	kgid_t shm_group;
	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
L
Linus Torvalds 已提交
897 898
}

899 900
static int get_hstate_idx(int page_size_log)
{
901
	struct hstate *h = hstate_sizelog(page_size_log);
902 903 904 905 906 907

	if (!h)
		return -1;
	return h - hstates;
}

908
static const struct dentry_operations anon_ops = {
909
	.d_dname = simple_dname
910 911
};

912 913 914 915 916 917
/*
 * Note that size should be aligned to proper hugepage size in caller side,
 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
 */
struct file *hugetlb_file_setup(const char *name, size_t size,
				vm_flags_t acctflag, struct user_struct **user,
				int creat_flags, int page_size_log)
{
	struct file *file = ERR_PTR(-ENOMEM);
	struct qstr quick_string;
	struct super_block *sb;
	struct inode *inode;
	struct path path;
	int hstate_idx;

	hstate_idx = get_hstate_idx(page_size_log);
	if (hstate_idx < 0)
		return ERR_PTR(-ENODEV);

	*user = NULL;
	/* the internal mount for this page size may be missing */
	if (!hugetlbfs_vfsmount[hstate_idx])
		return ERR_PTR(-ENOENT);

	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
		/* fall back to charging @size against the user's mlock limit */
		*user = current_user();
		if (!user_shm_lock(size, *user)) {
			*user = NULL;
			return ERR_PTR(-EPERM);
		}

		task_lock(current);
		pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
			current->comm, current->pid);
		task_unlock(current);
	}

	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;

	quick_string.name = name;
	quick_string.len = strlen(quick_string.name);
	quick_string.hash = 0;

	path.dentry = d_alloc_pseudo(sb, &quick_string);
	if (!path.dentry)
		goto out_shm_unlock;

	d_set_d_op(path.dentry, &anon_ops);
	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);

	file = ERR_PTR(-ENOSPC);
	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
	if (!inode)
		goto out_dentry;

	file = ERR_PTR(-ENOMEM);
	if (hugetlb_reserve_pages(inode, 0,
			size >> huge_page_shift(hstate_inode(inode)), NULL,
			acctflag))
		goto out_inode;

	d_instantiate(path.dentry, inode);
	inode->i_size = size;
	clear_nlink(inode);

	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
			&hugetlbfs_file_operations);
	if (!IS_ERR(file))
		return file;

	/* inode is already attached */
	goto out_dentry;

out_inode:
	iput(inode);
out_dentry:
	path_put(&path);
out_shm_unlock:
	/* undo the mlock charge taken above, if any */
	if (*user) {
		user_shm_unlock(size, *user);
		*user = NULL;
	}
	return file;
}

/*
 * Module init: create the inode cache, register the filesystem and set up
 * one internal mount per hstate.
 *
 * Only the default hstate's mount is mandatory.  If it cannot be set up,
 * everything done so far is undone: remaining internal mounts are
 * unmounted and the filesystem is unregistered before the inode cache is
 * destroyed, so nothing is left pointing at a freed cache.
 *
 * Returns 0 on success, -ENOTSUPP when huge pages are not supported,
 * -ENOMEM when the cache cannot be created, or the error from
 * register_filesystem() / kern_mount_data().
 */
static int __init init_hugetlbfs_fs(void)
{
	struct hstate *h;
	int error;
	int i;

	if (!hugepages_supported()) {
		pr_info("disabling because there are no supported hugepage sizes\n");
		return -ENOTSUPP;
	}

	error = -ENOMEM;
	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
					sizeof(struct hugetlbfs_inode_info),
					0, 0, init_once);
	if (hugetlbfs_inode_cachep == NULL)
		goto out2;

	error = register_filesystem(&hugetlbfs_fs_type);
	if (error)
		goto out;

	i = 0;
	for_each_hstate(h) {
		char buf[50];
		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
							buf);

		if (IS_ERR(hugetlbfs_vfsmount[i])) {
			pr_err("Cannot mount internal hugetlbfs for "
				"page size %uK\n", ps_kb);
			error = PTR_ERR(hugetlbfs_vfsmount[i]);
			hugetlbfs_vfsmount[i] = NULL;
		}
		i++;
	}
	/* Non default hstates are optional */
	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
		return 0;

	/* The mandatory mount failed: drop any optional mounts that succeeded */
	for (i = 0; i < HUGE_MAX_HSTATE; i++) {
		if (hugetlbfs_vfsmount[i]) {
			kern_unmount(hugetlbfs_vfsmount[i]);
			hugetlbfs_vfsmount[i] = NULL;
		}
	}
	unregister_filesystem(&hugetlbfs_fs_type);
	/* inodes are freed via call_rcu(); wait before destroying the cache */
	rcu_barrier();

 out:
	kmem_cache_destroy(hugetlbfs_inode_cachep);
 out2:
	return error;
}

/*
 * Module exit: tear down in the reverse order of init_hugetlbfs_fs().
 *
 * The internal mounts are unmounted first, because their inodes are
 * allocated from hugetlbfs_inode_cachep and the cache must not be
 * destroyed while objects from it are still live.
 */
static void __exit exit_hugetlbfs_fs(void)
{
	struct hstate *h;
	int i;

	i = 0;
	for_each_hstate(h)
		kern_unmount(hugetlbfs_vfsmount[i++]);
	unregister_filesystem(&hugetlbfs_fs_type);

	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(hugetlbfs_inode_cachep);
}

module_init(init_hugetlbfs_fs)
module_exit(exit_hugetlbfs_fs)

MODULE_LICENSE("GPL");