inode.c 26.9 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3
/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
4
 * Nadia Yvette Chambers, 2002
L
Linus Torvalds 已提交
5 6 7 8
 *
 * Copyright (C) 2002 Linus Torvalds.
 */

9 10
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

L
Linus Torvalds 已提交
11 12 13 14 15 16 17
#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h>		/* remove ASAP */
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
18
#include <linux/kernel.h>
L
Linus Torvalds 已提交
19 20 21 22 23
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
24
#include <linux/capability.h>
25
#include <linux/ctype.h>
L
Linus Torvalds 已提交
26 27 28
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
29
#include <linux/parser.h>
30
#include <linux/mman.h>
L
Linus Torvalds 已提交
31 32 33 34
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
N
Nick Black 已提交
35
#include <linux/magic.h>
N
Naoya Horiguchi 已提交
36
#include <linux/migrate.h>
A
Al Viro 已提交
37
#include <linux/uio.h>
L
Linus Torvalds 已提交
38 39 40

#include <asm/uaccess.h>

41
static const struct super_operations hugetlbfs_ops;
42
static const struct address_space_operations hugetlbfs_aops;
43
const struct file_operations hugetlbfs_file_operations;
44 45
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
L
Linus Torvalds 已提交
46

D
David Gibson 已提交
47
struct hugetlbfs_config {
48 49
	kuid_t   uid;
	kgid_t   gid;
D
David Gibson 已提交
50
	umode_t mode;
51
	long	max_hpages;
D
David Gibson 已提交
52 53
	long	nr_inodes;
	struct hstate *hstate;
54
	long    min_hpages;
D
David Gibson 已提交
55 56 57 58 59 60 61 62 63 64 65 66
};

/*
 * Per-inode private data.  The VFS inode is embedded so that HUGETLBFS_I()
 * can get back to this structure with container_of().
 */
struct hugetlbfs_inode_info {
	struct shared_policy policy;	/* set up by mpol_shared_policy_init() */
	struct inode vfs_inode;		/* the inode handed to the VFS */
};

/* Map a VFS inode back to the hugetlbfs_inode_info that embeds it. */
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
	struct hugetlbfs_inode_info *info;

	info = container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
	return info;
}

L
Linus Torvalds 已提交
67 68
/* Group id that can_do_hugetlb_shm() turns into a kgid for in_group_p(). */
int sysctl_hugetlb_shm_group;

69 70 71
/* Mount option identifiers; matched against the patterns in tokens[]. */
enum {
	Opt_size,
	Opt_nr_inodes,
	Opt_mode,
	Opt_uid,
	Opt_gid,
	Opt_pagesize,
	Opt_min_size,
	Opt_err,
};

76
static const match_table_t tokens = {
77 78 79 80 81
	{Opt_size,	"size=%s"},
	{Opt_nr_inodes,	"nr_inodes=%s"},
	{Opt_mode,	"mode=%o"},
	{Opt_uid,	"uid=%u"},
	{Opt_gid,	"gid=%u"},
82
	{Opt_pagesize,	"pagesize=%s"},
83
	{Opt_min_size,	"min_size=%s"},
84 85 86
	{Opt_err,	NULL},
};

87 88 89 90 91 92 93 94 95 96
/* Drop one reference on every page held in @pvec, then reset the vector. */
static void huge_pagevec_release(struct pagevec *pvec)
{
	int idx = 0;

	while (idx < pagevec_count(pvec)) {
		put_page(pvec->pages[idx]);
		idx++;
	}

	pagevec_reinit(pvec);
}

L
Linus Torvalds 已提交
97 98
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
A
Al Viro 已提交
99
	struct inode *inode = file_inode(file);
L
Linus Torvalds 已提交
100 101
	loff_t len, vma_len;
	int ret;
102
	struct hstate *h = hstate_file(file);
L
Linus Torvalds 已提交
103

104
	/*
105 106 107 108 109 110
	 * vma address alignment (but not the pgoff alignment) has
	 * already been checked by prepare_hugepage_range.  If you add
	 * any error returns here, do so after setting VM_HUGETLB, so
	 * is_vm_hugetlb_page tests below unmap_region go the right
	 * way when do_mmap_pgoff unwinds (may be important on powerpc
	 * and ia64).
111
	 */
112
	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
113
	vma->vm_ops = &hugetlb_vm_ops;
L
Linus Torvalds 已提交
114

115
	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
116 117
		return -EINVAL;

L
Linus Torvalds 已提交
118 119
	vma_len = (loff_t)(vma->vm_end - vma->vm_start);

120
	mutex_lock(&inode->i_mutex);
L
Linus Torvalds 已提交
121 122 123 124 125
	file_accessed(file);

	ret = -ENOMEM;
	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

126
	if (hugetlb_reserve_pages(inode,
127
				vma->vm_pgoff >> huge_page_order(h),
128 129
				len >> huge_page_shift(h), vma,
				vma->vm_flags))
130
		goto out;
131

A
Adam Litke 已提交
132
	ret = 0;
133
	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
L
Linus Torvalds 已提交
134 135
		inode->i_size = len;
out:
136
	mutex_unlock(&inode->i_mutex);
L
Linus Torvalds 已提交
137 138 139 140 141

	return ret;
}

/*
142
 * Called under down_write(mmap_sem).
L
Linus Torvalds 已提交
143 144
 */

145
#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
L
Linus Torvalds 已提交
146 147 148 149 150 151
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
152
	struct hstate *h = hstate_file(file);
153
	struct vm_unmapped_area_info info;
L
Linus Torvalds 已提交
154

155
	if (len & ~huge_page_mask(h))
L
Linus Torvalds 已提交
156 157 158 159
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

160
	if (flags & MAP_FIXED) {
161
		if (prepare_hugepage_range(file, addr, len))
162 163 164 165
			return -EINVAL;
		return addr;
	}

L
Linus Torvalds 已提交
166
	if (addr) {
167
		addr = ALIGN(addr, huge_page_size(h));
L
Linus Torvalds 已提交
168 169 170 171 172 173
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}

174 175 176 177 178 179 180
	info.flags = 0;
	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	return vm_unmapped_area(&info);
L
Linus Torvalds 已提交
181 182 183
}
#endif

A
Al Viro 已提交
184
static size_t
B
Badari Pulavarty 已提交
185
hugetlbfs_read_actor(struct page *page, unsigned long offset,
A
Al Viro 已提交
186
			struct iov_iter *to, unsigned long size)
B
Badari Pulavarty 已提交
187
{
A
Al Viro 已提交
188
	size_t copied = 0;
B
Badari Pulavarty 已提交
189 190 191 192 193 194 195
	int i, chunksize;

	/* Find which 4k chunk and offset with in that chunk */
	i = offset >> PAGE_CACHE_SHIFT;
	offset = offset & ~PAGE_CACHE_MASK;

	while (size) {
A
Al Viro 已提交
196
		size_t n;
B
Badari Pulavarty 已提交
197 198 199 200 201
		chunksize = PAGE_CACHE_SIZE;
		if (offset)
			chunksize -= offset;
		if (chunksize > size)
			chunksize = size;
A
Al Viro 已提交
202 203 204 205
		n = copy_page_to_iter(&page[i], offset, chunksize, to);
		copied += n;
		if (n != chunksize)
			return copied;
B
Badari Pulavarty 已提交
206 207 208 209
		offset = 0;
		size -= chunksize;
		i++;
	}
A
Al Viro 已提交
210
	return copied;
B
Badari Pulavarty 已提交
211 212 213 214 215 216 217
}

/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data. Its *very* similar to do_generic_mapping_read(), we can't use that
 * since it has PAGE_CACHE_SIZE assumptions.
 */
A
Al Viro 已提交
218
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
B
Badari Pulavarty 已提交
219
{
A
Al Viro 已提交
220 221 222
	struct file *file = iocb->ki_filp;
	struct hstate *h = hstate_file(file);
	struct address_space *mapping = file->f_mapping;
B
Badari Pulavarty 已提交
223
	struct inode *inode = mapping->host;
A
Al Viro 已提交
224 225
	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
B
Badari Pulavarty 已提交
226 227 228 229
	unsigned long end_index;
	loff_t isize;
	ssize_t retval = 0;

A
Al Viro 已提交
230
	while (iov_iter_count(to)) {
B
Badari Pulavarty 已提交
231
		struct page *page;
A
Al Viro 已提交
232
		size_t nr, copied;
B
Badari Pulavarty 已提交
233 234

		/* nr is the maximum number of bytes to copy from this page */
235
		nr = huge_page_size(h);
236 237
		isize = i_size_read(inode);
		if (!isize)
A
Al Viro 已提交
238
			break;
239
		end_index = (isize - 1) >> huge_page_shift(h);
A
Al Viro 已提交
240 241 242
		if (index > end_index)
			break;
		if (index == end_index) {
243
			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
244
			if (nr <= offset)
A
Al Viro 已提交
245
				break;
B
Badari Pulavarty 已提交
246 247 248 249
		}
		nr = nr - offset;

		/* Find the page */
250
		page = find_lock_page(mapping, index);
B
Badari Pulavarty 已提交
251 252 253 254 255
		if (unlikely(page == NULL)) {
			/*
			 * We have a HOLE, zero out the user-buffer for the
			 * length of the hole or request.
			 */
A
Al Viro 已提交
256
			copied = iov_iter_zero(nr, to);
B
Badari Pulavarty 已提交
257
		} else {
258 259
			unlock_page(page);

B
Badari Pulavarty 已提交
260 261 262
			/*
			 * We have the page, copy it to user space buffer.
			 */
A
Al Viro 已提交
263
			copied = hugetlbfs_read_actor(page, offset, to, nr);
264
			page_cache_release(page);
B
Badari Pulavarty 已提交
265
		}
A
Al Viro 已提交
266 267 268 269 270 271
		offset += copied;
		retval += copied;
		if (copied != nr && iov_iter_count(to)) {
			if (!retval)
				retval = -EFAULT;
			break;
B
Badari Pulavarty 已提交
272
		}
273 274
		index += offset >> huge_page_shift(h);
		offset &= ~huge_page_mask(h);
B
Badari Pulavarty 已提交
275
	}
A
Al Viro 已提交
276
	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
B
Badari Pulavarty 已提交
277 278 279
	return retval;
}

N
Nick Piggin 已提交
280 281 282 283
/* ->write_begin: always fails with -EINVAL; no arguments are examined. */
static int hugetlbfs_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	return -EINVAL;
}

N
Nick Piggin 已提交
288 289 290
/*
 * ->write_end: hugetlbfs_write_begin() never succeeds, so being called here
 * is treated as a kernel bug.
 */
static int hugetlbfs_write_end(struct file *file,
			       struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned copied,
			       struct page *page, void *fsdata)
{
	BUG();
	return -EINVAL;
}

/*
 * Clear the dirty and uptodate flags of @page and remove it from the page
 * cache.  truncate_hugepages() calls this with the page locked.
 */
static void truncate_huge_page(struct page *page)
{
	ClearPageDirty(page);
	ClearPageUptodate(page);

	delete_from_page_cache(page);
}

303
/*
 * Remove every page cache page of @inode at or beyond byte offset @lstart
 * and hand the number of removed pages to hugetlb_unreserve_pages().
 *
 * The lookup restarts from the first index after each exhausted pass and
 * only stops once a pass starting at that index finds nothing.
 */
static void truncate_hugepages(struct inode *inode, loff_t lstart)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
	const pgoff_t start = lstart >> huge_page_shift(h);
	struct pagevec pvec;
	pgoff_t next = start;
	int freed = 0;
	int i;

	pagevec_init(&pvec, 0);
	for (;;) {
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			/* Nothing found from the very start: we are done. */
			if (next == start)
				break;
			/* Otherwise rescan from the beginning. */
			next = start;
			continue;
		}

		for (i = 0; i < pagevec_count(&pvec); ++i) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			/* Continue the next lookup just past this page. */
			if (page->index > next)
				next = page->index;
			++next;
			truncate_huge_page(page);
			unlock_page(page);
			freed++;
		}
		huge_pagevec_release(&pvec);
	}

	/* A full truncate must leave the mapping empty. */
	BUG_ON(!lstart && mapping->nrpages);
	hugetlb_unreserve_pages(inode, start, freed);
}

A
Al Viro 已提交
339
static void hugetlbfs_evict_inode(struct inode *inode)
L
Linus Torvalds 已提交
340
{
341 342
	struct resv_map *resv_map;

343
	truncate_hugepages(inode, 0);
344 345 346 347
	resv_map = (struct resv_map *)inode->i_mapping->private_data;
	/* root inode doesn't have the resv_map, so we should check it */
	if (resv_map)
		resv_map_release(&resv_map->refs);
348
	clear_inode(inode);
349 350
}

L
Linus Torvalds 已提交
351
static inline void
352
hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
L
Linus Torvalds 已提交
353 354 355
{
	struct vm_area_struct *vma;

356
	vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
L
Linus Torvalds 已提交
357 358 359
		unsigned long v_offset;

		/*
H
Hugh Dickins 已提交
360
		 * Can the expression below overflow on 32-bit arches?
361
		 * No, because the interval tree returns us only those vmas
H
Hugh Dickins 已提交
362 363
		 * which overlap the truncated area starting at pgoff,
		 * and no vma on a 32-bit arch can span beyond the 4GB.
L
Linus Torvalds 已提交
364
		 */
H
Hugh Dickins 已提交
365 366 367
		if (vma->vm_pgoff < pgoff)
			v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
		else
L
Linus Torvalds 已提交
368 369
			v_offset = 0;

370 371
		unmap_hugepage_range(vma, vma->vm_start + v_offset,
				     vma->vm_end, NULL);
L
Linus Torvalds 已提交
372 373 374 375 376
	}
}

/*
 * Truncate @inode to @offset, which must be huge page aligned: publish the
 * new size, unmap the truncated range from all mappings under the i_mmap
 * write lock, then drop the page cache pages.  Always returns 0.
 */
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t pgoff;

	BUG_ON(offset & ~huge_page_mask(h));
	pgoff = offset >> PAGE_SHIFT;

	i_size_write(inode, offset);

	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
	i_mmap_unlock_write(mapping);

	truncate_hugepages(inode, offset);
	return 0;
}

static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
395
	struct inode *inode = d_inode(dentry);
396
	struct hstate *h = hstate_inode(inode);
L
Linus Torvalds 已提交
397 398 399 400 401 402 403
	int error;
	unsigned int ia_valid = attr->ia_valid;

	BUG_ON(!inode);

	error = inode_change_ok(inode, attr);
	if (error)
C
Christoph Hellwig 已提交
404
		return error;
L
Linus Torvalds 已提交
405 406 407

	if (ia_valid & ATTR_SIZE) {
		error = -EINVAL;
C
Christoph Hellwig 已提交
408 409 410
		if (attr->ia_size & ~huge_page_mask(h))
			return -EINVAL;
		error = hugetlb_vmtruncate(inode, attr->ia_size);
L
Linus Torvalds 已提交
411
		if (error)
C
Christoph Hellwig 已提交
412
			return error;
L
Linus Torvalds 已提交
413
	}
C
Christoph Hellwig 已提交
414 415 416 417

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
L
Linus Torvalds 已提交
418 419
}

420 421
static struct inode *hugetlbfs_get_root(struct super_block *sb,
					struct hugetlbfs_config *config)
L
Linus Torvalds 已提交
422 423 424 425 426 427
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
428
		inode->i_ino = get_next_ino();
429 430 431 432 433 434 435 436 437 438
		inode->i_mode = S_IFDIR | config->mode;
		inode->i_uid = config->uid;
		inode->i_gid = config->gid;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		info = HUGETLBFS_I(inode);
		mpol_shared_policy_init(&info->policy, NULL);
		inode->i_op = &hugetlbfs_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		/* directory inodes start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
439
		lockdep_annotate_inode_mutex_key(inode);
440 441 442 443
	}
	return inode;
}

444
/*
445
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
446 447
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under
448
 * i_mmap_rwsem.
449
 */
450
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
451

452 453
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
					struct inode *dir,
A
Al Viro 已提交
454
					umode_t mode, dev_t dev)
455 456
{
	struct inode *inode;
457 458 459 460 461
	struct resv_map *resv_map;

	resv_map = resv_map_alloc();
	if (!resv_map)
		return NULL;
462 463 464 465 466 467

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
468 469
		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
				&hugetlbfs_i_mmap_rwsem_key);
L
Linus Torvalds 已提交
470 471
		inode->i_mapping->a_ops = &hugetlbfs_aops;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
472
		inode->i_mapping->private_data = resv_map;
L
Linus Torvalds 已提交
473
		info = HUGETLBFS_I(inode);
474 475 476 477 478 479 480
		/*
		 * The policy is initialized here even if we are creating a
		 * private inode because initialization simply creates an
		 * an empty rb tree and calls spin_lock_init(), later when we
		 * call mpol_free_shared_policy() it will just return because
		 * the rb tree will still be empty.
		 */
481
		mpol_shared_policy_init(&info->policy, NULL);
L
Linus Torvalds 已提交
482 483 484 485 486 487 488 489 490 491 492 493 494
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &hugetlbfs_inode_operations;
			inode->i_fop = &hugetlbfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &hugetlbfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
495
			inc_nlink(inode);
L
Linus Torvalds 已提交
496 497 498 499 500
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			break;
		}
501
		lockdep_annotate_inode_mutex_key(inode);
502 503 504
	} else
		kref_put(&resv_map->refs, resv_map_release);

L
Linus Torvalds 已提交
505 506 507 508 509 510 511
	return inode;
}

/*
 * File creation.  Allocate an inode, attach it to @dentry and pin the
 * dentry.  Returns 0, or -ENOSPC if no inode could be obtained.
 */
static int hugetlbfs_mknod(struct inode *dir,
			struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);

	if (!inode)
		return -ENOSPC;

	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */
	return 0;
}

527
/* ->mkdir: create a directory inode and bump the parent's link count. */
static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int retval;

	retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
	if (retval)
		return retval;

	inc_nlink(dir);
	return 0;
}

A
Al Viro 已提交
535
/* ->create: make a regular file through hugetlbfs_mknod(); @excl is unused. */
static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
	int err = hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);

	return err;
}

static int hugetlbfs_symlink(struct inode *dir,
			struct dentry *dentry, const char *symname)
{
	struct inode *inode;
	int error = -ENOSPC;

546
	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
L
Linus Torvalds 已提交
547 548 549 550 551 552 553 554 555 556 557 558 559 560 561
	if (inode) {
		int l = strlen(symname)+1;
		error = page_symlink(inode, symname, l);
		if (!error) {
			d_instantiate(dentry, inode);
			dget(dentry);
		} else
			iput(inode);
	}
	dir->i_ctime = dir->i_mtime = CURRENT_TIME;

	return error;
}

/*
 * ->set_page_dirty: mark the head page of the compound page dirty.
 * Always returns 0.
 */
static int hugetlbfs_set_page_dirty(struct page *page)
{
	SetPageDirty(compound_head(page));
	return 0;
}

N
Naoya Horiguchi 已提交
572
static int hugetlbfs_migrate_page(struct address_space *mapping,
573
				struct page *newpage, struct page *page,
574
				enum migrate_mode mode)
N
Naoya Horiguchi 已提交
575 576 577 578
{
	int rc;

	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
579
	if (rc != MIGRATEPAGE_SUCCESS)
N
Naoya Horiguchi 已提交
580 581 582
		return rc;
	migrate_page_copy(newpage, page);

583
	return MIGRATEPAGE_SUCCESS;
N
Naoya Horiguchi 已提交
584 585
}

586
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
L
Linus Torvalds 已提交
587
{
588
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
589
	struct hstate *h = hstate_inode(d_inode(dentry));
L
Linus Torvalds 已提交
590 591

	buf->f_type = HUGETLBFS_MAGIC;
592
	buf->f_bsize = huge_page_size(h);
L
Linus Torvalds 已提交
593 594
	if (sbinfo) {
		spin_lock(&sbinfo->stat_lock);
595 596
		/* If no limits set, just report 0 for max/free/used
		 * blocks, like simple_statfs() */
597 598 599 600 601 602 603 604 605
		if (sbinfo->spool) {
			long free_pages;

			spin_lock(&sbinfo->spool->lock);
			buf->f_blocks = sbinfo->spool->max_hpages;
			free_pages = sbinfo->spool->max_hpages
				- sbinfo->spool->used_hpages;
			buf->f_bavail = buf->f_bfree = free_pages;
			spin_unlock(&sbinfo->spool->lock);
606 607 608
			buf->f_files = sbinfo->max_inodes;
			buf->f_ffree = sbinfo->free_inodes;
		}
L
Linus Torvalds 已提交
609 610 611 612 613 614 615 616 617 618 619 620
		spin_unlock(&sbinfo->stat_lock);
	}
	buf->f_namelen = NAME_MAX;
	return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

	if (sbi) {
		sb->s_fs_info = NULL;
621 622 623 624

		if (sbi->spool)
			hugepage_put_subpool(sbi->spool);

L
Linus Torvalds 已提交
625 626 627 628
		kfree(sbi);
	}
}

629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653
/*
 * Take one inode from the mount's quota.  A negative free_inodes count
 * means no limit.  Returns 1 if an inode may be allocated, 0 if the quota
 * is exhausted.
 */
static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	int granted = 1;

	if (sbinfo->free_inodes < 0)
		return 1;

	spin_lock(&sbinfo->stat_lock);
	if (likely(sbinfo->free_inodes))
		sbinfo->free_inodes--;
	else
		granted = 0;
	spin_unlock(&sbinfo->stat_lock);

	return granted;
}

/* Return one inode to the mount's quota; a negative count means no limit. */
static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes < 0)
		return;

	spin_lock(&sbinfo->stat_lock);
	sbinfo->free_inodes++;
	spin_unlock(&sbinfo->stat_lock);
}


654
/* Slab cache that hugetlbfs_inode_info objects are allocated from and freed to. */
static struct kmem_cache *hugetlbfs_inode_cachep;
L
Linus Torvalds 已提交
655 656 657

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
658
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
L
Linus Torvalds 已提交
659 660
	struct hugetlbfs_inode_info *p;

661 662
	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
		return NULL;
663
	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
664 665
	if (unlikely(!p)) {
		hugetlbfs_inc_free_inodes(sbinfo);
L
Linus Torvalds 已提交
666
		return NULL;
667
	}
L
Linus Torvalds 已提交
668 669 670
	return &p->vfs_inode;
}

N
Nick Piggin 已提交
671 672 673 674 675 676
static void hugetlbfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

L
Linus Torvalds 已提交
677 678
static void hugetlbfs_destroy_inode(struct inode *inode)
{
679
	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
L
Linus Torvalds 已提交
680
	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
N
Nick Piggin 已提交
681
	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
L
Linus Torvalds 已提交
682 683
}

684
static const struct address_space_operations hugetlbfs_aops = {
N
Nick Piggin 已提交
685 686
	.write_begin	= hugetlbfs_write_begin,
	.write_end	= hugetlbfs_write_end,
L
Linus Torvalds 已提交
687
	.set_page_dirty	= hugetlbfs_set_page_dirty,
N
Naoya Horiguchi 已提交
688
	.migratepage    = hugetlbfs_migrate_page,
L
Linus Torvalds 已提交
689 690
};

691

692
static void init_once(void *foo)
693 694 695
{
	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;

C
Christoph Lameter 已提交
696
	inode_init_once(&ei->vfs_inode);
697 698
}

699
const struct file_operations hugetlbfs_file_operations = {
A
Al Viro 已提交
700
	.read_iter		= hugetlbfs_read_iter,
L
Linus Torvalds 已提交
701
	.mmap			= hugetlbfs_file_mmap,
702
	.fsync			= noop_fsync,
L
Linus Torvalds 已提交
703
	.get_unmapped_area	= hugetlb_get_unmapped_area,
704
	.llseek		= default_llseek,
L
Linus Torvalds 已提交
705 706
};

707
static const struct inode_operations hugetlbfs_dir_inode_operations = {
L
Linus Torvalds 已提交
708 709 710 711 712 713 714 715 716 717 718 719
	.create		= hugetlbfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= hugetlbfs_symlink,
	.mkdir		= hugetlbfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= hugetlbfs_mknod,
	.rename		= simple_rename,
	.setattr	= hugetlbfs_setattr,
};

720
static const struct inode_operations hugetlbfs_inode_operations = {
L
Linus Torvalds 已提交
721 722 723
	.setattr	= hugetlbfs_setattr,
};

724
static const struct super_operations hugetlbfs_ops = {
L
Linus Torvalds 已提交
725 726
	.alloc_inode    = hugetlbfs_alloc_inode,
	.destroy_inode  = hugetlbfs_destroy_inode,
A
Al Viro 已提交
727
	.evict_inode	= hugetlbfs_evict_inode,
L
Linus Torvalds 已提交
728 729
	.statfs		= hugetlbfs_statfs,
	.put_super	= hugetlbfs_put_super,
M
Miklos Szeredi 已提交
730
	.show_options	= generic_show_options,
L
Linus Torvalds 已提交
731 732
};

733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756
/*
 * How a size option was given: not at all, in bytes (SIZE_STD), or as a
 * percentage of the huge page pool (SIZE_PERCENT).
 */
enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };

/*
 * Convert size option passed from command line to number of huge pages
 * in the pool specified by hstate.  Size option could be in bytes
 * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
 * Returns -1 when no size was given (val_type == NO_SIZE).
 */
static long long
hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
								int val_type)
{
	switch (val_type) {
	case NO_SIZE:
		return -1;
	case SIZE_PERCENT:
		/* Turn "percent of pool" into bytes first. */
		size_opt <<= huge_page_shift(h);
		size_opt *= h->max_huge_pages;
		do_div(size_opt, 100);
		break;
	default:
		break;
	}

	/* Bytes to whole huge pages, rounding down. */
	size_opt >>= huge_page_shift(h);
	return size_opt;
}

L
Linus Torvalds 已提交
757 758 759
static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
760 761 762
	char *p, *rest;
	substring_t args[MAX_OPT_ARGS];
	int option;
763 764
	unsigned long long max_size_opt = 0, min_size_opt = 0;
	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;
L
Linus Torvalds 已提交
765 766 767 768

	if (!options)
		return 0;

769 770
	while ((p = strsep(&options, ",")) != NULL) {
		int token;
771 772
		if (!*p)
			continue;
773 774 775 776 777 778

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_uid:
			if (match_int(&args[0], &option))
 				goto bad_val;
779 780 781
			pconfig->uid = make_kuid(current_user_ns(), option);
			if (!uid_valid(pconfig->uid))
				goto bad_val;
782 783 784 785 786
			break;

		case Opt_gid:
			if (match_int(&args[0], &option))
 				goto bad_val;
787 788 789
			pconfig->gid = make_kgid(current_user_ns(), option);
			if (!gid_valid(pconfig->gid))
				goto bad_val;
790 791 792 793 794
			break;

		case Opt_mode:
			if (match_octal(&args[0], &option))
 				goto bad_val;
795
			pconfig->mode = option & 01777U;
796 797 798 799 800 801
			break;

		case Opt_size: {
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
802 803
			max_size_opt = memparse(args[0].from, &rest);
			max_val_type = SIZE_STD;
804
			if (*rest == '%')
805
				max_val_type = SIZE_PERCENT;
806 807
			break;
		}
L
Linus Torvalds 已提交
808

809 810 811 812 813 814 815
		case Opt_nr_inodes:
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			pconfig->nr_inodes = memparse(args[0].from, &rest);
			break;

816 817 818 819 820
		case Opt_pagesize: {
			unsigned long ps;
			ps = memparse(args[0].from, &rest);
			pconfig->hstate = size_to_hstate(ps);
			if (!pconfig->hstate) {
821
				pr_err("Unsupported page size %lu MB\n",
822 823 824 825 826 827
					ps >> 20);
				return -EINVAL;
			}
			break;
		}

828 829 830 831 832 833 834 835 836 837 838
		case Opt_min_size: {
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			min_size_opt = memparse(args[0].from, &rest);
			min_val_type = SIZE_STD;
			if (*rest == '%')
				min_val_type = SIZE_PERCENT;
			break;
		}

839
		default:
840
			pr_err("Bad mount option: \"%s\"\n", p);
841
			return -EINVAL;
842 843
			break;
		}
L
Linus Torvalds 已提交
844
	}
845

846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861
	/*
	 * Use huge page pool size (in hstate) to convert the size
	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
	 */
	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
						max_size_opt, max_val_type);
	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
						min_size_opt, min_val_type);

	/*
	 * If max_size was specified, then min_size must be smaller
	 */
	if (max_val_type > NO_SIZE &&
	    pconfig->min_hpages > pconfig->max_hpages) {
		pr_err("minimum size can not be greater than maximum size\n");
		return -EINVAL;
862 863
	}

L
Linus Torvalds 已提交
864
	return 0;
865 866

bad_val:
867
	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
868
 	return -EINVAL;
L
Linus Torvalds 已提交
869 870 871 872 873 874 875 876 877
}

static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
	int ret;
	struct hugetlbfs_config config;
	struct hugetlbfs_sb_info *sbinfo;

M
Miklos Szeredi 已提交
878 879
	save_mount_options(sb, data);

880
	config.max_hpages = -1; /* No limit on size by default */
L
Linus Torvalds 已提交
881
	config.nr_inodes = -1; /* No limit on number of inodes by default */
882 883
	config.uid = current_fsuid();
	config.gid = current_fsgid();
L
Linus Torvalds 已提交
884
	config.mode = 0755;
885
	config.hstate = &default_hstate;
886
	config.min_hpages = -1; /* No default minimum size */
L
Linus Torvalds 已提交
887 888 889 890 891 892 893 894
	ret = hugetlbfs_parse_options(data, &config);
	if (ret)
		return ret;

	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;
	sb->s_fs_info = sbinfo;
895
	sbinfo->hstate = config.hstate;
L
Linus Torvalds 已提交
896 897 898
	spin_lock_init(&sbinfo->stat_lock);
	sbinfo->max_inodes = config.nr_inodes;
	sbinfo->free_inodes = config.nr_inodes;
899
	sbinfo->spool = NULL;
900 901 902 903 904 905 906 907 908
	/*
	 * Allocate and initialize subpool if maximum or minimum size is
	 * specified.  Any needed reservations (for minimim size) are taken
	 * taken when the subpool is created.
	 */
	if (config.max_hpages != -1 || config.min_hpages != -1) {
		sbinfo->spool = hugepage_new_subpool(config.hstate,
							config.max_hpages,
							config.min_hpages);
909 910 911
		if (!sbinfo->spool)
			goto out_free;
	}
L
Linus Torvalds 已提交
912
	sb->s_maxbytes = MAX_LFS_FILESIZE;
913 914
	sb->s_blocksize = huge_page_size(config.hstate);
	sb->s_blocksize_bits = huge_page_shift(config.hstate);
L
Linus Torvalds 已提交
915 916 917
	sb->s_magic = HUGETLBFS_MAGIC;
	sb->s_op = &hugetlbfs_ops;
	sb->s_time_gran = 1;
918 919
	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
	if (!sb->s_root)
L
Linus Torvalds 已提交
920 921 922
		goto out_free;
	return 0;
out_free:
923
	kfree(sbinfo->spool);
L
Linus Torvalds 已提交
924 925 926 927
	kfree(sbinfo);
	return -ENOMEM;
}

A
Al Viro 已提交
928 929
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
L
Linus Torvalds 已提交
930
{
A
Al Viro 已提交
931
	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
L
Linus Torvalds 已提交
932 933 934 935
}

static struct file_system_type hugetlbfs_fs_type = {
	.name		= "hugetlbfs",
A
Al Viro 已提交
936
	.mount		= hugetlbfs_mount,
L
Linus Torvalds 已提交
937 938
	.kill_sb	= kill_litter_super,
};
939
MODULE_ALIAS_FS("hugetlbfs");
L
Linus Torvalds 已提交
940

941
/* One mount slot per hstate index; hugetlb_file_setup() looks its mount up here. */
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
L
Linus Torvalds 已提交
942

943
static int can_do_hugetlb_shm(void)
L
Linus Torvalds 已提交
944
{
945 946 947
	kgid_t shm_group;
	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
L
Linus Torvalds 已提交
948 949
}

950 951
static int get_hstate_idx(int page_size_log)
{
952
	struct hstate *h = hstate_sizelog(page_size_log);
953 954 955 956 957 958

	if (!h)
		return -1;
	return h - hstates;
}

959
static const struct dentry_operations anon_ops = {
960
	.d_dname = simple_dname
961 962
};

963 964 965 966 967 968
/*
 * hugetlb_file_setup - create an unlinked file on an internal hugetlbfs mount
 * @name:          name given to the pseudo dentry backing the file
 * @size:          size in bytes; stored in i_size and used for the reservation
 * @acctflag:      VM flags forwarded to hugetlb_reserve_pages()
 * @user:          out parameter.  Always written: it is NULL on return unless
 *                 a user_shm_lock() charge taken here is still held, in which
 *                 case it names the charged user.
 * @creat_flags:   when HUGETLB_SHMFS_INODE, callers that fail
 *                 can_do_hugetlb_shm() must pass user_shm_lock() instead
 * @page_size_log: selects the hstate, and thereby the internal mount, to use
 *
 * Note that size should be aligned to proper hugepage size in caller side,
 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
 *
 * Returns the new file on success, otherwise an ERR_PTR():
 *   -ENODEV  no hstate matches @page_size_log
 *   -ENOENT  there is no internal mount for that hstate
 *   -EPERM   the caller failed can_do_hugetlb_shm() and user_shm_lock()
 *   -ENOMEM  the dentry allocation or the page reservation failed
 *   -ENOSPC  no inode could be obtained
 *   or whatever error alloc_file() reported.
 * On every failure any user_shm_lock() charge taken here is released and
 * *@user is NULL.
 */
struct file *hugetlb_file_setup(const char *name, size_t size,
				vm_flags_t acctflag, struct user_struct **user,
				int creat_flags, int page_size_log)
{
	struct file *file = ERR_PTR(-ENOMEM);
	struct inode *inode;
	struct path path;
	struct super_block *sb;
	struct qstr quick_string;
	int hstate_idx;

	/*
	 * Initialise the out parameter before the first return so that no
	 * error path hands back an indeterminate pointer.
	 */
	*user = NULL;

	hstate_idx = get_hstate_idx(page_size_log);
	if (hstate_idx < 0)
		return ERR_PTR(-ENODEV);

	if (!hugetlbfs_vfsmount[hstate_idx])
		return ERR_PTR(-ENOENT);

	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
		*user = current_user();
		if (user_shm_lock(size, *user)) {
			/* task_lock() is held while current->comm is read */
			task_lock(current);
			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
				current->comm, current->pid);
			task_unlock(current);
		} else {
			*user = NULL;
			return ERR_PTR(-EPERM);
		}
	}

	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
	quick_string.name = name;
	quick_string.len = strlen(quick_string.name);
	quick_string.hash = 0;
	path.dentry = d_alloc_pseudo(sb, &quick_string);
	if (!path.dentry)
		goto out_shm_unlock;

	d_set_d_op(path.dentry, &anon_ops);
	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
	file = ERR_PTR(-ENOSPC);
	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
	if (!inode)
		goto out_dentry;

	file = ERR_PTR(-ENOMEM);
	if (hugetlb_reserve_pages(inode, 0,
			size >> huge_page_shift(hstate_inode(inode)), NULL,
			acctflag))
		goto out_inode;

	d_instantiate(path.dentry, inode);
	inode->i_size = size;
	clear_nlink(inode);

	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
			&hugetlbfs_file_operations);
	if (IS_ERR(file))
		goto out_dentry; /* inode is already attached */

	return file;

out_inode:
	/* The inode was never attached to the dentry, so drop it by hand. */
	iput(inode);
out_dentry:
	path_put(&path);
out_shm_unlock:
	if (*user) {
		user_shm_unlock(size, *user);
		*user = NULL;
	}
	return file;
}

/*
 * Module init: create the inode cache, register the filesystem type and
 * create one internal kernel mount per hstate, passing "pagesize=<N>K" as
 * mount data.
 *
 * Only the mount for default_hstate_idx is required; a failure to mount any
 * other hstate is logged and leaves that slot of hugetlbfs_vfsmount[] NULL.
 *
 * Returns 0 on success.  On failure everything set up here is undone: any
 * internal mounts that succeeded are dropped, the filesystem type is
 * unregistered and the inode cache is destroyed, and a negative errno is
 * returned.  Returns -ENOTSUPP without doing anything when
 * hugepages_supported() is false.
 */
static int __init init_hugetlbfs_fs(void)
{
	struct hstate *h;
	int error;
	int i;

	if (!hugepages_supported()) {
		pr_info("disabling because there are no supported hugepage sizes\n");
		return -ENOTSUPP;
	}

	error = -ENOMEM;
	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
					sizeof(struct hugetlbfs_inode_info),
					0, 0, init_once);
	if (hugetlbfs_inode_cachep == NULL)
		goto out;

	error = register_filesystem(&hugetlbfs_fs_type);
	if (error)
		goto out_free_cache;

	i = 0;
	for_each_hstate(h) {
		char buf[50];
		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
							buf);

		if (IS_ERR(hugetlbfs_vfsmount[i])) {
			pr_err("Cannot mount internal hugetlbfs for page size %uK\n",
			       ps_kb);
			error = PTR_ERR(hugetlbfs_vfsmount[i]);
			hugetlbfs_vfsmount[i] = NULL;
		}
		i++;
	}
	/* Non default hstates are optional */
	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
		return 0;

	/* Never report success after tearing everything down. */
	if (!error)
		error = -ENOENT;

	/* Drop the optional mounts that did succeed. */
	i = 0;
	for_each_hstate(h) {
		if (hugetlbfs_vfsmount[i]) {
			kern_unmount(hugetlbfs_vfsmount[i]);
			hugetlbfs_vfsmount[i] = NULL;
		}
		i++;
	}

	/* Do not leave a registered filesystem backed by a destroyed cache. */
	unregister_filesystem(&hugetlbfs_fs_type);

	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
 out_free_cache:
	kmem_cache_destroy(hugetlbfs_inode_cachep);
 out:
	return error;
}

/*
 * Module exit: undo init_hugetlbfs_fs().
 *
 * The internal mounts are dropped and the filesystem type unregistered
 * before the inode cache is destroyed, so that no inode allocated from the
 * cache is still live when it goes away.
 */
static void __exit exit_hugetlbfs_fs(void)
{
	struct hstate *h;
	int i;

	i = 0;
	for_each_hstate(h)
		kern_unmount(hugetlbfs_vfsmount[i++]);
	unregister_filesystem(&hugetlbfs_fs_type);

	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(hugetlbfs_inode_cachep);
}

/* Hook the init/exit routines into the module machinery and declare the license. */
module_init(init_hugetlbfs_fs)
module_exit(exit_hugetlbfs_fs)

MODULE_LICENSE("GPL");