inode.c 26.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3
/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
4
 * Nadia Yvette Chambers, 2002
L
Linus Torvalds 已提交
5 6 7 8
 *
 * Copyright (C) 2002 Linus Torvalds.
 */

9 10
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

L
Linus Torvalds 已提交
11 12 13 14 15 16 17
#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h>		/* remove ASAP */
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
18
#include <linux/kernel.h>
L
Linus Torvalds 已提交
19 20 21 22 23
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
24
#include <linux/capability.h>
25
#include <linux/ctype.h>
L
Linus Torvalds 已提交
26 27 28
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
29
#include <linux/parser.h>
30
#include <linux/mman.h>
L
Linus Torvalds 已提交
31 32 33 34
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
N
Nick Black 已提交
35
#include <linux/magic.h>
N
Naoya Horiguchi 已提交
36
#include <linux/migrate.h>
L
Linus Torvalds 已提交
37 38 39

#include <asm/uaccess.h>

40
static const struct super_operations hugetlbfs_ops;
41
static const struct address_space_operations hugetlbfs_aops;
42
const struct file_operations hugetlbfs_file_operations;
43 44
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
L
Linus Torvalds 已提交
45

D
David Gibson 已提交
46
struct hugetlbfs_config {
47 48
	kuid_t   uid;
	kgid_t   gid;
D
David Gibson 已提交
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
	umode_t mode;
	long	nr_blocks;
	long	nr_inodes;
	struct hstate *hstate;
};

/*
 * Per-inode private data.  The VFS inode is embedded so that HUGETLBFS_I()
 * can get back to this structure with container_of().
 */
struct hugetlbfs_inode_info {
	struct shared_policy policy;	/* set up with mpol_shared_policy_init() */
	struct inode vfs_inode;		/* the inode handed to the VFS */
};

/*
 * Map a VFS inode back to the hugetlbfs_inode_info that embeds it.
 * Only valid for inodes that live inside a struct hugetlbfs_inode_info.
 */
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}

L
Linus Torvalds 已提交
65
static struct backing_dev_info hugetlbfs_backing_dev_info = {
66
	.name		= "hugetlbfs",
L
Linus Torvalds 已提交
67
	.ra_pages	= 0,	/* No readahead */
68
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
L
Linus Torvalds 已提交
69 70 71 72
};

/* Numeric group id that can_do_hugetlb_shm() converts with make_kgid(). */
int sysctl_hugetlb_shm_group;

73 74 75
/* Identifiers for the mount options recognised by hugetlbfs_parse_options(). */
enum {
	Opt_size,
	Opt_nr_inodes,
	Opt_mode,
	Opt_uid,
	Opt_gid,
	Opt_pagesize,
	Opt_err,	/* paired with the NULL pattern that ends the table */
};

80
static const match_table_t tokens = {
81 82 83 84 85
	{Opt_size,	"size=%s"},
	{Opt_nr_inodes,	"nr_inodes=%s"},
	{Opt_mode,	"mode=%o"},
	{Opt_uid,	"uid=%u"},
	{Opt_gid,	"gid=%u"},
86
	{Opt_pagesize,	"pagesize=%s"},
87 88 89
	{Opt_err,	NULL},
};

90 91 92 93 94 95 96 97 98 99
/*
 * Drop one reference on every page held in @pvec, then reset the vector so
 * that it can be reused for the next lookup.
 */
static void huge_pagevec_release(struct pagevec *pvec)
{
	int idx = 0;

	while (idx < pagevec_count(pvec)) {
		put_page(pvec->pages[idx]);
		idx++;
	}

	pagevec_reinit(pvec);
}

L
Linus Torvalds 已提交
100 101
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
A
Al Viro 已提交
102
	struct inode *inode = file_inode(file);
L
Linus Torvalds 已提交
103 104
	loff_t len, vma_len;
	int ret;
105
	struct hstate *h = hstate_file(file);
L
Linus Torvalds 已提交
106

107
	/*
108 109 110 111 112 113
	 * vma address alignment (but not the pgoff alignment) has
	 * already been checked by prepare_hugepage_range.  If you add
	 * any error returns here, do so after setting VM_HUGETLB, so
	 * is_vm_hugetlb_page tests below unmap_region go the right
	 * way when do_mmap_pgoff unwinds (may be important on powerpc
	 * and ia64).
114
	 */
115
	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
116
	vma->vm_ops = &hugetlb_vm_ops;
L
Linus Torvalds 已提交
117

118
	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
119 120
		return -EINVAL;

L
Linus Torvalds 已提交
121 122
	vma_len = (loff_t)(vma->vm_end - vma->vm_start);

123
	mutex_lock(&inode->i_mutex);
L
Linus Torvalds 已提交
124 125 126 127 128
	file_accessed(file);

	ret = -ENOMEM;
	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

129
	if (hugetlb_reserve_pages(inode,
130
				vma->vm_pgoff >> huge_page_order(h),
131 132
				len >> huge_page_shift(h), vma,
				vma->vm_flags))
133
		goto out;
134

A
Adam Litke 已提交
135 136
	ret = 0;
	hugetlb_prefault_arch_hook(vma->vm_mm);
137
	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
L
Linus Torvalds 已提交
138 139
		inode->i_size = len;
out:
140
	mutex_unlock(&inode->i_mutex);
L
Linus Torvalds 已提交
141 142 143 144 145

	return ret;
}

/*
146
 * Called under down_write(mmap_sem).
L
Linus Torvalds 已提交
147 148
 */

149
#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
L
Linus Torvalds 已提交
150 151 152 153 154 155
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
156
	struct hstate *h = hstate_file(file);
157
	struct vm_unmapped_area_info info;
L
Linus Torvalds 已提交
158

159
	if (len & ~huge_page_mask(h))
L
Linus Torvalds 已提交
160 161 162 163
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

164
	if (flags & MAP_FIXED) {
165
		if (prepare_hugepage_range(file, addr, len))
166 167 168 169
			return -EINVAL;
		return addr;
	}

L
Linus Torvalds 已提交
170
	if (addr) {
171
		addr = ALIGN(addr, huge_page_size(h));
L
Linus Torvalds 已提交
172 173 174 175 176 177
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}

178 179 180 181 182 183 184
	info.flags = 0;
	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	return vm_unmapped_area(&info);
L
Linus Torvalds 已提交
185 186 187
}
#endif

B
Badari Pulavarty 已提交
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
/*
 * Copy at most min(@size, @count) bytes to the user buffer @buf, starting
 * @offset bytes into the huge page whose first sub-page is @page.  The copy
 * is done one PAGE_CACHE_SIZE chunk at a time because each sub-page is
 * mapped separately with kmap().
 *
 * Returns the number of bytes copied, or -EFAULT if nothing was copied.
 */
static int
hugetlbfs_read_actor(struct page *page, unsigned long offset,
			char __user *buf, unsigned long count,
			unsigned long size)
{
	unsigned long left, copied = 0;
	char *kaddr;
	int i, chunksize;

	if (count < size)
		size = count;

	/* Split the offset into a sub-page index and an offset within it */
	i = offset >> PAGE_CACHE_SHIFT;
	offset &= ~PAGE_CACHE_MASK;

	for (; size; i++) {
		/* only the first chunk can start part-way into a sub-page */
		chunksize = PAGE_CACHE_SIZE - offset;
		if (chunksize > size)
			chunksize = size;

		kaddr = kmap(&page[i]);
		left = __copy_to_user(buf, kaddr + offset, chunksize);
		kunmap(&page[i]);
		if (left) {
			/* partial copy: account for what got through and stop */
			copied += (chunksize - left);
			break;
		}

		offset = 0;
		size -= chunksize;
		buf += chunksize;
		copied += chunksize;
	}

	if (!copied)
		return -EFAULT;
	return copied;
}

/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data. It's *very* similar to do_generic_mapping_read(), we can't use that
 * since it has PAGE_CACHE_SIZE assumptions.
 *
 * Reads at most @len bytes starting at *@ppos into @buf, one huge page per
 * loop iteration.  Holes (no page in the page cache) are returned as zeroes.
 * *@ppos is advanced by the number of bytes transferred.
 *
 * Returns the number of bytes read, 0 at or beyond end of file, or a
 * negative errno if the very first transfer fails.
 */
static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
			      size_t len, loff_t *ppos)
{
	struct hstate *h = hstate_file(filp);
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long index = *ppos >> huge_page_shift(h);
	unsigned long offset = *ppos & ~huge_page_mask(h);
	unsigned long end_index;
	loff_t isize;
	ssize_t retval = 0;

	/* validate length */
	if (len == 0)
		goto out;

	for (;;) {
		struct page *page;
		unsigned long nr, ret;
		int ra;

		/* nr is the maximum number of bytes to copy from this page */
		nr = huge_page_size(h);
		isize = i_size_read(inode);
		if (!isize)
			goto out;
		end_index = (isize - 1) >> huge_page_shift(h);
		if (index >= end_index) {
			if (index > end_index)
				goto out;
			/* last page: only the bytes below i_size are valid */
			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
			if (nr <= offset)
				goto out;
		}
		nr = nr - offset;

		/* Find the page */
		page = find_lock_page(mapping, index);
		if (unlikely(page == NULL)) {
			/*
			 * We have a HOLE, zero out the user-buffer for the
			 * length of the hole or request.
			 */
			ret = len < nr ? len : nr;
			if (clear_user(buf, ret))
				ra = -EFAULT;
			else
				ra = 0;
		} else {
			unlock_page(page);

			/*
			 * We have the page, copy it to user space buffer.
			 */
			ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
			ret = ra;
			page_cache_release(page);
		}
		if (ra < 0) {
			/* report the error only if nothing was read so far */
			if (retval == 0)
				retval = ra;
			goto out;
		}

		offset += ret;
		retval += ret;
		len -= ret;
		/*
		 * Advance the destination as well, otherwise the next huge
		 * page would be copied over the data just returned.
		 */
		buf += ret;
		index += offset >> huge_page_shift(h);
		offset &= ~huge_page_mask(h);

		/* short read or no more work */
		if ((ret != nr) || (len == 0))
			break;
	}
out:
	*ppos = ((loff_t)index << huge_page_shift(h)) + offset;
	return retval;
}

N
Nick Piggin 已提交
310 311 312 313
/*
 * ->write_begin for hugetlbfs: always fails with -EINVAL, no argument is
 * examined.
 */
static int hugetlbfs_write_begin(struct file *file,
			struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return -EINVAL;
}

N
Nick Piggin 已提交
318 319 320
/*
 * ->write_end for hugetlbfs.  hugetlbfs_write_begin() always fails, so
 * reaching this function is treated as a bug.
 */
static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	BUG();
	return -EINVAL;
}

/*
 * Remove one huge page from the page cache: clear its dirty and uptodate
 * state, then delete it from the mapping.  truncate_hugepages() calls this
 * with the page locked.
 */
static void truncate_huge_page(struct page *page)
{
	/* No IO accounting for huge pages? */
	cancel_dirty_page(page, 0);
	ClearPageUptodate(page);
	delete_from_page_cache(page);
}

333
/*
 * Remove every page-cache huge page of @inode at or beyond byte offset
 * @lstart and hand the number of removed pages to hugetlb_unreserve_pages().
 *
 * The lookup restarts from the first index after each pass and only stops
 * when a lookup from the first index finds nothing.
 */
static void truncate_hugepages(struct inode *inode, loff_t lstart)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
	const pgoff_t first = lstart >> huge_page_shift(h);
	pgoff_t cursor = first;
	struct pagevec pvec;
	int released = 0;
	int n;

	pagevec_init(&pvec, 0);
	for (;;) {
		if (!pagevec_lookup(&pvec, mapping, cursor, PAGEVEC_SIZE)) {
			/* nothing found from the very start: we are done */
			if (cursor == first)
				break;
			cursor = first;
			continue;
		}

		for (n = 0; n < pagevec_count(&pvec); n++) {
			struct page *page = pvec.pages[n];

			lock_page(page);
			/* continue the next lookup just past this page */
			if (page->index > cursor)
				cursor = page->index;
			cursor++;
			truncate_huge_page(page);
			unlock_page(page);
			released++;
		}
		huge_pagevec_release(&pvec);
	}
	BUG_ON(!lstart && mapping->nrpages);
	hugetlb_unreserve_pages(inode, first, released);
}

A
Al Viro 已提交
369
static void hugetlbfs_evict_inode(struct inode *inode)
L
Linus Torvalds 已提交
370
{
371 372
	struct resv_map *resv_map;

373
	truncate_hugepages(inode, 0);
374 375 376 377
	resv_map = (struct resv_map *)inode->i_mapping->private_data;
	/* root inode doesn't have the resv_map, so we should check it */
	if (resv_map)
		resv_map_release(&resv_map->refs);
378
	clear_inode(inode);
379 380
}

L
Linus Torvalds 已提交
381
static inline void
382
hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
L
Linus Torvalds 已提交
383 384 385
{
	struct vm_area_struct *vma;

386
	vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
L
Linus Torvalds 已提交
387 388 389
		unsigned long v_offset;

		/*
H
Hugh Dickins 已提交
390
		 * Can the expression below overflow on 32-bit arches?
391
		 * No, because the interval tree returns us only those vmas
H
Hugh Dickins 已提交
392 393
		 * which overlap the truncated area starting at pgoff,
		 * and no vma on a 32-bit arch can span beyond the 4GB.
L
Linus Torvalds 已提交
394
		 */
H
Hugh Dickins 已提交
395 396 397
		if (vma->vm_pgoff < pgoff)
			v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
		else
L
Linus Torvalds 已提交
398 399
			v_offset = 0;

400 401
		unmap_hugepage_range(vma, vma->vm_start + v_offset,
				     vma->vm_end, NULL);
L
Linus Torvalds 已提交
402 403 404 405 406
	}
}

/*
 * Truncate @inode to @offset bytes; @offset must be huge page aligned.
 * Sets the new size, unmaps the truncated range from all mappings under
 * the i_mmap write lock, then drops the page-cache pages.  Always returns 0.
 */
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t pgoff;

	BUG_ON(offset & ~huge_page_mask(h));
	pgoff = offset >> PAGE_SHIFT;

	i_size_write(inode, offset);

	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
	i_mmap_unlock_write(mapping);

	truncate_hugepages(inode, offset);
	return 0;
}

static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
426
	struct hstate *h = hstate_inode(inode);
L
Linus Torvalds 已提交
427 428 429 430 431 432 433
	int error;
	unsigned int ia_valid = attr->ia_valid;

	BUG_ON(!inode);

	error = inode_change_ok(inode, attr);
	if (error)
C
Christoph Hellwig 已提交
434
		return error;
L
Linus Torvalds 已提交
435 436 437

	if (ia_valid & ATTR_SIZE) {
		error = -EINVAL;
C
Christoph Hellwig 已提交
438 439 440
		if (attr->ia_size & ~huge_page_mask(h))
			return -EINVAL;
		error = hugetlb_vmtruncate(inode, attr->ia_size);
L
Linus Torvalds 已提交
441
		if (error)
C
Christoph Hellwig 已提交
442
			return error;
L
Linus Torvalds 已提交
443
	}
C
Christoph Hellwig 已提交
444 445 446 447

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
L
Linus Torvalds 已提交
448 449
}

450 451
static struct inode *hugetlbfs_get_root(struct super_block *sb,
					struct hugetlbfs_config *config)
L
Linus Torvalds 已提交
452 453 454 455 456 457
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
458
		inode->i_ino = get_next_ino();
459 460 461 462 463 464 465 466 467 468
		inode->i_mode = S_IFDIR | config->mode;
		inode->i_uid = config->uid;
		inode->i_gid = config->gid;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		info = HUGETLBFS_I(inode);
		mpol_shared_policy_init(&info->policy, NULL);
		inode->i_op = &hugetlbfs_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		/* directory inodes start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
469
		lockdep_annotate_inode_mutex_key(inode);
470 471 472 473
	}
	return inode;
}

474 475 476 477 478 479
/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under
 * i_mmap_mutex.
 */
480
static struct lock_class_key hugetlbfs_i_mmap_mutex_key;
481

482 483
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
					struct inode *dir,
A
Al Viro 已提交
484
					umode_t mode, dev_t dev)
485 486
{
	struct inode *inode;
487 488 489 490 491
	struct resv_map *resv_map;

	resv_map = resv_map_alloc();
	if (!resv_map)
		return NULL;
492 493 494 495 496 497

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
498 499
		lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
				&hugetlbfs_i_mmap_mutex_key);
L
Linus Torvalds 已提交
500 501 502
		inode->i_mapping->a_ops = &hugetlbfs_aops;
		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
503
		inode->i_mapping->private_data = resv_map;
L
Linus Torvalds 已提交
504
		info = HUGETLBFS_I(inode);
505 506 507 508 509 510 511
		/*
		 * The policy is initialized here even if we are creating a
		 * private inode because initialization simply creates an
		 * an empty rb tree and calls spin_lock_init(), later when we
		 * call mpol_free_shared_policy() it will just return because
		 * the rb tree will still be empty.
		 */
512
		mpol_shared_policy_init(&info->policy, NULL);
L
Linus Torvalds 已提交
513 514 515 516 517 518 519 520 521 522 523 524 525
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &hugetlbfs_inode_operations;
			inode->i_fop = &hugetlbfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &hugetlbfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
526
			inc_nlink(inode);
L
Linus Torvalds 已提交
527 528 529 530 531
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			break;
		}
532
		lockdep_annotate_inode_mutex_key(inode);
533 534 535
	} else
		kref_put(&resv_map->refs, resv_map_release);

L
Linus Torvalds 已提交
536 537 538 539 540 541 542
	return inode;
}

/*
 * File creation. Allocate an inode, and we're done..
 *
 * Returns 0 on success, or -ENOSPC when no inode could be obtained.
 */
static int hugetlbfs_mknod(struct inode *dir,
			struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);

	if (!inode)
		return -ENOSPC;

	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */
	return 0;
}

558
/*
 * ->mkdir: create the directory through hugetlbfs_mknod() and, on success,
 * bump the parent's link count.
 */
static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int err = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);

	if (err)
		return err;

	inc_nlink(dir);
	return 0;
}

A
Al Viro 已提交
566
/* ->create: make a regular file via hugetlbfs_mknod(); @excl is not used. */
static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
	umode_t file_mode = mode | S_IFREG;

	return hugetlbfs_mknod(dir, dentry, file_mode, 0);
}

static int hugetlbfs_symlink(struct inode *dir,
			struct dentry *dentry, const char *symname)
{
	struct inode *inode;
	int error = -ENOSPC;

577
	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
L
Linus Torvalds 已提交
578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
	if (inode) {
		int l = strlen(symname)+1;
		error = page_symlink(inode, symname, l);
		if (!error) {
			d_instantiate(dentry, inode);
			dget(dentry);
		} else
			iput(inode);
	}
	dir->i_ctime = dir->i_mtime = CURRENT_TIME;

	return error;
}

/*
 * mark the head page dirty
 *
 * @page may be any page of the compound page; always returns 0.
 */
static int hugetlbfs_set_page_dirty(struct page *page)
{
	SetPageDirty(compound_head(page));
	return 0;
}

N
Naoya Horiguchi 已提交
603
static int hugetlbfs_migrate_page(struct address_space *mapping,
604
				struct page *newpage, struct page *page,
605
				enum migrate_mode mode)
N
Naoya Horiguchi 已提交
606 607 608 609
{
	int rc;

	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
610
	if (rc != MIGRATEPAGE_SUCCESS)
N
Naoya Horiguchi 已提交
611 612 613
		return rc;
	migrate_page_copy(newpage, page);

614
	return MIGRATEPAGE_SUCCESS;
N
Naoya Horiguchi 已提交
615 616
}

617
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
L
Linus Torvalds 已提交
618
{
619
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
620
	struct hstate *h = hstate_inode(dentry->d_inode);
L
Linus Torvalds 已提交
621 622

	buf->f_type = HUGETLBFS_MAGIC;
623
	buf->f_bsize = huge_page_size(h);
L
Linus Torvalds 已提交
624 625
	if (sbinfo) {
		spin_lock(&sbinfo->stat_lock);
626 627
		/* If no limits set, just report 0 for max/free/used
		 * blocks, like simple_statfs() */
628 629 630 631 632 633 634 635 636
		if (sbinfo->spool) {
			long free_pages;

			spin_lock(&sbinfo->spool->lock);
			buf->f_blocks = sbinfo->spool->max_hpages;
			free_pages = sbinfo->spool->max_hpages
				- sbinfo->spool->used_hpages;
			buf->f_bavail = buf->f_bfree = free_pages;
			spin_unlock(&sbinfo->spool->lock);
637 638 639
			buf->f_files = sbinfo->max_inodes;
			buf->f_ffree = sbinfo->free_inodes;
		}
L
Linus Torvalds 已提交
640 641 642 643 644 645 646 647 648 649 650 651
		spin_unlock(&sbinfo->stat_lock);
	}
	buf->f_namelen = NAME_MAX;
	return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

	if (sbi) {
		sb->s_fs_info = NULL;
652 653 654 655

		if (sbi->spool)
			hugepage_put_subpool(sbi->spool);

L
Linus Torvalds 已提交
656 657 658 659
		kfree(sbi);
	}
}

660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684
/*
 * Take one inode from the mount's free count.  A negative free_inodes
 * means no accounting is done.  Returns 1 when an inode may be allocated,
 * 0 when the count is exhausted.
 */
static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	int granted = 1;

	if (sbinfo->free_inodes < 0)
		return 1;

	spin_lock(&sbinfo->stat_lock);
	if (unlikely(!sbinfo->free_inodes))
		granted = 0;
	else
		sbinfo->free_inodes--;
	spin_unlock(&sbinfo->stat_lock);

	return granted;
}

/*
 * Give one inode back to the mount's free count; does nothing when
 * free_inodes is negative (no accounting).
 */
static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes < 0)
		return;

	spin_lock(&sbinfo->stat_lock);
	sbinfo->free_inodes++;
	spin_unlock(&sbinfo->stat_lock);
}


685
/* Slab cache of struct hugetlbfs_inode_info, created in init_hugetlbfs_fs(). */
static struct kmem_cache *hugetlbfs_inode_cachep;
L
Linus Torvalds 已提交
686 687 688

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
689
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
L
Linus Torvalds 已提交
690 691
	struct hugetlbfs_inode_info *p;

692 693
	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
		return NULL;
694
	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
695 696
	if (unlikely(!p)) {
		hugetlbfs_inc_free_inodes(sbinfo);
L
Linus Torvalds 已提交
697
		return NULL;
698
	}
L
Linus Torvalds 已提交
699 700 701
	return &p->vfs_inode;
}

N
Nick Piggin 已提交
702 703 704 705 706 707
static void hugetlbfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

L
Linus Torvalds 已提交
708 709
static void hugetlbfs_destroy_inode(struct inode *inode)
{
710
	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
L
Linus Torvalds 已提交
711
	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
N
Nick Piggin 已提交
712
	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
L
Linus Torvalds 已提交
713 714
}

715
static const struct address_space_operations hugetlbfs_aops = {
N
Nick Piggin 已提交
716 717
	.write_begin	= hugetlbfs_write_begin,
	.write_end	= hugetlbfs_write_end,
L
Linus Torvalds 已提交
718
	.set_page_dirty	= hugetlbfs_set_page_dirty,
N
Naoya Horiguchi 已提交
719
	.migratepage    = hugetlbfs_migrate_page,
L
Linus Torvalds 已提交
720 721
};

722

723
static void init_once(void *foo)
724 725 726
{
	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;

C
Christoph Lameter 已提交
727
	inode_init_once(&ei->vfs_inode);
728 729
}

730
const struct file_operations hugetlbfs_file_operations = {
B
Badari Pulavarty 已提交
731
	.read			= hugetlbfs_read,
L
Linus Torvalds 已提交
732
	.mmap			= hugetlbfs_file_mmap,
733
	.fsync			= noop_fsync,
L
Linus Torvalds 已提交
734
	.get_unmapped_area	= hugetlb_get_unmapped_area,
735
	.llseek		= default_llseek,
L
Linus Torvalds 已提交
736 737
};

738
static const struct inode_operations hugetlbfs_dir_inode_operations = {
L
Linus Torvalds 已提交
739 740 741 742 743 744 745 746 747 748 749 750
	.create		= hugetlbfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= hugetlbfs_symlink,
	.mkdir		= hugetlbfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= hugetlbfs_mknod,
	.rename		= simple_rename,
	.setattr	= hugetlbfs_setattr,
};

751
static const struct inode_operations hugetlbfs_inode_operations = {
L
Linus Torvalds 已提交
752 753 754
	.setattr	= hugetlbfs_setattr,
};

755
static const struct super_operations hugetlbfs_ops = {
L
Linus Torvalds 已提交
756 757
	.alloc_inode    = hugetlbfs_alloc_inode,
	.destroy_inode  = hugetlbfs_destroy_inode,
A
Al Viro 已提交
758
	.evict_inode	= hugetlbfs_evict_inode,
L
Linus Torvalds 已提交
759 760
	.statfs		= hugetlbfs_statfs,
	.put_super	= hugetlbfs_put_super,
M
Miklos Szeredi 已提交
761
	.show_options	= generic_show_options,
L
Linus Torvalds 已提交
762 763 764 765 766
};

static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
767 768 769
	char *p, *rest;
	substring_t args[MAX_OPT_ARGS];
	int option;
770 771
	unsigned long long size = 0;
	enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
L
Linus Torvalds 已提交
772 773 774 775

	if (!options)
		return 0;

776 777
	while ((p = strsep(&options, ",")) != NULL) {
		int token;
778 779
		if (!*p)
			continue;
780 781 782 783 784 785

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_uid:
			if (match_int(&args[0], &option))
 				goto bad_val;
786 787 788
			pconfig->uid = make_kuid(current_user_ns(), option);
			if (!uid_valid(pconfig->uid))
				goto bad_val;
789 790 791 792 793
			break;

		case Opt_gid:
			if (match_int(&args[0], &option))
 				goto bad_val;
794 795 796
			pconfig->gid = make_kgid(current_user_ns(), option);
			if (!gid_valid(pconfig->gid))
				goto bad_val;
797 798 799 800 801
			break;

		case Opt_mode:
			if (match_octal(&args[0], &option))
 				goto bad_val;
802
			pconfig->mode = option & 01777U;
803 804 805 806 807 808 809
			break;

		case Opt_size: {
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			size = memparse(args[0].from, &rest);
810 811 812
			setsize = SIZE_STD;
			if (*rest == '%')
				setsize = SIZE_PERCENT;
813 814
			break;
		}
L
Linus Torvalds 已提交
815

816 817 818 819 820 821 822
		case Opt_nr_inodes:
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			pconfig->nr_inodes = memparse(args[0].from, &rest);
			break;

823 824 825 826 827
		case Opt_pagesize: {
			unsigned long ps;
			ps = memparse(args[0].from, &rest);
			pconfig->hstate = size_to_hstate(ps);
			if (!pconfig->hstate) {
828
				pr_err("Unsupported page size %lu MB\n",
829 830 831 832 833 834
					ps >> 20);
				return -EINVAL;
			}
			break;
		}

835
		default:
836
			pr_err("Bad mount option: \"%s\"\n", p);
837
			return -EINVAL;
838 839
			break;
		}
L
Linus Torvalds 已提交
840
	}
841 842 843 844 845 846 847 848 849 850 851 852

	/* Do size after hstate is set up */
	if (setsize > NO_SIZE) {
		struct hstate *h = pconfig->hstate;
		if (setsize == SIZE_PERCENT) {
			size <<= huge_page_shift(h);
			size *= h->max_huge_pages;
			do_div(size, 100);
		}
		pconfig->nr_blocks = (size >> huge_page_shift(h));
	}

L
Linus Torvalds 已提交
853
	return 0;
854 855

bad_val:
856
	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
857
 	return -EINVAL;
L
Linus Torvalds 已提交
858 859 860 861 862 863 864 865 866
}

static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
	int ret;
	struct hugetlbfs_config config;
	struct hugetlbfs_sb_info *sbinfo;

M
Miklos Szeredi 已提交
867 868
	save_mount_options(sb, data);

L
Linus Torvalds 已提交
869 870
	config.nr_blocks = -1; /* No limit on size by default */
	config.nr_inodes = -1; /* No limit on number of inodes by default */
871 872
	config.uid = current_fsuid();
	config.gid = current_fsgid();
L
Linus Torvalds 已提交
873
	config.mode = 0755;
874
	config.hstate = &default_hstate;
L
Linus Torvalds 已提交
875 876 877 878 879 880 881 882
	ret = hugetlbfs_parse_options(data, &config);
	if (ret)
		return ret;

	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;
	sb->s_fs_info = sbinfo;
883
	sbinfo->hstate = config.hstate;
L
Linus Torvalds 已提交
884 885 886
	spin_lock_init(&sbinfo->stat_lock);
	sbinfo->max_inodes = config.nr_inodes;
	sbinfo->free_inodes = config.nr_inodes;
887 888 889 890 891 892
	sbinfo->spool = NULL;
	if (config.nr_blocks != -1) {
		sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
		if (!sbinfo->spool)
			goto out_free;
	}
L
Linus Torvalds 已提交
893
	sb->s_maxbytes = MAX_LFS_FILESIZE;
894 895
	sb->s_blocksize = huge_page_size(config.hstate);
	sb->s_blocksize_bits = huge_page_shift(config.hstate);
L
Linus Torvalds 已提交
896 897 898
	sb->s_magic = HUGETLBFS_MAGIC;
	sb->s_op = &hugetlbfs_ops;
	sb->s_time_gran = 1;
899 900
	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
	if (!sb->s_root)
L
Linus Torvalds 已提交
901 902 903
		goto out_free;
	return 0;
out_free:
904
	kfree(sbinfo->spool);
L
Linus Torvalds 已提交
905 906 907 908
	kfree(sbinfo);
	return -ENOMEM;
}

A
Al Viro 已提交
909 910
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
L
Linus Torvalds 已提交
911
{
A
Al Viro 已提交
912
	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
L
Linus Torvalds 已提交
913 914 915 916
}

static struct file_system_type hugetlbfs_fs_type = {
	.name		= "hugetlbfs",
A
Al Viro 已提交
917
	.mount		= hugetlbfs_mount,
L
Linus Torvalds 已提交
918 919
	.kill_sb	= kill_litter_super,
};
920
MODULE_ALIAS_FS("hugetlbfs");
L
Linus Torvalds 已提交
921

922
/*
 * Internal kernel mounts, one slot per hstate, filled in by
 * init_hugetlbfs_fs(); a slot is NULL when its mount failed.
 */
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
L
Linus Torvalds 已提交
923

924
static int can_do_hugetlb_shm(void)
L
Linus Torvalds 已提交
925
{
926 927 928
	kgid_t shm_group;
	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
L
Linus Torvalds 已提交
929 930
}

931 932
static int get_hstate_idx(int page_size_log)
{
933
	struct hstate *h = hstate_sizelog(page_size_log);
934 935 936 937 938 939

	if (!h)
		return -1;
	return h - hstates;
}

940
static const struct dentry_operations anon_ops = {
941
	.d_dname = simple_dname
942 943
};

944 945 946 947 948 949
/*
 * Create an unlinked hugetlbfs file of @size bytes on the internal mount
 * matching @page_size_log and reserve its huge pages.
 *
 * Note that size should be aligned to proper hugepage size in caller side,
 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
 *
 * For HUGETLB_SHMFS_INODE requests from tasks that fail
 * can_do_hugetlb_shm(), the size is charged with user_shm_lock() and the
 * charged user is returned through @user; otherwise *@user is NULL.
 *
 * Returns the new file or an ERR_PTR(): -ENODEV (unknown page size),
 * -ENOENT (no internal mount), -EPERM, -ENOSPC (no inode) or -ENOMEM.
 */
struct file *hugetlb_file_setup(const char *name, size_t size,
				vm_flags_t acctflag, struct user_struct **user,
				int creat_flags, int page_size_log)
{
	struct file *file = ERR_PTR(-ENOMEM);
	struct qstr quick_string;
	struct super_block *sb;
	struct inode *inode;
	struct path path;
	int hstate_idx;

	hstate_idx = get_hstate_idx(page_size_log);
	if (hstate_idx < 0)
		return ERR_PTR(-ENODEV);

	*user = NULL;
	if (!hugetlbfs_vfsmount[hstate_idx])
		return ERR_PTR(-ENOENT);

	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
		*user = current_user();
		if (!user_shm_lock(size, *user)) {
			*user = NULL;
			return ERR_PTR(-EPERM);
		}

		task_lock(current);
		pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
			current->comm, current->pid);
		task_unlock(current);
	}

	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
	quick_string.name = name;
	quick_string.len = strlen(quick_string.name);
	quick_string.hash = 0;
	path.dentry = d_alloc_pseudo(sb, &quick_string);
	if (!path.dentry)
		goto out_shm_unlock;

	d_set_d_op(path.dentry, &anon_ops);
	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);

	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
	if (!inode) {
		file = ERR_PTR(-ENOSPC);
		goto out_dentry;
	}

	/* file still holds ERR_PTR(-ENOMEM) here */
	if (hugetlb_reserve_pages(inode, 0,
			size >> huge_page_shift(hstate_inode(inode)), NULL,
			acctflag))
		goto out_inode;

	d_instantiate(path.dentry, inode);
	inode->i_size = size;
	clear_nlink(inode);

	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
			&hugetlbfs_file_operations);
	if (!IS_ERR(file))
		return file;

	/* inode is already attached */
	goto out_dentry;

out_inode:
	iput(inode);
out_dentry:
	path_put(&path);
out_shm_unlock:
	if (*user) {
		user_shm_unlock(size, *user);
		*user = NULL;
	}
	return file;
}

/*
 * Module initialisation: set up the backing device info and the inode slab
 * cache, register the filesystem, and create one internal mount per hstate.
 *
 * Only the mount for the default hstate is mandatory.  If it cannot be
 * created, every step taken so far is undone: mounts that did succeed are
 * unmounted and the filesystem is unregistered before the inode cache is
 * destroyed, so that nothing can still reference the cache.
 *
 * Returns 0 on success, -ENOTSUPP when huge pages are not supported, or the
 * error of the failing step.
 */
static int __init init_hugetlbfs_fs(void)
{
	struct hstate *h;
	int error;
	int i;

	if (!hugepages_supported()) {
		pr_info("disabling because there are no supported hugepage sizes\n");
		return -ENOTSUPP;
	}

	error = bdi_init(&hugetlbfs_backing_dev_info);
	if (error)
		return error;

	error = -ENOMEM;
	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
					sizeof(struct hugetlbfs_inode_info),
					0, 0, init_once);
	if (hugetlbfs_inode_cachep == NULL)
		goto out2;

	error = register_filesystem(&hugetlbfs_fs_type);
	if (error)
		goto out;

	i = 0;
	for_each_hstate(h) {
		char buf[50];
		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
							buf);

		if (IS_ERR(hugetlbfs_vfsmount[i])) {
			pr_err("Cannot mount internal hugetlbfs for "
				"page size %uK\n", ps_kb);
			error = PTR_ERR(hugetlbfs_vfsmount[i]);
			hugetlbfs_vfsmount[i] = NULL;
		}
		i++;
	}
	/* Non default hstates are optional */
	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
		return 0;

	/* The default mount failed: tear down whatever did get mounted */
	i = 0;
	for_each_hstate(h) {
		if (hugetlbfs_vfsmount[i]) {
			kern_unmount(hugetlbfs_vfsmount[i]);
			hugetlbfs_vfsmount[i] = NULL;
		}
		i++;
	}
	unregister_filesystem(&hugetlbfs_fs_type);
	/* inodes are freed via call_rcu(); wait before destroying the cache */
	rcu_barrier();

 out:
	kmem_cache_destroy(hugetlbfs_inode_cachep);
 out2:
	bdi_destroy(&hugetlbfs_backing_dev_info);
	return error;
}

/*
 * Module exit: undo init_hugetlbfs_fs().
 *
 * The internal mounts are dropped and the filesystem is unregistered first,
 * because the root inode of each mount is allocated from
 * hugetlbfs_inode_cachep and is only freed (via call_rcu()) once the mount
 * goes away.  The cache is destroyed only after those frees have completed.
 */
static void __exit exit_hugetlbfs_fs(void)
{
	struct hstate *h;
	int i;

	i = 0;
	for_each_hstate(h)
		kern_unmount(hugetlbfs_vfsmount[i++]);
	unregister_filesystem(&hugetlbfs_fs_type);

	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(hugetlbfs_inode_cachep);
	bdi_destroy(&hugetlbfs_backing_dev_info);
}

module_init(init_hugetlbfs_fs)
module_exit(exit_hugetlbfs_fs)

MODULE_LICENSE("GPL");