/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
static unsigned long nr_overcommit_huge_pages;
unsigned long max_huge_pages;
unsigned long sysctl_overcommit_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have its future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result.
 */
static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	if (!(vma->vm_flags & VM_SHARED))
		return (unsigned long)vma->vm_private_data;
	return 0;
}

static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
							unsigned long reserve)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	VM_BUG_ON(vma->vm_flags & VM_SHARED);

	vma->vm_private_data = (void *)reserve;
}

/* Decrement the reserved pages in the hugepage pool by one */
static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHARED) {
		/* Shared mappings always use reserves */
		resv_huge_pages--;
	} else {
		/*
		 * Only the process that called mmap() has reserves for
		 * private mappings.
		 */
		if (vma_resv_huge_pages(vma)) {
			unsigned long reserve;

			resv_huge_pages--;
			reserve = (unsigned long)vma->vm_private_data - 1;
			vma->vm_private_data = (void *)reserve;
		}
	}
}

void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	if (!(vma->vm_flags & VM_SHARED))
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static int vma_has_private_reserves(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHARED)
		return 0;
	if (!vma_resv_huge_pages(vma))
		return 0;
	return 1;
}

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

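/*
 * Pull a free huge page off the first non-empty per-node free list,
 * updating the global and per-node free counts.  Caller must hold
 * hugetlb_lock.
 */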
static struct page *dequeue_huge_page(void)
{
	int nid;
	struct page *page = NULL;

	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	return page;
}

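/*
 * As above, but walk the zonelist chosen by the VMA's mempolicy and
 * cpuset, and bail out early when a MAP_PRIVATE VMA without its own
 * reserves would have to dip into pages reserved for other mappings.
 * Caller must hold hugetlb_lock.
 */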
static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol, &nodemask);
	struct zone *zone;
	struct zoneref *z;

	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed.
	 */
	if (!vma_has_private_reserves(vma) &&
			free_huge_pages - resv_huge_pages == 0)
		return NULL;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						MAX_NR_ZONES - 1, nodemask) {
		nid = zone_to_nid(zone);
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			decrement_hugepage_resv_vma(vma);
			break;
		}
	}
	mpol_cond_put(mpol);
	return page;
}

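/*
 * Give a huge page back to the buddy allocator: drop it from the pool
 * counts, clear the stale page flags and compound destructor, and free
 * the underlying pages.  Caller must hold hugetlb_lock.
 */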
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	arch_release_hugepage(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

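/*
 * Compound-page destructor, run when the last reference to a huge page
 * is dropped.  Surplus pages are returned to the buddy allocator;
 * everything else goes back on the per-node free list.
 */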
static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

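/*
 * Allocate a brand new huge page for the pool from the buddy allocator
 * on the given node and release it into the hugepage free lists.
 */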
static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
						__GFP_REPEAT|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		if (arch_prepare_hugepage(page)) {
			__free_pages(page, HUGETLB_PAGE_ORDER);
			return NULL;
		}
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

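/*
 * Allocate one fresh huge page, trying each online node in turn so the
 * pool grows evenly across nodes.  Returns 1 on success, 0 on failure.
 */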
static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	if (ret)
		count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	return ret;
}

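/*
 * Allocate a surplus huge page directly from the buddy allocator for a
 * fault or reservation that the static pool cannot satisfy, subject to
 * the nr_overcommit_huge_pages limit.
 */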
static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
	unsigned int nid;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
					__GFP_REPEAT|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);

	spin_lock(&hugetlb_lock);
	if (page) {
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON(page_count(page));
		nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
	}
	spin_unlock(&hugetlb_lock);

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0) {
		resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	resv_huge_pages += delta;
	ret = 0;
free:
	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		list_del(&page->lru);
		enqueue_huge_page(page);
	}

	/* Free unnecessary surplus pages to the buddy allocator */
	if (!list_empty(&surplus_list)) {
		spin_unlock(&hugetlb_lock);
		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
			list_del(&page->lru);
			/*
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			free_huge_page(page);
		}
		spin_lock(&hugetlb_lock);
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes. Iterate across all nodes until we
	 * can no longer free unreserved surplus pages. This occurs when
	 * the nodes with surplus pages have no free pages.
	 */
	unsigned long remaining_iterations = num_online_nodes();

	/* Uncommit the reservation */
	resv_huge_pages -= unused_resv_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (remaining_iterations-- && nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
			remaining_iterations = num_online_nodes();
		}
	}
}

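/*
 * Allocate a huge page for the given fault address, charging the
 * hugetlbfs quota when the VMA carries no private reserve and falling
 * back to a surplus page when the pool is empty.
 */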
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned int chg = 0;

	/*
	 * Processes that did not create the mapping will have no reserves and
	 * will not have accounted against quota. Check that the quota can be
	 * made before satisfying the allocation
	 */
	if (!vma_has_private_reserves(vma)) {
		chg = 1;
		if (hugetlb_get_quota(inode->i_mapping, chg))
			return ERR_PTR(-ENOSPC);
	}

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);

	if (!page) {
		page = alloc_buddy_huge_page(vma, addr);
		if (!page) {
			hugetlb_put_quota(inode->i_mapping, chg);
			return ERR_PTR(-VM_FAULT_OOM);
		}
	}

	set_page_refcounted(page);
	set_page_private(page, (unsigned long) mapping);

	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls is changed, or the surplus pages go out of use.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

int hugetlb_overcommit_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	spin_lock(&hugetlb_lock);
	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
	spin_unlock(&hugetlb_lock);
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n"
		"Node %d HugePages_Surp:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid],
		nid, surplus_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. Application can still potentially OOM'ed by kernel
	 * with lack of free htlb page in cpuset that the task is in.
	 * Attempt to enforce strict accounting with cpuset is almost
	 * impossible (or too ugly) because cpuset is too fluid that
	 * task or memory node can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to check against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
			return_unused_surplus_pages(delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
	unsigned long reserve = vma_resv_huge_pages(vma);
	if (reserve)
		hugetlb_acct_memory(-reserve);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
	.close = hugetlb_vm_op_close,
};

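/*
 * Construct a huge-page PTE for the given page with protections taken
 * from the VMA, optionally marked writable and dirty.
 */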
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}


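/*
 * Copy the huge-page mappings from the parent to the child at fork(),
 * write-protecting both sides for private mappings so that later
 * writes trigger copy-on-write.
 */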
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;

		/* If the pagetables are shared don't copy or take references */
		if (dst_pte == src_pte)
			continue;

		spin_lock(&dst->page_table_lock);
		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
		if (!huge_pte_none(huge_ptep_get(src_pte))) {
			if (cow)
				huge_ptep_set_wrprotect(src, addr, src_pte);
			entry = huge_ptep_get(src_pte);
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

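/*
 * Tear down all huge-page mappings in the given range, gathering the
 * pages on a local list so their references can be dropped after the
 * TLB has been flushed.  Caller must hold the file's i_mmap_lock.
 */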
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (huge_pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

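/*
 * Handle a write fault on a huge page that is mapped read-only: either
 * make the existing page writable when it has no other users, or copy
 * it into a freshly allocated huge page.  Called with page_table_lock
 * held.
 */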
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (IS_ERR(new_page)) {
		page_cache_release(old_page);
		return -PTR_ERR(new_page);
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
		/* Break COW */
		huge_ptep_clear_flush(vma, address, ptep);
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

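/*
 * Fault in a huge page that has no PTE yet: look it up in (or add it
 * to) the page cache for shared mappings, or allocate and zero a fresh
 * page for private mappings, then install the new PTE.
 */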
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		page = alloc_huge_page(vma, address);
		if (IS_ERR(page)) {
			ret = -PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, address);
		__SetPageUptodate(page);

		if (vma->vm_flags & VM_SHARED) {
			int err;
			struct inode *inode = mapping->host;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}

			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
			spin_unlock(&inode->i_lock);
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!huge_pte_none(huge_ptep_get(ptep)))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

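/*
 * Top-level huge-page fault handler.  Faults are serialized by the
 * hugetlb_instantiation_mutex to avoid spurious allocation failures
 * when several CPUs fault on the same huge page at once.
 */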
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = huge_ptep_get(ptep);
	if (huge_pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, huge_ptep_get(ptep))))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

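/*
 * get_user_pages() helper for hugetlb VMAs: walk the huge-page PTEs
 * covering *position, faulting pages in as needed, and fill in the
 * pages[] and vmas[] arrays one PAGE_SIZE subpage at a time.
 */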
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
		    (write && !pte_write(huge_ptep_get(pte)))) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(huge_ptep_get(pte));
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!huge_pte_none(huge_ptep_get(ptep))) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

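/*
 * Reservations for shared mappings are tracked as a list of
 * [from, to) file regions hanging off the file's address_space,
 * maintained by the region_add/region_chg/region_truncate helpers
 * below.
 */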
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

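/*
 * Reserve huge pages (and quota) for a mapping of [from, to) at mmap()
 * time: shared mappings record the range in the file's region list,
 * private mappings store the count in the VMA.
 */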
int hugetlb_reserve_pages(struct inode *inode,
					long from, long to,
					struct vm_area_struct *vma)
{
	long ret, chg;

	/*
	 * Shared mappings base their reservation on the number of pages that
	 * are already allocated on behalf of the file. Private mappings need
	 * to reserve the full area even if read-only as mprotect() may be
	 * called to make the mapping read-write. Assume !vma is a shm mapping
	 */
	if (!vma || vma->vm_flags & VM_SHARED)
		chg = region_chg(&inode->i_mapping->private_list, from, to);
	else {
		chg = to - from;
		set_vma_resv_huge_pages(vma, chg);
	}

	if (chg < 0)
		return chg;

	if (hugetlb_get_quota(inode->i_mapping, chg))
		return -ENOSPC;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0) {
		hugetlb_put_quota(inode->i_mapping, chg);
		return ret;
	}
	if (!vma || vma->vm_flags & VM_SHARED)
		region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	spin_lock(&inode->i_lock);
	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
	spin_unlock(&inode->i_lock);

	hugetlb_put_quota(inode->i_mapping, (chg - freed));
	hugetlb_acct_memory(-(chg - freed));
}