/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};

/*
 * Accounting is done with irqs disabled, so there is no need to bump the
 * preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	stat->count[idx] += val;
}

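/*
 * Sum one statistics counter over all possible CPUs.
 */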
static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	spinlock_t		lru_lock;
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	int	prev_priority;	/* for recording reclaim priority */
	/*
	 * statistics. This must be placed at the end of memcg.
	 */
	struct mem_cgroup_stat stat;
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_ACTIVE	(1UL << PCG_ACTIVE)
#define PCGF_LOCK	(1UL << PCG_LOCK)
#define PCGF_FILE	(1UL << PCG_FILE)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
	PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
	PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
	0, /* FORCE */
};

/*
 * Always modified under the lru lock, so preempt_disable() is not necessary.
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge)? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;

	VM_BUG_ON(!irqs_disabled());

	cpustat = &stat->cpustat[smp_processor_id()];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}
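
/*
 * Remove a page_cgroup from its per-zone LRU list and update the per-zone
 * and per-cpu statistics.  The caller must hold mz->lru_lock with irqs
 * disabled.
 */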
static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
			struct page_cgroup *pc)
{
	int lru = LRU_BASE;

	if (PageCgroupUnevictable(pc))
		lru = LRU_UNEVICTABLE;
	else {
		if (PageCgroupActive(pc))
			lru += LRU_ACTIVE;
		if (PageCgroupFile(pc))
			lru += LRU_FILE;
	}

	MEM_CGROUP_ZSTAT(mz, lru) -= 1;

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
	list_del(&pc->lru);
}

static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
				struct page_cgroup *pc, bool hot)
{
	int lru = LRU_BASE;

	if (PageCgroupUnevictable(pc))
		lru = LRU_UNEVICTABLE;
	else {
		if (PageCgroupActive(pc))
			lru += LRU_ACTIVE;
		if (PageCgroupFile(pc))
			lru += LRU_FILE;
	}

	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	if (hot)
		list_add(&pc->lru, &mz->lists[lru]);
	else
		list_add_tail(&pc->lru, &mz->lists[lru]);

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
}
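
/*
 * Move a page_cgroup to the target LRU list within its zone, keeping
 * MEM_CGROUP_ZSTAT and the PCG_* LRU flags consistent.  The caller must
 * hold mz->lru_lock.
 */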
static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
	int active    = PageCgroupActive(pc);
	int file      = PageCgroupFile(pc);
	int unevictable = PageCgroupUnevictable(pc);
	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
				(LRU_FILE * !!file + !!active);

	if (lru == from)
		return;

	MEM_CGROUP_ZSTAT(mz, from) -= 1;
	/*
	 * While this is done under mz->lru_lock, other flags that are not
	 * related to the LRU may be modified concurrently outside the lock,
	 * so we have to use the atomic set/clear flag operations.
	 */
	if (is_unevictable_lru(lru)) {
		ClearPageCgroupActive(pc);
		SetPageCgroupUnevictable(pc);
	} else {
		if (is_active_lru(lru))
			SetPageCgroupActive(pc);
		else
			ClearPageCgroupActive(pc);
		ClearPageCgroupUnevictable(pc);
	}

	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_move(&pc->lru, &mz->lists[lru]);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held
 */
void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * We cannot lock_page_cgroup while holding zone's lru_lock,
	 * because other holders of lock_page_cgroup can be interrupted
	 * with an attempt to rotate_reclaimable_page.  But we cannot
	 * safely get to page_cgroup without it, so just try_lock it:
	 * mem_cgroup_isolate_pages allows for pages left on the wrong list.
	 */
	pc = lookup_page_cgroup(page);
	if (!trylock_page_cgroup(pc))
		return;
	if (pc && PageCgroupUsed(pc)) {
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		__mem_cgroup_move_lists(pc, lru);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
	}
	unlock_page_cgroup(pc);
}

/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}

/*
 * prev_priority control...this will be used in memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	mem->prev_priority = priority;
}

/*
 * Calculate # of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and is decremented in each loop.
 * (see include/linux/mmzone.h)
 */

long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
					int priority, enum lru_list lru)
{
	long nr_pages;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);

	return (nr_pages >> priority);
}
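
/*
 * Scan up to nr_to_scan pages from this cgroup's per-zone LRU list and
 * move the isolated pages onto @dst.  Called from the generic reclaim
 * path; returns the number of pages actually taken.
 */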
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * !!file + !!active;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	spin_lock(&mz->lru_lock);
	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		page = pc->page;

		if (unlikely(!PageLRU(page)))
			continue;

		/*
		 * TODO: play better with lumpy reclaim, grabbing anything.
		 */
		if (PageUnevictable(page) ||
		    (PageActive(page) && !active) ||
		    (!PageActive(page) && active)) {
			__mem_cgroup_move_lists(pc, page_lru(page));
			continue;
		}

		scan++;
		list_move(&pc->lru, &pc_list);

		if (__isolate_lru_page(page, mode, file) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mz->lru_lock);

	*scanned = scan;
	return nr_taken;
}

/*
 * Unlike the exported interface, an "oom" parameter is added.  If oom == true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
	struct mem_cgroup *mem;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (likely(!*memcg)) {
		rcu_read_lock();
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem)) {
			rcu_read_unlock();
			return 0;
		}
		/*
		 * For every charge from the cgroup, increment reference count
		 */
		css_get(&mem->css);
		*memcg = mem;
		rcu_read_unlock();
	} else {
		mem = *memcg;
		css_get(&mem->css);
	}


	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;

		if (!nr_retries--) {
			if (oom)
				mem_cgroup_out_of_memory(mem, gfp_mask);
			goto nomem;
		}
	}
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
}

/**
 * mem_cgroup_try_charge - get charge of PAGE_SIZE.
 * @mm: an mm_struct which is charged against. (when *memcg is NULL)
 * @gfp_mask: gfp_mask for reclaim.
 * @memcg: a pointer to memory cgroup which is charged against.
 *
 * Charge against the memory cgroup pointed to by *memcg. If *memcg == NULL,
 * the memory cgroup is looked up from @mm and stored in *memcg.
 *
 * Returns 0 on success and -ENOMEM on failure.
 * This call can invoke OOM-Killer.
 */

int mem_cgroup_try_charge(struct mm_struct *mm,
			  gfp_t mask, struct mem_cgroup **memcg)
{
	return __mem_cgroup_try_charge(mm, mask, memcg, true);
}
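
/*
 * A rough, hedged sketch of how a caller is expected to pair the two-step
 * charge API above with the commit/cancel helpers defined later in this
 * file ("install_succeeded" is an illustrative placeholder, not a real
 * helper):
 *
 *	struct mem_cgroup *ptr = NULL;
 *
 *	if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &ptr))
 *		return -ENOMEM;
 *	...
 *	if (install_succeeded)
 *		mem_cgroup_commit_charge_swapin(page, ptr);
 *	else
 *		mem_cgroup_cancel_charge_swapin(ptr);
 */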

/*
 * Commit a charge obtained by mem_cgroup_try_charge() and make the
 * page_cgroup USED.  If it is already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);
		return;
	}
	pc->mem_cgroup = mem;
	/*
	 * If a page is accounted as a page cache, insert to inactive list.
	 * If anon, insert to active list.
	 */
	pc->flags = pcg_default_flags[ctype];

	mz = page_cgroup_zoneinfo(pc);

	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_add_list(mz, pc, true);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
	unlock_page_cgroup(pc);
}

/**
 * mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must ensure the following:
 * 1. irqs are disabled.
 * 2. the lru_lock of the old mem_cgroup (@from) is held.
 *
 * Returns 0 on success,
 * returns -EBUSY when the lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from old cgroup but doesn't do "charge" to
 * new cgroup. It should be done by a caller.
 */

static int mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to)
{
	struct mem_cgroup_per_zone *from_mz, *to_mz;
	int nid, zid;
	int ret = -EBUSY;

	VM_BUG_ON(!irqs_disabled());
	VM_BUG_ON(from == to);

	nid = page_cgroup_nid(pc);
	zid = page_cgroup_zid(pc);
	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);


	if (!trylock_page_cgroup(pc))
		return ret;

	if (!PageCgroupUsed(pc))
		goto out;

	if (pc->mem_cgroup != from)
		goto out;

	if (spin_trylock(&to_mz->lru_lock)) {
		__mem_cgroup_remove_list(from_mz, pc);
		css_put(&from->css);
		res_counter_uncharge(&from->res, PAGE_SIZE);
		pc->mem_cgroup = to;
		css_get(&to->css);
		__mem_cgroup_add_list(to_mz, pc, false);
		ret = 0;
		spin_unlock(&to_mz->lru_lock);
	}
out:
	unlock_page_cgroup(pc);
	return ret;
}

/*
 * move charges to its parent.
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;

	parent = mem_cgroup_from_cont(pcg);

	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret)
		return ret;

	mz = mem_cgroup_zoneinfo(child,
			page_cgroup_nid(pc), page_cgroup_zid(pc));

	spin_lock_irqsave(&mz->lru_lock, flags);
	ret = mem_cgroup_move_account(pc, child, parent);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	/* drop extra refcnt */
	css_put(&parent->css);
	/* uncharge if move fails */
	if (ret)
		res_counter_uncharge(&parent->res, PAGE_SIZE);

	return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has address_space.
	 * But page->mapping may have an out-of-use anon_vma pointer;
	 * detect it with the PageAnon() check. A newly mapped anonymous
	 * page's page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some filesystems (shmem) precharge this
	 * page before calling it and call add_to_page_cache() with GFP_NOWAIT.
	 *
	 * For the GFP_NOWAIT case, the page may be pre-charged before calling
	 * add_to_page_cache(). (See shmem.c) Check it here and avoid charging
	 * twice. (It works but has to pay a slightly larger cost.)
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;

		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
	else
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
}

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	struct page_cgroup *pc;

	if (mem_cgroup_subsys.disabled)
		return;
	if (!ptr)
		return;
	pc = lookup_page_cgroup(page);
	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_subsys.disabled)
		return;
	if (!mem)
		return;
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	css_put(&mem->css);
}


/*
 * uncharge if !page_mapped(page)
 */
static void
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * Check if our page_cgroup is valid
	 */
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
		return;

	lock_page_cgroup(pc);
	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
	     || !PageCgroupUsed(pc)) {
		/* This happens in a race between zap_pte_range() and do_swap_page() */
		unlock_page_cgroup(pc);
		return;
	}
	ClearPageCgroupUsed(pc);
	mem = pc->mem_cgroup;

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_remove_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
	unlock_page_cgroup(pc);

	res_counter_uncharge(&mem->res, PAGE_SIZE);
	css_put(&mem->css);

	return;
}

void mem_cgroup_uncharge_page(struct page *page)
{
	/* early check. */
	if (page_mapped(page))
		return;
	if (page->mapping && !PageAnon(page))
		return;
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

/*
 * Before starting migration, account PAGE_SIZE to the mem_cgroup that the old
 * page belongs to.
 */
int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	int ret = 0;

	if (mem_cgroup_subsys.disabled)
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
	}
	unlock_page_cgroup(pc);

	if (mem) {
		ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem);
		css_put(&mem->css);
	}
	*ptr = mem;
	return ret;
}

/* remove the redundant charge if migration failed */
void mem_cgroup_end_migration(struct mem_cgroup *mem,
		struct page *oldpage, struct page *newpage)
{
	struct page *target, *unused;
	struct page_cgroup *pc;
	enum charge_type ctype;

	if (!mem)
		return;

	/* at migration success, oldpage->mapping is NULL. */
	if (oldpage->mapping) {
		target = oldpage;
		unused = NULL;
	} else {
		target = newpage;
		unused = oldpage;
	}

	if (PageAnon(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

	/* unused page is not on radix-tree now. */
	if (unused && ctype != MEM_CGROUP_CHARGE_TYPE_MAPPED)
		__mem_cgroup_uncharge_common(unused, ctype);

	pc = lookup_page_cgroup(target);
	/*
	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of page_cgroup.
	 * So, double-counting is effectively avoided.
	 */
	__mem_cgroup_commit_charge(mem, pc, ctype);

	/*
	 * Both of oldpage and newpage are still under lock_page().
	 * So we don't have to care about races in the radix-tree.
	 * But we do have to be careful about whether this page is mapped.
	 *
	 * There is a case for !page_mapped(). At the start of
	 * migration, oldpage was mapped. But now, it's zapped.
	 * But we know *target* page is not freed/reused under us.
	 * mem_cgroup_uncharge_page() does all necessary checks.
	 */
	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
		mem_cgroup_uncharge_page(target);
}

/*
 * A call to try to shrink memory usage under the specified memory controller.
 * This is typically used for page reclaim on behalf of shmem, to reduce the
 * side effects of page allocation from shmem, which is used by some cgroups.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
	struct mem_cgroup *mem;
	int progress = 0;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_subsys.disabled)
		return 0;
	if (!mm)
		return 0;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {
		rcu_read_unlock();
		return 0;
	}
	css_get(&mem->css);
	rcu_read_unlock();

	do {
		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
		progress += res_counter_check_under_limit(&mem->res);
	} while (!progress && --retry);

	css_put(&mem->css);
	if (!retry)
		return -ENOMEM;
	return 0;
}
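
/*
 * Try to install a new limit.  If the new limit is below the current usage,
 * reclaim pages from the cgroup until the usage fits or the retry budget
 * (MEM_CGROUP_RECLAIM_RETRIES unsuccessful reclaim passes) is exhausted.
 */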
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				   unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	int progress;
	int ret = 0;

	while (res_counter_set_limit(&memcg->res, val)) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		if (!retry_count) {
			ret = -EBUSY;
			break;
		}
		progress = try_to_free_mem_cgroup_pages(memcg,
				GFP_HIGHUSER_MOVABLE);
		if (!progress)
			retry_count--;
	}
	return ret;
}


/*
 * This routine traverses the page_cgroups on the given list and drops them
 * all.  It does not reclaim the pages themselves, it only removes the
 * page_cgroups.
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
			    struct mem_cgroup_per_zone *mz,
			    enum lru_list lru)
{
	struct page_cgroup *pc, *busy;
	unsigned long flags;
	unsigned long loop;
	struct list_head *list;
	int ret = 0;

	list = &mz->lists[lru];

	loop = MEM_CGROUP_ZSTAT(mz, lru);
	/* give some margin against EBUSY etc...*/
	loop += 256;
	busy = NULL;
	while (loop--) {
		ret = 0;
		spin_lock_irqsave(&mz->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&mz->lru_lock, flags);
			break;
		}
		pc = list_entry(list->prev, struct page_cgroup, lru);
		if (busy == pc) {
			list_move(&pc->lru, list);
			busy = NULL;
			spin_unlock_irqrestore(&mz->lru_lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&mz->lru_lock, flags);

		ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
		if (ret == -ENOMEM)
			break;

		if (ret == -EBUSY || ret == -EINVAL) {
			/* found lock contention or "pc" is obsolete. */
			busy = pc;
			cond_resched();
		} else
			busy = NULL;
	}
	if (!ret && !list_empty(list))
		return -EBUSY;
	return ret;
}

/*
 * Make the mem_cgroup's charge 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem)
{
	int ret;
	int node, zid, shrink;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	css_get(&mem->css);

	shrink = 0;
move_account:
	while (mem->res.usage > 0) {
		ret = -EBUSY;
		if (atomic_read(&mem->css.cgroup->count) > 0)
			goto out;

		/* This makes sure all *used* pages are on an LRU. */
		lru_add_drain_all();
		ret = 0;
		for_each_node_state(node, N_POSSIBLE) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
				struct mem_cgroup_per_zone *mz;
				enum lru_list l;
				mz = mem_cgroup_zoneinfo(mem, node, zid);
				for_each_lru(l) {
					ret = mem_cgroup_force_empty_list(mem,
								  mz, l);
					if (ret)
						break;
				}
			}
			if (ret)
				break;
		}
		/* it seems the parent cgroup doesn't have enough memory */
		if (ret == -ENOMEM)
			goto try_to_free;
		cond_resched();
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;

try_to_free:
	/* returns EBUSY if we come here twice. */
	if (shrink)  {
		ret = -EBUSY;
		goto out;
	}
	/* try to free all pages in this cgroup */
	shrink = 1;
	while (nr_retries && mem->res.usage > 0) {
		int progress;
		progress = try_to_free_mem_cgroup_pages(mem,
						  GFP_HIGHUSER_MOVABLE);
		if (!progress)
			nr_retries--;

	}
	/* try move_account...there may be some *locked* pages. */
	if (mem->res.usage)
		goto move_account;
	ret = 0;
	goto out;
}

static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
				    cft->private);
}
/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	unsigned long long val;
	int ret;

	switch (cft->private) {
	case RES_LIMIT:
		/* This function does all the necessary parsing...reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (!ret)
			ret = mem_cgroup_resize_limit(memcg, val);
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
}

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_cont(cont);
	switch (event) {
	case RES_MAX_USAGE:
		res_counter_reset_max(&mem->res);
		break;
	case RES_FAILCNT:
		res_counter_reset_failcnt(&mem->res);
		break;
	}
	return 0;
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
};
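
/*
 * Emit memory.stat: the accumulated per-cpu counters, followed by the
 * global (all-node) LRU sizes of this cgroup, in bytes where applicable.
 */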
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
	}
	/* showing # of active pages */
	{
		unsigned long active_anon, inactive_anon;
		unsigned long active_file, inactive_file;
		unsigned long unevictable;

		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_ANON);
		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_ANON);
		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_FILE);
		active_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_FILE);
		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
							LRU_UNEVICTABLE);

		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);

	}
	return 0;
}

static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "max_usage_in_bytes",
		.private = RES_MAX_USAGE,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "stat",
		.read_map = mem_control_stat_show,
	},
};

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's a BUG to call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste a lot of memory for nodes which will
	 *       never be onlined. It's better to use a memory hotplug callback
	 *       function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		spin_lock_init(&mz->lru_lock);
		for_each_lru(l)
			INIT_LIST_HEAD(&mz->lists[l]);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}
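
/*
 * struct mem_cgroup ends in a zero-length per-cpu statistics array, so its
 * real size depends on nr_cpu_ids.  mem_cgroup_alloc() below falls back to
 * vmalloc() once that size no longer fits in a single page.
 */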
static int mem_cgroup_size(void)
{
	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
	return sizeof(struct mem_cgroup) + cpustat_size;
}

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	int size = mem_cgroup_size();

	if (size < PAGE_SIZE)
		mem = kmalloc(size, GFP_KERNEL);
	else
		mem = vmalloc(size);

	if (mem)
		memset(mem, 0, size);
	return mem;
}

static void mem_cgroup_free(struct mem_cgroup *mem)
{
	if (mem_cgroup_size() < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
}


static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;
	int node;

	mem = mem_cgroup_alloc();
	if (!mem)
		return ERR_PTR(-ENOMEM);

	res_counter_init(&mem->res);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;

	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	mem_cgroup_free(mem);
	return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int node;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	mem_cgroup_free(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
					ARRAY_SIZE(mem_cgroup_files));
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader
	 */
	if (!thread_group_leader(p))
		goto out;

out:
	mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};