/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};

/*
 * Accounting is done with irqs disabled, so there is no need to
 * increment the preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	stat->count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}
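/*
 * Illustrative usage sketch: a charge updates one per-cpu slot (with irqs
 * disabled, as mem_cgroup_charge_statistics() below does), and a later read
 * sums the slots over all possible CPUs:
 *
 *	cpustat = &mem->stat.cpustat[smp_processor_id()];
 *	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, 1);
 *	...
 *	rss = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 */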

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	spinlock_t		lru_lock;
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
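/*
 * Illustrative lookup sketch: the number of pages this cgroup has on a
 * given LRU of node @nid, zone @zid, can be read (approximately, without
 * the lru_lock) as
 *
 *	mz = mem_cgroup_zoneinfo(mem, nid, zid);
 *	nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON);
 *
 * mem_cgroup_zoneinfo() is defined further down in this file;
 * mem_cgroup_get_all_zonestat() sums this over every node and zone.
 */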

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	int	prev_priority;	/* for recording reclaim priority */
	/*
	 * statistics. This must be placed at the end of memcg.
	 */
	struct mem_cgroup_stat stat;
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	NR_CHARGE_TYPE,
};
/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_ACTIVE	(1UL << PCG_ACTIVE)
#define PCGF_LOCK	(1UL << PCG_LOCK)
#define PCGF_FILE	(1UL << PCG_FILE)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
	PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
	PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
	0, /* FORCE */
};
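/*
 * For example, a newly charged anonymous page (charge type
 * MEM_CGROUP_CHARGE_TYPE_MAPPED) starts out with
 * PCGF_ACTIVE | PCGF_USED | PCGF_LOCK set in pc->flags;
 * __mem_cgroup_commit_charge() below applies this table.
 */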

/*
 * Always modified under the lru lock, so there is no need to preempt_disable().
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge)? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;

	VM_BUG_ON(!irqs_disabled());

	cpustat = &stat->cpustat[smp_processor_id()];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}
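/*
 * Typical caller pattern, as used by __mem_cgroup_try_charge() and
 * mem_cgroup_shrink_usage() below: the lookup must happen under
 * rcu_read_lock() and the css must be pinned before the lock is dropped:
 *
 *	rcu_read_lock();
 *	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 *	if (mem)
 *		css_get(&mem->css);
 *	rcu_read_unlock();
 */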

static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
			struct page_cgroup *pc)
{
	int lru = LRU_BASE;

	if (PageCgroupUnevictable(pc))
		lru = LRU_UNEVICTABLE;
	else {
		if (PageCgroupActive(pc))
			lru += LRU_ACTIVE;
		if (PageCgroupFile(pc))
			lru += LRU_FILE;
	}

	MEM_CGROUP_ZSTAT(mz, lru) -= 1;

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
	list_del(&pc->lru);
}

static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
				struct page_cgroup *pc, bool hot)
{
	int lru = LRU_BASE;

	if (PageCgroupUnevictable(pc))
		lru = LRU_UNEVICTABLE;
	else {
		if (PageCgroupActive(pc))
			lru += LRU_ACTIVE;
		if (PageCgroupFile(pc))
			lru += LRU_FILE;
	}

	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	if (hot)
		list_add(&pc->lru, &mz->lists[lru]);
	else
		list_add_tail(&pc->lru, &mz->lists[lru]);

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
}

static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
	int active    = PageCgroupActive(pc);
	int file      = PageCgroupFile(pc);
	int unevictable = PageCgroupUnevictable(pc);
	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
				(LRU_FILE * !!file + !!active);

	if (lru == from)
		return;

	MEM_CGROUP_ZSTAT(mz, from) -= 1;
	/*
	 * Although this is done under mz->lru_lock, other flags, which
	 * are not related to the LRU, may be modified without the lock.
	 * We have to use atomic set/clear flags.
	 */
	if (is_unevictable_lru(lru)) {
		ClearPageCgroupActive(pc);
		SetPageCgroupUnevictable(pc);
	} else {
		if (is_active_lru(lru))
			SetPageCgroupActive(pc);
		else
			ClearPageCgroupActive(pc);
		ClearPageCgroupUnevictable(pc);
	}

	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_move(&pc->lru, &mz->lists[lru]);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held
 */
void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * We cannot lock_page_cgroup while holding zone's lru_lock,
	 * because other holders of lock_page_cgroup can be interrupted
	 * with an attempt to rotate_reclaimable_page.  But we cannot
	 * safely get to page_cgroup without it, so just try_lock it:
	 * mem_cgroup_isolate_pages allows for page left on wrong list.
	 */
	pc = lookup_page_cgroup(page);
	if (!trylock_page_cgroup(pc))
		return;
	if (pc && PageCgroupUsed(pc)) {
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		__mem_cgroup_move_lists(pc, lru);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
	}
	unlock_page_cgroup(pc);
}

/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}
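/*
 * Worked example: with a usage of 400 pages (total == 401 after the +1
 * above) and an RSS count of 100 pages, this returns
 * (100 * 100) / 401 == 24, i.e. roughly a quarter of the charged memory
 * is mapped.
 */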

/*
 * prev_priority control...this will be used in memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	mem->prev_priority = priority;
}

/*
 * Calculate # of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and decremented in each loop.
 * (see include/linux/mmzone.h)
 */

long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
					int priority, enum lru_list lru)
{
	long nr_pages;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);

	return (nr_pages >> priority);
}
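/*
 * Worked example: if this cgroup has 4096 pages on the given LRU in this
 * zone, a scan at priority 12 looks at 4096 >> 12 == 1 page, while a scan
 * at priority 0 looks at all 4096 of them.
 */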

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * !!file + !!active;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	spin_lock(&mz->lru_lock);
	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		page = pc->page;

		if (unlikely(!PageLRU(page)))
			continue;

		/*
		 * TODO: play better with lumpy reclaim, grabbing anything.
		 */
		if (PageUnevictable(page) ||
		    (PageActive(page) && !active) ||
		    (!PageActive(page) && active)) {
			__mem_cgroup_move_lists(pc, page_lru(page));
			continue;
		}

		scan++;
		list_move(&pc->lru, &pc_list);

		if (__isolate_lru_page(page, mode, file) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mz->lru_lock);

	*scanned = scan;
	return nr_taken;
}

/*
 * Unlike the exported interface, an "oom" parameter is added. If oom == true,
 * the OOM killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
	struct mem_cgroup *mem;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (likely(!*memcg)) {
		rcu_read_lock();
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem)) {
			rcu_read_unlock();
			return 0;
		}
		/*
		 * For every charge from the cgroup, increment reference count
		 */
		css_get(&mem->css);
		*memcg = mem;
		rcu_read_unlock();
	} else {
		mem = *memcg;
		css_get(&mem->css);
	}


	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;

		if (!nr_retries--) {
			if (oom)
				mem_cgroup_out_of_memory(mem, gfp_mask);
			goto nomem;
		}
	}
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
}

/**
 * mem_cgroup_try_charge - get charge of PAGE_SIZE.
 * @mm: an mm_struct which is charged against. (when *memcg is NULL)
 * @gfp_mask: gfp_mask for reclaim.
 * @memcg: a pointer to memory cgroup which is charged against.
 *
 * Charge against the memory cgroup pointed to by *memcg. If *memcg == NULL,
 * the memory cgroup is looked up from @mm and stored in *memcg.
 *
 * Returns 0 on success and -ENOMEM on failure.
 * This call can invoke OOM-Killer.
 */

int mem_cgroup_try_charge(struct mm_struct *mm,
			  gfp_t mask, struct mem_cgroup **memcg)
{
	return __mem_cgroup_try_charge(mm, mask, memcg, true);
}
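/*
 * Minimal usage sketch of the try/commit protocol, modeled on
 * mem_cgroup_prepare_migration()/mem_cgroup_end_migration() below
 * (the GFP mask and error handling are illustrative only):
 *
 *	struct mem_cgroup *mem = NULL;
 *
 *	if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &mem))
 *		return -ENOMEM;	// nothing is left charged on failure
 *	...
 *	__mem_cgroup_commit_charge(mem, lookup_page_cgroup(page), ctype);
 */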

/*
 * Commit a charge got by mem_cgroup_try_charge() and make the page_cgroup
 * USED. If it is already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);
		return;
	}
	pc->mem_cgroup = mem;
	/*
	 * If a page is accounted as a page cache, insert to inactive list.
	 * If anon, insert to active list.
	 */
	pc->flags = pcg_default_flags[ctype];

	mz = page_cgroup_zoneinfo(pc);

	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_add_list(mz, pc, true);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
	unlock_page_cgroup(pc);
}

/**
 * mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must ensure the following:
 * 1. irqs are disabled.
 * 2. the lru_lock of the old mem_cgroup (@from) is held.
 *
 * Returns 0 on success,
 * returns -EBUSY when the lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from old cgroup but doesn't do "charge" to
 * new cgroup. It should be done by a caller.
 */

static int mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to)
{
	struct mem_cgroup_per_zone *from_mz, *to_mz;
	int nid, zid;
	int ret = -EBUSY;

	VM_BUG_ON(!irqs_disabled());
	VM_BUG_ON(from == to);

	nid = page_cgroup_nid(pc);
	zid = page_cgroup_zid(pc);
	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);


	if (!trylock_page_cgroup(pc))
		return ret;

	if (!PageCgroupUsed(pc))
		goto out;

	if (pc->mem_cgroup != from)
		goto out;

	if (spin_trylock(&to_mz->lru_lock)) {
		__mem_cgroup_remove_list(from_mz, pc);
		css_put(&from->css);
		res_counter_uncharge(&from->res, PAGE_SIZE);
		pc->mem_cgroup = to;
		css_get(&to->css);
		__mem_cgroup_add_list(to_mz, pc, false);
		ret = 0;
		spin_unlock(&to_mz->lru_lock);
	}
out:
	unlock_page_cgroup(pc);
	return ret;
}

/*
 * move charges to its parent.
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;

	parent = mem_cgroup_from_cont(pcg);

	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret)
		return ret;

	mz = mem_cgroup_zoneinfo(child,
			page_cgroup_nid(pc), page_cgroup_zid(pc));

	spin_lock_irqsave(&mz->lru_lock, flags);
	ret = mem_cgroup_move_account(pc, child, parent);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	/* drop extra refcnt */
	css_put(&parent->css);
	/* uncharge if move fails */
	if (ret)
		res_counter_uncharge(&parent->res, PAGE_SIZE);

	return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has address_space.
	 * But page->mapping may have an out-of-use anon_vma pointer;
	 * detect it by the PageAnon() check. A newly-mapped-anon's
	 * page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some filesystems (shmem) precharge the
	 * page before calling it and call add_to_page_cache() with GFP_NOWAIT.
	 *
	 * In the GFP_NOWAIT case, the page may have been pre-charged before
	 * add_to_page_cache() was called (see shmem.c). Check it here and
	 * avoid charging twice. (It works but has to pay a slightly larger cost.)
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;


		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
	else
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
}

#ifdef CONFIG_SWAP
int mem_cgroup_cache_charge_swapin(struct page *page,
			struct mm_struct *mm, gfp_t mask, bool locked)
{
	int ret = 0;

	if (mem_cgroup_subsys.disabled)
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	if (!locked)
		lock_page(page);
	/*
	 * If not locked, the page can be dropped from SwapCache until
	 * we reach here.
	 */
	if (PageSwapCache(page)) {
		ret = mem_cgroup_charge_common(page, mm, mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
	}
	if (!locked)
		unlock_page(page);

	return ret;
}
#endif

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	struct page_cgroup *pc;

	if (mem_cgroup_subsys.disabled)
		return;
	if (!ptr)
		return;
	pc = lookup_page_cgroup(page);
	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_subsys.disabled)
		return;
	if (!mem)
		return;
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	css_put(&mem->css);
}
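/*
 * Illustrative swap-in sequence (the caller here is an assumption about the
 * fault path, not something defined in this file): charge first with
 * mem_cgroup_try_charge(), then commit once the page is mapped, or cancel
 * if the fault is aborted:
 *
 *	struct mem_cgroup *ptr = NULL;
 *
 *	if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &ptr))
 *		goto oom;
 *	...
 *	mem_cgroup_commit_charge_swapin(page, ptr);	// success path
 *	...
 *	mem_cgroup_cancel_charge_swapin(ptr);		// failure path
 */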


/*
 * uncharge if !page_mapped(page)
 */
static void
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	if (PageSwapCache(page))
		return;

	/*
	 * Check if our page_cgroup is valid
	 */
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
		return;

	lock_page_cgroup(pc);

	if (!PageCgroupUsed(pc))
		goto unlock_out;

	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		if (page_mapped(page))
			goto unlock_out;
		break;
	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
		if (!PageAnon(page)) {	/* Shared memory */
			if (page->mapping && !page_is_file_cache(page))
				goto unlock_out;
		} else if (page_mapped(page)) /* Anon */
				goto unlock_out;
		break;
	default:
		break;
	}

	ClearPageCgroupUsed(pc);
	mem = pc->mem_cgroup;

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_remove_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
	unlock_page_cgroup(pc);

	res_counter_uncharge(&mem->res, PAGE_SIZE);
	css_put(&mem->css);

	return;

unlock_out:
	unlock_page_cgroup(pc);
	return;
}

void mem_cgroup_uncharge_page(struct page *page)
{
	/* early check. */
	if (page_mapped(page))
		return;
	if (page->mapping && !PageAnon(page))
		return;
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

void mem_cgroup_uncharge_swapcache(struct page *page)
{
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
}

/*
 * Before starting migration, account PAGE_SIZE to the mem_cgroup that the
 * old page belongs to.
 */
int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	int ret = 0;

	if (mem_cgroup_subsys.disabled)
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
	}
	unlock_page_cgroup(pc);

	if (mem) {
		ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem);
		css_put(&mem->css);
	}
	*ptr = mem;
	return ret;
}

/* remove redundant charge if migration failed */
void mem_cgroup_end_migration(struct mem_cgroup *mem,
		struct page *oldpage, struct page *newpage)
{
	struct page *target, *unused;
	struct page_cgroup *pc;
	enum charge_type ctype;

	if (!mem)
		return;

	/* at migration success, oldpage->mapping is NULL. */
	if (oldpage->mapping) {
		target = oldpage;
		unused = NULL;
	} else {
		target = newpage;
		unused = oldpage;
	}

	if (PageAnon(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

	/* unused page is not on radix-tree now. */
	if (unused)
		__mem_cgroup_uncharge_common(unused, ctype);

	pc = lookup_page_cgroup(target);
	/*
	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of the page_cgroup.
	 * So, double-counting is effectively avoided.
	 */
	__mem_cgroup_commit_charge(mem, pc, ctype);

	/*
	 * Both of oldpage and newpage are still under lock_page().
	 * Then, we don't have to care about race in radix-tree.
	 * But we have to be careful that this page is unmapped or not.
	 *
	 * There is a case for !page_mapped(). At the start of
	 * migration, oldpage was mapped. But now, it's zapped.
	 * But we know *target* page is not freed/reused under us.
	 * mem_cgroup_uncharge_page() does all necessary checks.
997
	 */
	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
		mem_cgroup_uncharge_page(target);
}

/*
 * A call to try to shrink memory usage under the specified resource controller.
 * This is typically used for page reclaim on behalf of shmem, to reduce the
 * side effects of page allocation from shmem, which is used by some mem_cgroups.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
	struct mem_cgroup *mem;
	int progress = 0;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_subsys.disabled)
		return 0;
	if (!mm)
		return 0;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {
		rcu_read_unlock();
		return 0;
	}
	css_get(&mem->css);
	rcu_read_unlock();

	do {
		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
		progress += res_counter_check_under_limit(&mem->res);
	} while (!progress && --retry);

	css_put(&mem->css);
	if (!retry)
		return -ENOMEM;
	return 0;
}

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				   unsigned long long val)
{

	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	int progress;
	int ret = 0;

	while (res_counter_set_limit(&memcg->res, val)) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		if (!retry_count) {
			ret = -EBUSY;
			break;
		}
		progress = try_to_free_mem_cgroup_pages(memcg,
				GFP_HIGHUSER_MOVABLE);
		if (!progress)
			retry_count--;
	}
	return ret;
}
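/*
 * Illustrative path: writing "4M" to this cgroup's limit file ends up in
 * mem_cgroup_write() below, which parses it into 4194304 via
 * res_counter_memparse_write_strategy() and calls
 * mem_cgroup_resize_limit(memcg, 4194304). If the new limit is below the
 * current usage, up to MEM_CGROUP_RECLAIM_RETRIES reclaim passes are made
 * before giving up with -EBUSY.
 */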


/*
 * This routine traverses the page_cgroups in the given list and drops them all.
 * It does not reclaim the pages themselves; it only removes the page_cgroups.
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
			    struct mem_cgroup_per_zone *mz,
			    enum lru_list lru)
{
	struct page_cgroup *pc, *busy;
	unsigned long flags;
	unsigned long loop;
	struct list_head *list;
	int ret = 0;

	list = &mz->lists[lru];

	loop = MEM_CGROUP_ZSTAT(mz, lru);
	/* give some margin against EBUSY etc...*/
	loop += 256;
	busy = NULL;
	while (loop--) {
		ret = 0;
		spin_lock_irqsave(&mz->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&mz->lru_lock, flags);
			break;
		}
		pc = list_entry(list->prev, struct page_cgroup, lru);
		if (busy == pc) {
			list_move(&pc->lru, list);
			busy = NULL;
			spin_unlock_irqrestore(&mz->lru_lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&mz->lru_lock, flags);

		ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
		if (ret == -ENOMEM)
			break;

		if (ret == -EBUSY || ret == -EINVAL) {
			/* found lock contention or "pc" is obsolete. */
			busy = pc;
			cond_resched();
		} else
			busy = NULL;
	}
	if (!ret && !list_empty(list))
		return -EBUSY;
	return ret;
}

/*
 * Make the mem_cgroup's charge 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
{
	int ret;
	int node, zid, shrink;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct cgroup *cgrp = mem->css.cgroup;

	css_get(&mem->css);

	shrink = 0;
	/* should free all ? */
	if (free_all)
		goto try_to_free;
move_account:
	while (mem->res.usage > 0) {
		ret = -EBUSY;
		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			goto out;
		ret = -EINTR;
		if (signal_pending(current))
			goto out;
		/* This is for making all *used* pages to be on LRU. */
		lru_add_drain_all();
		ret = 0;
		for_each_node_state(node, N_POSSIBLE) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
				struct mem_cgroup_per_zone *mz;
				enum lru_list l;
				mz = mem_cgroup_zoneinfo(mem, node, zid);
				for_each_lru(l) {
					ret = mem_cgroup_force_empty_list(mem,
								  mz, l);
					if (ret)
						break;
				}
			}
			if (ret)
				break;
		}
		/* it seems parent cgroup doesn't have enough mem */
		if (ret == -ENOMEM)
			goto try_to_free;
		cond_resched();
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;

try_to_free:
	/* returns EBUSY if there is a task or if we come here twice. */
	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
		ret = -EBUSY;
		goto out;
	}
	/* we call try-to-free pages for make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
	shrink = 1;
	while (nr_retries && mem->res.usage > 0) {
		int progress;

		if (signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		progress = try_to_free_mem_cgroup_pages(mem,
						  GFP_HIGHUSER_MOVABLE);
		if (!progress) {
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(WRITE, HZ/10);
		}

	}
	/* try move_account...there may be some *locked* pages. */
	if (mem->res.usage)
		goto move_account;
	ret = 0;
	goto out;
}

int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}


static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
				    cft->private);
}
/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	unsigned long long val;
	int ret;

	switch (cft->private) {
	case RES_LIMIT:
		/* This function does all necessary parse...reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (!ret)
			ret = mem_cgroup_resize_limit(memcg, val);
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
}

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_cont(cont);
	switch (event) {
	case RES_MAX_USAGE:
		res_counter_reset_max(&mem->res);
		break;
	case RES_FAILCNT:
		res_counter_reset_failcnt(&mem->res);
		break;
	}
	return 0;
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
};

static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
	}
	/* showing # of active pages */
	{
		unsigned long active_anon, inactive_anon;
		unsigned long active_file, inactive_file;
		unsigned long unevictable;

		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_ANON);
		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_ANON);
		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_FILE);
		active_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_FILE);
		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
							LRU_UNEVICTABLE);

		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);

	}
	return 0;
}
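/*
 * Reading the "stat" file therefore yields one "<name> <value>" pair per
 * cb->fill() call above, e.g. (values are made up):
 *
 *	cache 8192000
 *	rss 4096000
 *	pgpgin 3000
 *	pgpgout 1000
 *	active_anon 2048000
 *	inactive_anon 0
 *	active_file 4096000
 *	inactive_file 2048000
 *	unevictable 0
 */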


static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "max_usage_in_bytes",
		.private = RES_MAX_USAGE,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "stat",
		.read_map = mem_control_stat_show,
	},
	{
		.name = "force_empty",
		.trigger = mem_cgroup_force_empty_write,
	},
};
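/*
 * With the subsystem named "memory" (see mem_cgroup_subsys at the bottom of
 * this file), the table above shows up in each cgroup directory as
 * memory.usage_in_bytes, memory.max_usage_in_bytes, memory.limit_in_bytes,
 * memory.failcnt, memory.stat and memory.force_empty.
 */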

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use memory hotplug callback
	 *       function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		spin_lock_init(&mz->lru_lock);
		for_each_lru(l)
			INIT_LIST_HEAD(&mz->lists[l]);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

static int mem_cgroup_size(void)
{
	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
	return sizeof(struct mem_cgroup) + cpustat_size;
}
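/*
 * Example: with nr_cpu_ids == 4 this allocates sizeof(struct mem_cgroup)
 * plus 4 * sizeof(struct mem_cgroup_stat_cpu). Because stat.cpustat[] is a
 * zero-length array at the end of struct mem_cgroup, the per-cpu counters
 * live directly behind the structure in the same allocation.
 */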

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	int size = mem_cgroup_size();

	if (size < PAGE_SIZE)
		mem = kmalloc(size, GFP_KERNEL);
	else
		mem = vmalloc(size);

	if (mem)
		memset(mem, 0, size);
	return mem;
}

static void mem_cgroup_free(struct mem_cgroup *mem)
{
	if (mem_cgroup_size() < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
}


static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;
	int node;

	mem = mem_cgroup_alloc();
	if (!mem)
		return ERR_PTR(-ENOMEM);

	res_counter_init(&mem->res);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;

	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	mem_cgroup_free(mem);
	return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int node;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	mem_cgroup_free(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
					ARRAY_SIZE(mem_cgroup_files));
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader
	 */
	if (!thread_group_leader(p))
		goto out;

out:
	mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};