memcontrol.c 42.7 KB
Newer Older
B
Balbir Singh 已提交
1 2 3 4 5
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
23
#include <linux/mm.h>
K
KAMEZAWA Hiroyuki 已提交
24
#include <linux/pagemap.h>
25
#include <linux/smp.h>
26
#include <linux/page-flags.h>
27
#include <linux/backing-dev.h>
28 29
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
30
#include <linux/mutex.h>
31
#include <linux/slab.h>
32 33 34
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
35
#include <linux/seq_file.h>
36
#include <linux/vmalloc.h>
37
#include <linux/mm_inline.h>
38
#include <linux/page_cgroup.h>
K
KAMEZAWA Hiroyuki 已提交
39
#include "internal.h"
B
Balbir Singh 已提交
40

41 42
#include <asm/uaccess.h>

43 44
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
B
Balbir Singh 已提交
45

46 47 48 49 50 51 52 53 54
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* for remember boot option*/
#else
#define do_swap_account		(0)
#endif


55 56 57 58 59 60 61 62 63
/*
 * Statistics for memory cgroup.
 *
 * Each value is an index into mem_cgroup_stat_cpu::count[].
 * MEM_CGROUP_STAT_NSTATS is the number of counters and must stay last.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

/*
 * One CPU's set of statistics counters, indexed by enum mem_cgroup_stat_index.
 * The ____cacheline_aligned_in_smp annotation is applied to the whole struct.
 */
struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
75
	struct mem_cgroup_stat_cpu cpustat[0];
76 77 78 79 80
};

/*
 * For accounting under irq disable, no need for increment preempt count.
 */
81
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
82 83
		enum mem_cgroup_stat_index idx, int val)
{
84
	stat->count[idx] += val;
85 86 87 88 89 90 91 92 93 94 95 96
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

97 98 99 100
/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
101 102 103
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
104 105
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];
106 107 108 109 110 111 112 113 114 115 116 117
};
/*
 * Macro for accessing counter: expands to the count[] slot @idx of the
 * mem_cgroup_per_zone pointed to by @mz. The expansion is an lvalue, so it
 * can be read as well as modified in place.
 */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

/* Per-node container: one mem_cgroup_per_zone for every possible zone index. */
struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/* Table of pointers to per-node LRU information, indexed by node id. */
struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

B
Balbir Singh 已提交
118 119 120 121 122 123 124
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
125 126 127
 * we hit the water mark. May be even add a low water mark, such that
 * no reclaim occurs from a cgroup at it's low water mark, this is
 * a feature that will be implemented much later in the future.
B
Balbir Singh 已提交
128 129 130 131 132 133 134
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
135 136 137 138
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
139 140 141 142
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
143
	struct mem_cgroup_lru_info info;
144

145
	int	prev_priority;	/* for recording reclaim priority */
146 147
	int		obsolete;
	atomic_t	refcnt;
148
	/*
149
	 * statistics. This must be placed at the end of memcg.
150 151
	 */
	struct mem_cgroup_stat stat;
B
Balbir Singh 已提交
152 153
};

154 155 156
/*
 * Kind of charge being made or dropped. The values index pcg_default_flags[]
 * and select the checks done at uncharge time; NR_CHARGE_TYPE is the number
 * of types and must stay last.
 */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	NR_CHARGE_TYPE,
};

163 164 165 166
/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
167 168
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
K
KAMEZAWA Hiroyuki 已提交
169 170 171
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
	PCGF_USED | PCGF_LOCK, /* Anon */
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
172
	0, /* FORCE */
173 174
};

175 176 177 178 179 180 181 182 183 184 185

/*
 * Encoding of the cft->private value of a control file: the counter type
 * (_MEM or _MEMSWAP) is kept in the bits above bit 15, the attribute in the
 * low 16 bits. MEMFILE_TYPE()/MEMFILE_ATTR() extract the two halves again.
 */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	((val) | ((x) << 16))
#define MEMFILE_TYPE(val)	(0xffff & ((val) >> 16))
#define MEMFILE_ATTR(val)	(0xffff & (val))

/* Declared ahead of use; called by the swap-accounting paths further down. */
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);

186 187 188
/*
 * Update the statistics of @mem for one page being charged or uncharged.
 * @mem:    cgroup whose counters are updated.
 * @pc:     page_cgroup of the page; its cache flag picks CACHE vs RSS.
 * @charge: true for a charge (+1, counts a page-in), false for an uncharge
 *          (-1, counts a page-out).
 *
 * The update is done on the current CPU's counter set, between get_cpu() and
 * put_cpu().
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge)? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;
	int cpu = get_cpu();

	cpustat = &stat->cpustat[cpu];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
	put_cpu();
}

210
static struct mem_cgroup_per_zone *
211 212 213 214 215
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

216
static struct mem_cgroup_per_zone *
217 218 219 220 221
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);
222

223 224 225 226
	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
227
					enum lru_list idx)
228 229 230 231 232 233 234 235 236 237 238
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
239 240
}

241
static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
B
Balbir Singh 已提交
242 243 244 245 246 247
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

248
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
249
{
250 251 252 253 254 255 256 257
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

258 259 260 261
	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

K
KAMEZAWA Hiroyuki 已提交
262 263 264 265 266 267 268 269 270 271 272 273 274
/*
 * The following LRU functions may be used without PCG_LOCK.
 * They are called by the global LRU routines, independently of memcg.
 * What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * pc->mem_cgroup changes on
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache, which is added to the LRU before it is charged.
 * If the PCG_USED bit is not set, the page_cgroup is not added to this
 * private LRU.
 * When moving account, the page is not on the LRU; it is isolated.
 */
275

K
KAMEZAWA Hiroyuki 已提交
276 277 278 279 280
void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;
281

K
KAMEZAWA Hiroyuki 已提交
282 283 284 285 286 287 288 289
	if (mem_cgroup_subsys.disabled)
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (list_empty(&pc->lru))
		return;
	mz = page_cgroup_zoneinfo(pc);
	mem = pc->mem_cgroup;
290
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
K
KAMEZAWA Hiroyuki 已提交
291 292
	list_del_init(&pc->lru);
	return;
293 294
}

K
KAMEZAWA Hiroyuki 已提交
295
/*
 * Remove @page from the per-cgroup LRU list that matches its current
 * page_lru() index.
 */
void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}
299

K
KAMEZAWA Hiroyuki 已提交
300 301 302 303
void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;
304

K
KAMEZAWA Hiroyuki 已提交
305 306
	if (mem_cgroup_subsys.disabled)
		return;
307

K
KAMEZAWA Hiroyuki 已提交
308 309 310 311 312 313 314
	pc = lookup_page_cgroup(page);
	smp_rmb();
	/* unused page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
315 316
}

K
KAMEZAWA Hiroyuki 已提交
317
void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
318
{
K
KAMEZAWA Hiroyuki 已提交
319 320
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;
321

K
KAMEZAWA Hiroyuki 已提交
322 323 324 325 326 327
	if (mem_cgroup_subsys.disabled)
		return;
	pc = lookup_page_cgroup(page);
	/* barrier to sync with "charge" */
	smp_rmb();
	if (!PageCgroupUsed(pc))
L
Lee Schermerhorn 已提交
328
		return;
329

K
KAMEZAWA Hiroyuki 已提交
330
	mz = page_cgroup_zoneinfo(pc);
331
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
K
KAMEZAWA Hiroyuki 已提交
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350
	list_add(&pc->lru, &mz->lists[lru]);
}
/*
 * To add swapcache into LRU. Be careful when calling this function:
 * zone->lru_lock shouldn't be held and irq must not be disabled.
 *
 * The page is taken off the LRU and put straight back; if isolation fails
 * (non-zero return) nothing else is done.
 */
static void mem_cgroup_lru_fixup(struct page *page)
{
	if (isolate_lru_page(page))
		return;

	putback_lru_page(page);
}

/*
 * Move @page from per-cgroup list @from to per-cgroup list @to by removing
 * and re-adding it. No-op when the memory controller is disabled.
 */
void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_subsys.disabled)
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

353 354 355 356 357
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
358
	ret = task->mm && mm_match_cgroup(task->mm, mem);
359 360 361 362
	task_unlock(task);
	return ret;
}

363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 *
 * Returns rss * 100 / (usage in pages + 1); the "+ 1" keeps the divisor
 * non-zero.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long usage_pages;
	long rss_pages;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	usage_pages = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss_pages = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);

	return (int)((100L * rss_pages) / usage_pages);
}
379

380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
/*
 * prev_priority control...this will be used in memory reclaim path.
 *
 * Return the reclaim priority currently recorded for @mem.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	int recorded = mem->prev_priority;

	return recorded;
}

/*
 * Lower the recorded reclaim priority of @mem to @priority; a @priority that
 * is not smaller than the recorded one leaves it unchanged.
 */
void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	if (mem->prev_priority <= priority)
		return;

	mem->prev_priority = priority;
}

/* Unconditionally overwrite the recorded reclaim priority of @mem. */
void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	int *slot = &mem->prev_priority;

	*slot = priority;
}

399 400 401 402 403 404 405 406
/*
 * Calculate # of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and decremented in each loop.
 * (see include/linux/mmzone.h)
 */

407 408
long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
					int priority, enum lru_list lru)
409
{
410
	long nr_pages;
411 412 413 414
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

415
	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
416

417
	return (nr_pages >> priority);
418 419
}

420 421 422 423 424
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
425
					int active, int file)
426 427 428 429 430 431
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
432
	struct page_cgroup *pc, *tmp;
433 434 435
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
436
	int lru = LRU_FILE * !!file + !!active;
437

438
	BUG_ON(!mem_cont);
439
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
440
	src = &mz->lists[lru];
441

442 443
	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
H
Hugh Dickins 已提交
444
		if (scan >= nr_to_scan)
445
			break;
K
KAMEZAWA Hiroyuki 已提交
446 447

		page = pc->page;
448 449
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
H
Hugh Dickins 已提交
450
		if (unlikely(!PageLRU(page)))
451 452
			continue;

H
Hugh Dickins 已提交
453
		scan++;
454
		if (__isolate_lru_page(page, mode, file) == 0) {
455 456 457 458 459 460 461 462 463
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	*scanned = scan;
	return nr_taken;
}

464 465 466
/*
 * Unlike exported interface, "oom" parameter is added. if oom==true,
 * oom-killer can be invoked.
467
 */
468
static int __mem_cgroup_try_charge(struct mm_struct *mm,
469 470
			gfp_t gfp_mask, struct mem_cgroup **memcg,
			bool oom)
471 472
{
	struct mem_cgroup *mem;
473
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
474
	/*
475 476
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
477 478 479
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
480
	if (likely(!*memcg)) {
481 482
		rcu_read_lock();
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
483 484 485 486
		if (unlikely(!mem)) {
			rcu_read_unlock();
			return 0;
		}
487 488 489 490
		/*
		 * For every charge from the cgroup, increment reference count
		 */
		css_get(&mem->css);
491
		*memcg = mem;
492 493
		rcu_read_unlock();
	} else {
494 495
		mem = *memcg;
		css_get(&mem->css);
496
	}
497

498 499 500
	while (1) {
		int ret;
		bool noswap = false;
501

502 503 504 505 506 507 508 509 510 511 512
		ret = res_counter_charge(&mem->res, PAGE_SIZE);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, PAGE_SIZE);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			noswap = true;
		}
513
		if (!(gfp_mask & __GFP_WAIT))
514
			goto nomem;
515

516
		if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap))
517 518 519
			continue;

		/*
520 521 522 523 524
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up
525
		 *
526
		 */
527 528 529 530 531
		if (!do_swap_account &&
			res_counter_check_under_limit(&mem->res))
			continue;
		if (do_swap_account &&
			res_counter_check_under_limit(&mem->memsw))
532
			continue;
533 534

		if (!nr_retries--) {
535 536
			if (oom)
				mem_cgroup_out_of_memory(mem, gfp_mask);
537
			goto nomem;
538
		}
539
	}
540 541 542 543 544
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
}
545

546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564
/**
 * mem_cgroup_try_charge - get charge of PAGE_SIZE.
 * @mm: an mm_struct which is charged against. (when *memcg is NULL)
 * @gfp_mask: gfp_mask for reclaim.
 * @memcg: a pointer to memory cgroup which is charged against.
 *
 * Charge the memory cgroup pointed to by *memcg. If *memcg == NULL, the
 * memory cgroup is derived from @mm and stored in *memcg.
 *
 * Returns 0 if success. -ENOMEM at failure.
 * This call can invoke OOM-Killer.
 */

int mem_cgroup_try_charge(struct mm_struct *mm,
			  gfp_t mask, struct mem_cgroup **memcg)
{
	int err;

	/* the exported variant always allows the OOM killer (oom == true) */
	err = __mem_cgroup_try_charge(mm, mask, memcg, true);
	return err;
}

565 566 567 568 569 570 571 572 573 574 575 576
/*
 * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
 * USED state. If already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;
577 578 579 580 581

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
582 583
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
584
		css_put(&mem->css);
585
		return;
586
	}
587
	pc->mem_cgroup = mem;
K
KAMEZAWA Hiroyuki 已提交
588
	smp_wmb();
589
	pc->flags = pcg_default_flags[ctype];
590

K
KAMEZAWA Hiroyuki 已提交
591
	mem_cgroup_charge_statistics(mem, pc, true);
592 593

	unlock_page_cgroup(pc);
594
}
595

596 597 598 599 600 601 602
/**
 * mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_page() is useful.)
 *
 * returns 0 at success,
 * returns -EBUSY when lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from old cgroup but doesn't do "charge" to
 * new cgroup. It should be done by a caller.
 */

static int mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to)
{
	int ret = -EBUSY;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));

	if (!trylock_page_cgroup(pc))
		return ret;

	if (!PageCgroupUsed(pc))
		goto out;

	/* the page was moved or recharged by somebody else meanwhile */
	if (pc->mem_cgroup != from)
		goto out;

	/* drop @from's reference, charge and statistics for this page */
	css_put(&from->css);
	res_counter_uncharge(&from->res, PAGE_SIZE);
	mem_cgroup_charge_statistics(from, pc, false);
	if (do_swap_account)
		res_counter_uncharge(&from->memsw, PAGE_SIZE);
	/* rebind to @to; its res counters were charged by the caller */
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
	css_get(&to->css);
	ret = 0;
out:
	unlock_page_cgroup(pc);
	return ret;
}

/*
 * move charges to its parent.
 *
 * @pc:       page_cgroup whose charge is moved.
 * @child:    cgroup currently charged for the page.
 * @gfp_mask: allocation mask used when charging the parent.
 *
 * Returns 0 on success, -EINVAL when @child has no parent (root), -EBUSY
 * when the page cannot be pinned or moved, or another error from charging or
 * isolating the page. On every failure after the parent was charged, both
 * the charge and the css reference taken by the charge are released again.
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;

	parent = mem_cgroup_from_cont(pcg);

	/* on success this leaves a charge and a css reference on the parent */
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret)
		return ret;

	if (!get_page_unless_zero(page)) {
		ret = -EBUSY;
		goto uncharge;
	}

	ret = isolate_lru_page(page);
	if (ret)
		goto cancel;

	ret = mem_cgroup_move_account(pc, child, parent);

	putback_lru_page(page);
	if (!ret) {
		put_page(page);
		/* drop extra refcnt by try_charge() (move_account increment one) */
		css_put(&parent->css);
		return 0;
	}

cancel:
	put_page(page);
uncharge:
	/* drop the refcnt taken by try_charge() */
	css_put(&parent->css);
	/* uncharge if move fails */
	res_counter_uncharge(&parent->res, PAGE_SIZE);
	if (do_swap_account)
		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
	return ret;
}

702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 *
 * @memcg, when non-NULL, is the cgroup to charge; otherwise the cgroup is
 * derived from @mm. 0 is also returned when @page has no page_cgroup.
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

731 732
/*
 * Charge @page as a MAPPED page against the cgroup of @mm (init_mm when @mm
 * is NULL). Returns 0 without charging when the controller is disabled, the
 * page is compound, already mapped, or a page-cache page; otherwise returns
 * the result of mem_cgroup_charge_common().
 */
int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has address_space.
	 * But page->mapping may have out-of-use anon_vma pointer,
	 * detect it by PageAnon() check. newly-mapped-anon's page->mapping
	 * is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

753 754
/*
 * Charge a page-cache page against the cgroup of @mm (init_mm when @mm is
 * NULL). File-cache pages are charged as CACHE, all others as SHMEM.
 * Returns 0 without charging when the controller is disabled, the page is
 * compound, or (for non-waiting allocations) the page is already charged.
 */
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is called from add_to_page_cache()
	 * in usual. But some FS (shmem) precharges this page before calling it
	 * and call add_to_page_cache() with GFP_NOWAIT.
	 *
	 * For GFP_NOWAIT case, the page may be pre-charged before calling
	 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
	 * charge twice. (It works but has to pay a bit larger cost.)
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;


		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
	else
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
}

795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828
/*
 * Try to charge for a page that is being swapped in.
 * @mm:   mm to charge when no usable swap record exists (init_mm if NULL).
 * @page: the swap-cache page.
 * @mask: gfp mask for reclaim.
 * @ptr:  in/out cgroup pointer handed to __mem_cgroup_try_charge().
 *
 * With swap accounting enabled, the cgroup recorded for the page's swap
 * entry is charged unless there is none or it is obsolete; in every other
 * case the cgroup of @mm is charged. Returns 0 without charging when the
 * controller is disabled or, with swap accounting, when @page is no longer
 * in the swap cache.
 */
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
				 struct page *page,
				 gfp_t mask, struct mem_cgroup **ptr)
{
	if (mem_cgroup_subsys.disabled)
		return 0;

	if (do_swap_account) {
		struct mem_cgroup *recorded;
		swp_entry_t     entry;

		/*
		 * A racing thread's fault, or swapoff, may have already
		 * updated the pte, and even removed page from swap cache:
		 * return success to go on to do_swap_page()'s pte_same()
		 * test, which should fail.
		 */
		if (!PageSwapCache(page))
			return 0;

		entry.val = page_private(page);
		recorded = lookup_swap_cgroup(entry);
		if (recorded && !recorded->obsolete) {
			*ptr = recorded;
			return __mem_cgroup_try_charge(NULL, mask, ptr, true);
		}
	}

	/* no usable record: charge the current mm */
	if (unlikely(!mm))
		mm = &init_mm;
	return __mem_cgroup_try_charge(mm, mask, ptr, true);
}

K
KAMEZAWA Hiroyuki 已提交
829
#ifdef CONFIG_SWAP
830

K
KAMEZAWA Hiroyuki 已提交
831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846
/*
 * Charge a swap-cache page as SHMEM.
 * @page:   the page to charge.
 * @mm:     mm to charge when no swap record is used (init_mm if NULL).
 * @mask:   gfp mask for reclaim.
 * @locked: true when the caller already holds the page lock; otherwise the
 *          page is locked and unlocked here.
 *
 * With swap accounting, a non-obsolete cgroup recorded for the swap entry is
 * charged instead of @mm, and after a successful charge the swap record is
 * cleared and its memsw charge dropped. Returns 0 when the controller is
 * disabled or the page is not in the swap cache, otherwise the result of
 * mem_cgroup_charge_common().
 */
int mem_cgroup_cache_charge_swapin(struct page *page,
			struct mm_struct *mm, gfp_t mask, bool locked)
{
	int ret = 0;

	if (mem_cgroup_subsys.disabled)
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	if (!locked)
		lock_page(page);
	/*
	 * If not locked, the page can be dropped from SwapCache until
	 * we reach here.
	 */
	if (PageSwapCache(page)) {
		struct mem_cgroup *mem = NULL;
		swp_entry_t ent;

		ent.val = page_private(page);
		if (do_swap_account) {
			mem = lookup_swap_cgroup(ent);
			if (mem && mem->obsolete)
				mem = NULL;
			if (mem)
				mm = NULL;
		}
		ret = mem_cgroup_charge_common(page, mm, mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);

		if (!ret && do_swap_account) {
			/* avoid double counting */
			mem = swap_cgroup_record(ent, NULL);
			if (mem) {
				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
				mem_cgroup_put(mem);
			}
		}
	}
	if (!locked)
		unlock_page(page);
	/* add this page(page_cgroup) to the LRU we want. */
	mem_cgroup_lru_fixup(page);

	return ret;
}
#endif

879 880 881 882 883 884 885 886 887 888
void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	struct page_cgroup *pc;

	if (mem_cgroup_subsys.disabled)
		return;
	if (!ptr)
		return;
	pc = lookup_page_cgroup(page);
	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905
	/*
	 * Now swap is on-memory. This means this page may be
	 * counted both as mem and swap....double count.
	 * Fix it by uncharging from memsw. This SwapCache is stable
	 * because we're still under lock_page().
	 */
	if (do_swap_account) {
		swp_entry_t ent = {.val = page_private(page)};
		struct mem_cgroup *memcg;
		memcg = swap_cgroup_record(ent, NULL);
		if (memcg) {
			/* If memcg is obsolete, memcg can be != ptr */
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_put(memcg);
		}

	}
K
KAMEZAWA Hiroyuki 已提交
906 907
	/* add this page(page_cgroup) to the LRU we want. */
	mem_cgroup_lru_fixup(page);
908 909 910 911 912 913 914 915 916
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_subsys.disabled)
		return;
	if (!mem)
		return;
	res_counter_uncharge(&mem->res, PAGE_SIZE);
917 918
	if (do_swap_account)
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
919 920 921 922
	css_put(&mem->css);
}


923
/*
924
 * uncharge if !page_mapped(page)
925
 */
926
static struct mem_cgroup *
927
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
928
{
H
Hugh Dickins 已提交
929
	struct page_cgroup *pc;
930
	struct mem_cgroup *mem = NULL;
931
	struct mem_cgroup_per_zone *mz;
932

933
	if (mem_cgroup_subsys.disabled)
934
		return NULL;
935

K
KAMEZAWA Hiroyuki 已提交
936
	if (PageSwapCache(page))
937
		return NULL;
K
KAMEZAWA Hiroyuki 已提交
938

939
	/*
940
	 * Check if our page_cgroup is valid
941
	 */
942 943
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
944
		return NULL;
945

946
	lock_page_cgroup(pc);
K
KAMEZAWA Hiroyuki 已提交
947

948 949
	mem = pc->mem_cgroup;

K
KAMEZAWA Hiroyuki 已提交
950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966
	if (!PageCgroupUsed(pc))
		goto unlock_out;

	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		if (page_mapped(page))
			goto unlock_out;
		break;
	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
		if (!PageAnon(page)) {	/* Shared memory */
			if (page->mapping && !page_is_file_cache(page))
				goto unlock_out;
		} else if (page_mapped(page)) /* Anon */
				goto unlock_out;
		break;
	default:
		break;
967
	}
K
KAMEZAWA Hiroyuki 已提交
968

969 970 971 972
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);

K
KAMEZAWA Hiroyuki 已提交
973
	mem_cgroup_charge_statistics(mem, pc, false);
974
	ClearPageCgroupUsed(pc);
975

976
	mz = page_cgroup_zoneinfo(pc);
977
	unlock_page_cgroup(pc);
H
Hugh Dickins 已提交
978

979
	css_put(&mem->css);
980

981
	return mem;
K
KAMEZAWA Hiroyuki 已提交
982 983 984

unlock_out:
	unlock_page_cgroup(pc);
985
	return NULL;
986 987
}

988 989
void mem_cgroup_uncharge_page(struct page *page)
{
990 991 992 993 994
	/* early check. */
	if (page_mapped(page))
		return;
	if (page->mapping && !PageAnon(page))
		return;
995 996 997 998 999 1000
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
1001
	VM_BUG_ON(page->mapping);
1002 1003 1004
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027
/*
 * Called from __delete_from_swap_cache() to drop the account of "page".
 * When swap accounting is on and a cgroup was uncharged, that cgroup is
 * recorded in the swap_cgroup of "ent" and a reference is taken on it with
 * mem_cgroup_get().
 */
void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
{
	struct mem_cgroup *owner =
		__mem_cgroup_uncharge_common(page,
					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);

	if (!do_swap_account || !owner)
		return;

	/* record memcg information */
	swap_cgroup_record(ent, owner);
	mem_cgroup_get(owner);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * called from swap_entry_free(). remove record in swap_cgroup and
 * uncharge "memsw" account.
 *
 * No-op when swap accounting is off or no cgroup is recorded for @ent.
 * The reference held by the record is released with mem_cgroup_put().
 */
void mem_cgroup_uncharge_swap(swp_entry_t ent)
{
	struct mem_cgroup *memcg;

	if (!do_swap_account)
		return;

	memcg = swap_cgroup_record(ent, NULL);
	if (memcg) {
		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_put(memcg);
	}
}
1040
#endif
K
KAMEZAWA Hiroyuki 已提交
1041

1042
/*
1043 1044
 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
 * page belongs to.
1045
 */
1046
int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1047 1048
{
	struct page_cgroup *pc;
1049 1050
	struct mem_cgroup *mem = NULL;
	int ret = 0;
1051

1052 1053 1054
	if (mem_cgroup_subsys.disabled)
		return 0;

1055 1056 1057
	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
1058 1059 1060
		mem = pc->mem_cgroup;
		css_get(&mem->css);
	}
1061
	unlock_page_cgroup(pc);
1062

1063
	if (mem) {
1064
		ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem);
1065 1066
		css_put(&mem->css);
	}
1067
	*ptr = mem;
1068
	return ret;
1069
}
1070

1071
/* remove redundant charge if migration failed*/
1072 1073
void mem_cgroup_end_migration(struct mem_cgroup *mem,
		struct page *oldpage, struct page *newpage)
1074
{
1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098
	struct page *target, *unused;
	struct page_cgroup *pc;
	enum charge_type ctype;

	if (!mem)
		return;

	/* at migration success, oldpage->mapping is NULL. */
	if (oldpage->mapping) {
		target = oldpage;
		unused = NULL;
	} else {
		target = newpage;
		unused = oldpage;
	}

	if (PageAnon(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

	/* unused page is not on radix-tree now. */
K
KAMEZAWA Hiroyuki 已提交
1099
	if (unused)
1100 1101 1102
		__mem_cgroup_uncharge_common(unused, ctype);

	pc = lookup_page_cgroup(target);
1103
	/*
1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117
	 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup.
	 * So, double-counting is effectively avoided.
	 */
	__mem_cgroup_commit_charge(mem, pc, ctype);

	/*
	 * Both of oldpage and newpage are still under lock_page().
	 * Then, we don't have to care about race in radix-tree.
	 * But we have to be careful that this page is unmapped or not.
	 *
	 * There is a case for !page_mapped(). At the start of
	 * migration, oldpage was mapped. But now, it's zapped.
	 * But we know *target* page is not freed/reused under us.
	 * mem_cgroup_uncharge_page() does all necessary checks.
1118
	 */
1119 1120
	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
		mem_cgroup_uncharge_page(target);
1121
}
1122

1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133
/*
 * A call to try to shrink memory usage under specified resource controller.
 * This is typically used for page reclaiming for shmem for reducing side
 * effect of page allocation from shmem, which is used by some mem_cgroup.
 *
 * Returns 0 when the controller is disabled, @mm is NULL, no mem_cgroup is
 * found for mm->owner, or some progress was made; returns -ENOMEM when all
 * MEM_CGROUP_RECLAIM_RETRIES attempts made no progress.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
	struct mem_cgroup *mem;
	int progress = 0;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_subsys.disabled)
		return 0;
	if (!mm)
		return 0;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {
		rcu_read_unlock();
		return 0;
	}
	/* take a css reference before leaving the RCU read section */
	css_get(&mem->css);
	rcu_read_unlock();

	do {
		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
		/* being under the limit also counts as progress */
		progress += res_counter_check_under_limit(&mem->res);
	} while (!progress && --retry);

	css_put(&mem->css);
	if (!retry)
		return -ENOMEM;
	return 0;
}

static DEFINE_MUTEX(set_limit_mutex);

1161
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1162
				unsigned long long val)
1163 1164 1165 1166
{

	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	int progress;
1167
	u64 memswlimit;
1168 1169
	int ret = 0;

1170
	while (retry_count) {
1171 1172 1173 1174
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
1175 1176 1177 1178 1179 1180 1181 1182 1183 1184
		/*
		 * Rather than hide all in some function, I do this in
		 * open coded manner. You see what this really does.
		 * We have to guarantee mem->res.limit < mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		if (memswlimit < val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
1185 1186
			break;
		}
1187 1188 1189 1190 1191 1192
		ret = res_counter_set_limit(&memcg->res, val);
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

1193
		progress = try_to_free_mem_cgroup_pages(memcg,
1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236
				GFP_HIGHUSER_MOVABLE, false);
  		if (!progress)			retry_count--;
	}
	return ret;
}

/*
 * Set the mem+swap limit (memcg->memsw) of @memcg to @val.
 *
 * Returns 0 on success, -EINVAL when swap accounting is off or @val is
 * smaller than the current memory limit, -EINTR when a signal is pending,
 * or the last error returned by res_counter_set_limit() once reclaim has
 * failed to lower memsw usage MEM_CGROUP_RECLAIM_RETRIES times.
 */
int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
				unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	u64 memlimit, oldusage, curusage;
	int ret = 0;

	if (!do_swap_account)
		return -EINVAL;

	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hide all in some function, I do this in
		 * open coded manner. You see what this really does.
		 * We have to guarantee mem->res.limit <= mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
		if (memlimit > val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		ret = res_counter_set_limit(&memcg->memsw, val);
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		/* a retry is consumed only when reclaim did not lower usage */
		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true);
		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		if (curusage >= oldusage)
			retry_count--;
	}
	return ret;
}

/*
 * This routine traverse page_cgroup in given list and drop them all.
 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
 */
1246
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
K
KAMEZAWA Hiroyuki 已提交
1247
				int node, int zid, enum lru_list lru)
1248
{
K
KAMEZAWA Hiroyuki 已提交
1249 1250
	struct zone *zone;
	struct mem_cgroup_per_zone *mz;
1251
	struct page_cgroup *pc, *busy;
K
KAMEZAWA Hiroyuki 已提交
1252
	unsigned long flags, loop;
1253
	struct list_head *list;
1254
	int ret = 0;
1255

K
KAMEZAWA Hiroyuki 已提交
1256 1257
	zone = &NODE_DATA(node)->node_zones[zid];
	mz = mem_cgroup_zoneinfo(mem, node, zid);
1258
	list = &mz->lists[lru];
1259

1260 1261 1262 1263 1264 1265
	loop = MEM_CGROUP_ZSTAT(mz, lru);
	/* give some margin against EBUSY etc...*/
	loop += 256;
	busy = NULL;
	while (loop--) {
		ret = 0;
K
KAMEZAWA Hiroyuki 已提交
1266
		spin_lock_irqsave(&zone->lru_lock, flags);
1267
		if (list_empty(list)) {
K
KAMEZAWA Hiroyuki 已提交
1268
			spin_unlock_irqrestore(&zone->lru_lock, flags);
1269
			break;
1270 1271 1272 1273 1274
		}
		pc = list_entry(list->prev, struct page_cgroup, lru);
		if (busy == pc) {
			list_move(&pc->lru, list);
			busy = 0;
K
KAMEZAWA Hiroyuki 已提交
1275
			spin_unlock_irqrestore(&zone->lru_lock, flags);
1276 1277
			continue;
		}
K
KAMEZAWA Hiroyuki 已提交
1278
		spin_unlock_irqrestore(&zone->lru_lock, flags);
1279 1280 1281

		ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
		if (ret == -ENOMEM)
1282
			break;
1283 1284 1285 1286 1287 1288 1289

		if (ret == -EBUSY || ret == -EINVAL) {
			/* found lock contention or "pc" is obsolete. */
			busy = pc;
			cond_resched();
		} else
			busy = NULL;
1290
	}
K
KAMEZAWA Hiroyuki 已提交
1291

1292 1293 1294
	if (!ret && !list_empty(list))
		return -EBUSY;
	return ret;
1295 1296 1297 1298 1299 1300
}

/*
 * make mem_cgroup's charge to be 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
1301
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
1302
{
1303 1304 1305
	int ret;
	int node, zid, shrink;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1306
	struct cgroup *cgrp = mem->css.cgroup;
1307

1308
	css_get(&mem->css);
1309 1310

	shrink = 0;
1311 1312 1313
	/* should free all ? */
	if (free_all)
		goto try_to_free;
1314
move_account:
1315
	while (mem->res.usage > 0) {
1316
		ret = -EBUSY;
1317 1318 1319 1320
		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			goto out;
		ret = -EINTR;
		if (signal_pending(current))
1321
			goto out;
1322 1323
		/* This is for making all *used* pages to be on LRU. */
		lru_add_drain_all();
1324 1325 1326
		ret = 0;
		for_each_node_state(node, N_POSSIBLE) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1327
				enum lru_list l;
1328 1329
				for_each_lru(l) {
					ret = mem_cgroup_force_empty_list(mem,
K
KAMEZAWA Hiroyuki 已提交
1330
							node, zid, l);
1331 1332 1333
					if (ret)
						break;
				}
1334
			}
1335 1336 1337 1338 1339 1340
			if (ret)
				break;
		}
		/* it seems parent cgroup doesn't have enough mem */
		if (ret == -ENOMEM)
			goto try_to_free;
1341
		cond_resched();
1342 1343 1344 1345 1346
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;
1347 1348

try_to_free:
1349 1350
	/* returns EBUSY if there is a task or if we come here twice. */
	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1351 1352 1353
		ret = -EBUSY;
		goto out;
	}
1354 1355
	/* we call try-to-free pages for make this cgroup empty */
	lru_add_drain_all();
1356 1357 1358 1359
	/* try to free all pages in this cgroup */
	shrink = 1;
	while (nr_retries && mem->res.usage > 0) {
		int progress;
1360 1361 1362 1363 1364

		if (signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
1365
		progress = try_to_free_mem_cgroup_pages(mem,
1366
						  GFP_HIGHUSER_MOVABLE, false);
1367
		if (!progress) {
1368
			nr_retries--;
1369 1370 1371
			/* maybe some writeback is necessary */
			congestion_wait(WRITE, HZ/10);
		}
1372 1373

	}
K
KAMEZAWA Hiroyuki 已提交
1374
	lru_add_drain();
1375 1376 1377 1378 1379
	/* try move_account...there may be some *locked* pages. */
	if (mem->res.usage)
		goto move_account;
	ret = 0;
	goto out;
1380 1381
}

1382 1383 1384 1385 1386 1387
/*
 * Trigger handler of the "force_empty" file: runs mem_cgroup_force_empty()
 * on the mem_cgroup of @cont with free_all set. @event is not used.
 */
int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}

static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
B
Balbir Singh 已提交
1389
{
1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	u64 val = 0;
	int type, name;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (type) {
	case _MEM:
		val = res_counter_read_u64(&mem->res, name);
		break;
	case _MEMSWAP:
		if (do_swap_account)
			val = res_counter_read_u64(&mem->memsw, name);
		break;
	default:
		BUG();
		break;
	}
	return val;
B
Balbir Singh 已提交
1409
}
1410 1411 1412 1413
/*
 * The user of this function is...
 * RES_LIMIT.
 */
1414 1415
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
B
Balbir Singh 已提交
1416
{
1417
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
1418
	int type, name;
1419 1420 1421
	unsigned long long val;
	int ret;

1422 1423 1424
	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (name) {
1425 1426 1427
	case RES_LIMIT:
		/* This function does all necessary parse...reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
1428 1429 1430
		if (ret)
			break;
		if (type == _MEM)
1431
			ret = mem_cgroup_resize_limit(memcg, val);
1432 1433
		else
			ret = mem_cgroup_resize_memsw_limit(memcg, val);
1434 1435 1436 1437 1438 1439
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
B
Balbir Singh 已提交
1440 1441
}

1442
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1443 1444
{
	struct mem_cgroup *mem;
1445
	int type, name;
1446 1447

	mem = mem_cgroup_from_cont(cont);
1448 1449 1450
	type = MEMFILE_TYPE(event);
	name = MEMFILE_ATTR(event);
	switch (name) {
1451
	case RES_MAX_USAGE:
1452 1453 1454 1455
		if (type == _MEM)
			res_counter_reset_max(&mem->res);
		else
			res_counter_reset_max(&mem->memsw);
1456 1457
		break;
	case RES_FAILCNT:
1458 1459 1460 1461
		if (type == _MEM)
			res_counter_reset_failcnt(&mem->res);
		else
			res_counter_reset_failcnt(&mem->memsw);
1462 1463
		break;
	}
1464
	return 0;
1465 1466
}

1467 1468 1469 1470 1471 1472
static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
1473 1474
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
1475 1476
};

1477 1478
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
1479 1480 1481 1482 1483 1484 1485 1486 1487 1488
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
1489
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
1490
	}
1491 1492
	/* showing # of active pages */
	{
1493 1494
		unsigned long active_anon, inactive_anon;
		unsigned long active_file, inactive_file;
L
Lee Schermerhorn 已提交
1495
		unsigned long unevictable;
1496 1497 1498 1499 1500 1501 1502 1503 1504

		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_ANON);
		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_ANON);
		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_FILE);
		active_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_FILE);
L
Lee Schermerhorn 已提交
1505 1506 1507
		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
							LRU_UNEVICTABLE);

1508 1509 1510 1511
		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
L
Lee Schermerhorn 已提交
1512 1513
		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);

1514
	}
1515 1516 1517
	return 0;
}

1518

B
Balbir Singh 已提交
1519 1520
static struct cftype mem_cgroup_files[] = {
	{
1521
		.name = "usage_in_bytes",
1522
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
1523
		.read_u64 = mem_cgroup_read,
B
Balbir Singh 已提交
1524
	},
1525 1526
	{
		.name = "max_usage_in_bytes",
1527
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
1528
		.trigger = mem_cgroup_reset,
1529 1530
		.read_u64 = mem_cgroup_read,
	},
B
Balbir Singh 已提交
1531
	{
1532
		.name = "limit_in_bytes",
1533
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
1534
		.write_string = mem_cgroup_write,
1535
		.read_u64 = mem_cgroup_read,
B
Balbir Singh 已提交
1536 1537 1538
	},
	{
		.name = "failcnt",
1539
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
1540
		.trigger = mem_cgroup_reset,
1541
		.read_u64 = mem_cgroup_read,
B
Balbir Singh 已提交
1542
	},
1543 1544
	{
		.name = "stat",
1545
		.read_map = mem_control_stat_show,
1546
	},
1547 1548 1549 1550
	{
		.name = "force_empty",
		.trigger = mem_cgroup_force_empty_write,
	},
B
Balbir Singh 已提交
1551 1552
};

1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Control files for the mem+swap counter; same layout as mem_cgroup_files. */
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
};

/*
 * Add the memsw.* files to @cont when swap accounting is on.
 * Returns 0 when it is off, otherwise the result of cgroup_add_files().
 */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	if (!do_swap_account)
		return 0;
	return cgroup_add_files(cont, ss, memsw_cgroup_files,
				ARRAY_SIZE(memsw_cgroup_files));
}
#else
/* Swap accounting is compiled out: there is nothing to register. */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
1597
	struct mem_cgroup_per_zone *mz;
1598
	enum lru_list l;
1599
	int zone, tmp = node;
1600 1601 1602 1603 1604 1605 1606 1607
	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use memory hotplug callback
	 *       function.
	 */
1608 1609 1610
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
1611 1612
	if (!pn)
		return 1;
1613

1614 1615
	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));
1616 1617 1618

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
1619 1620
		for_each_lru(l)
			INIT_LIST_HEAD(&mz->lists[l]);
1621
	}
1622 1623 1624
	return 0;
}

1625 1626 1627 1628 1629
static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

1630 1631 1632 1633 1634 1635
/*
 * Size in bytes of one mem_cgroup allocation: the structure itself plus
 * one struct mem_cgroup_stat_cpu for each of the nr_cpu_ids CPU ids.
 */
static int mem_cgroup_size(void)
{
	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
	return sizeof(struct mem_cgroup) + cpustat_size;
}

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
1639
	int size = mem_cgroup_size();
1640

1641 1642
	if (size < PAGE_SIZE)
		mem = kmalloc(size, GFP_KERNEL);
1643
	else
1644
		mem = vmalloc(size);
1645 1646

	if (mem)
1647
		memset(mem, 0, size);
1648 1649 1650
	return mem;
}

1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664
/*
 * At destroying mem_cgroup, references from swap_cgroup can remain.
 * (scanning all at force_empty is too costly...)
 *
 * Instead of clearing all references at force_empty, we remember
 * the number of reference from swap_cgroup and free mem_cgroup when
 * it goes down to 0.
 *
 * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and
 * entry which points to this memcg will be ignore at swapin.
 *
 * Removal of cgroup itself succeeds regardless of refs from swap.
 */

1665 1666
static void mem_cgroup_free(struct mem_cgroup *mem)
{
K
KAMEZAWA Hiroyuki 已提交
1667 1668
	int node;

1669 1670
	if (atomic_read(&mem->refcnt) > 0)
		return;
K
KAMEZAWA Hiroyuki 已提交
1671 1672 1673 1674 1675


	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

1676
	if (mem_cgroup_size() < PAGE_SIZE)
1677 1678 1679 1680 1681
		kfree(mem);
	else
		vfree(mem);
}

1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695
/* Take one reference on @mem by incrementing mem->refcnt. */
static void mem_cgroup_get(struct mem_cgroup *mem)
{
	atomic_inc(&mem->refcnt);
}

/*
 * Drop one reference on @mem. When mem->refcnt reaches zero and the group
 * has already been marked obsolete, the mem_cgroup is freed.
 */
static void mem_cgroup_put(struct mem_cgroup *mem)
{
	if (atomic_dec_and_test(&mem->refcnt)) {
		if (!mem->obsolete)
			return;
		mem_cgroup_free(mem);
	}
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * Turn swap accounting on when the memory controller is not disabled and
 * really_do_swap_account is still set.
 */
static void __init enable_swap_cgroup(void)
{
	if (!mem_cgroup_subsys.disabled && really_do_swap_account)
		do_swap_account = 1;
}
#else
/* Swap accounting is compiled out: nothing to enable. */
static void __init enable_swap_cgroup(void)
{
}
#endif

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;
1713
	int node;
B
Balbir Singh 已提交
1714

1715 1716 1717
	mem = mem_cgroup_alloc();
	if (!mem)
		return ERR_PTR(-ENOMEM);
1718

B
Balbir Singh 已提交
1719
	res_counter_init(&mem->res);
1720
	res_counter_init(&mem->memsw);
1721

1722 1723 1724
	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;
1725 1726 1727
	/* root ? */
	if (cont->parent == NULL)
		enable_swap_cgroup();
1728

B
Balbir Singh 已提交
1729
	return &mem->css;
1730 1731
free_out:
	for_each_node_state(node, N_POSSIBLE)
1732
		free_mem_cgroup_per_zone_info(mem, node);
1733
	mem_cgroup_free(mem);
1734
	return ERR_PTR(-ENOMEM);
B
Balbir Singh 已提交
1735 1736
}

1737 1738 1739 1740
static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1741
	mem->obsolete = 1;
1742
	mem_cgroup_force_empty(mem, false);
1743 1744
}

B
Balbir Singh 已提交
1745 1746 1747
/*
 * cgroup "destroy" callback: hand the mem_cgroup of @cont to
 * mem_cgroup_free(), which frees it unless references remain.
 */
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	mem_cgroup_free(mem_cgroup_from_cont(cont));
}

/*
 * cgroup "populate" callback: add the memory control files and, when that
 * succeeded, the memsw files. Returns the first non-zero result, else 0.
 */
static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int ret;

	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
				ARRAY_SIZE(mem_cgroup_files));

	if (!ret)
		ret = register_memsw_files(cont, ss);
	return ret;
}

/*
 * cgroup "attach" callback. As written it only takes and drops a
 * reference on the mm of @p: both branches of the thread-group-leader
 * check end at the same label, and "mem"/"old_mem" are looked up but not
 * used afterwards.
 */
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader
	 */
	if (!thread_group_leader(p))
		goto out;

out:
	mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
1794
	.pre_destroy = mem_cgroup_pre_destroy,
B
Balbir Singh 已提交
1795 1796
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
B
Balbir Singh 已提交
1797
	.attach = mem_cgroup_move_task,
1798
	.early_init = 0,
B
Balbir Singh 已提交
1799
};
1800 1801 1802 1803 1804 1805 1806 1807 1808 1809

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

/*
 * Handler registered for the "noswapaccount" boot option: clears
 * really_do_swap_account so that enable_swap_cgroup() leaves swap
 * accounting off. The option string @s is not looked at.
 * NOTE(review): returning 1 presumably tells __setup() the option was
 * handled; confirm against the __setup() documentation.
 */
static int __init disable_swap_account(char *s)
{
	really_do_swap_account = 0;
	return 1;
}
__setup("noswapaccount", disable_swap_account);
#endif