/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include "internal.h"

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* for remember boot option*/
#else
#define do_swap_account		(0)
#endif


/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};

/*
 * For accounting with irqs disabled, there is no need to increment the preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	stat->count[idx] += val;
}

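/* Sum one per-cpu statistics counter over all possible CPUs. */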
static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	int	prev_priority;	/* for recording reclaim priority */
	int		obsolete;
	atomic_t	refcnt;
	/*
	 * statistics. This must be placed at the end of memcg.
	 */
	struct mem_cgroup_stat stat;
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
	PCGF_USED | PCGF_LOCK, /* Anon */
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
	0, /* FORCE */
};


/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
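/* e.g. MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) keeps the counter type in the
 * upper bits and the RES_* attribute in the low 16 bits. */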

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);

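/* Update this memcg's per-cpu counters for a charge (charge == true) or an
 * uncharge (charge == false) of the page described by @pc. */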
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge)? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;
	int cpu = get_cpu();

	cpustat = &stat->cpustat[cpu];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
	put_cpu();
}

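/* Look up the per-zone information of @mem for the given node/zone pair. */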
210
static struct mem_cgroup_per_zone *
211 212 213 214 215
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

216
static struct mem_cgroup_per_zone *
217 218 219 220 221
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);
222

223 224 225 226
	return mem_cgroup_zoneinfo(mem, nid, zid);
}

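/* Sum the per-zone LRU counter @idx over every online node and zone. */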
static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

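/* Map a task to its mem_cgroup; may be called with p == NULL (see below). */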
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

/*
 * Following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routine of global LRU independently from memcg.
 * What we have to take care of here is validness of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happens when
 * 1. charge
 * 2. moving account
 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
 * It is added to LRU before charge.
 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
 * When moving account, the page is not on LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (list_empty(&pc->lru))
		return;
	mz = page_cgroup_zoneinfo(pc);
	mem = pc->mem_cgroup;
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	list_del_init(&pc->lru);
	return;
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	smp_rmb();
	/* unused page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* barrier to sync with "charge" */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_add(&pc->lru, &mz->lists[lru]);
}
/*
 * To add swapcache into the LRU. Be careful when calling this function:
 * zone->lru_lock must not be held and irqs must not be disabled.
 */
static void mem_cgroup_lru_fixup(struct page *page)
{
	if (!isolate_lru_page(page))
		putback_lru_page(page);
}

void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * Calculate mapped_ratio under the memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}

/*
 * prev_priority control...this will be used in memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	mem->prev_priority = priority;
}

/*
 * Calculate # of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and decremented in each loop.
 * (see include/linux/mmzone.h)
 */

long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
					int priority, enum lru_list lru)
{
	long nr_pages;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);

	return (nr_pages >> priority);
}

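/*
 * Isolate up to @nr_to_scan pages from this cgroup's per-zone LRU list onto
 * @dst and return the number taken. Called from the global reclaim path.
 */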
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * !!file + !!active;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		page = pc->page;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		if (__isolate_lru_page(page, mode, file) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	*scanned = scan;
	return nr_taken;
}

/*
 * Unlike exported interface, "oom" parameter is added. if oom==true,
 * oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg,
			bool oom)
{
	struct mem_cgroup *mem;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct res_counter *fail_res;
	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (likely(!*memcg)) {
		rcu_read_lock();
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem)) {
			rcu_read_unlock();
			return 0;
		}
		/*
		 * For every charge from the cgroup, increment reference count
		 */
		css_get(&mem->css);
		*memcg = mem;
		rcu_read_unlock();
	} else {
		mem = *memcg;
		css_get(&mem->css);
	}

	while (1) {
		int ret;
		bool noswap = false;

		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
							&fail_res);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			noswap = true;
		}
		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up
		 *
		 */
		if (!do_swap_account &&
			res_counter_check_under_limit(&mem->res))
			continue;
		if (do_swap_account &&
			res_counter_check_under_limit(&mem->memsw))
			continue;

		if (!nr_retries--) {
			if (oom)
				mem_cgroup_out_of_memory(mem, gfp_mask);
			goto nomem;
		}
	}
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
}

/**
 * mem_cgroup_try_charge - get charge of PAGE_SIZE.
 * @mm: an mm_struct which is charged against. (when *memcg is NULL)
 * @gfp_mask: gfp_mask for reclaim.
 * @memcg: a pointer to memory cgroup which is charged against.
 *
 * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
 * memory cgroup from @mm is got and stored in *memcg.
 *
 * Returns 0 if success. -ENOMEM at failure.
 * This call can invoke OOM-Killer.
 */

int mem_cgroup_try_charge(struct mm_struct *mm,
			  gfp_t mask, struct mem_cgroup **memcg)
{
	return __mem_cgroup_try_charge(mm, mask, memcg, true);
}

/*
 * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
 * USED state. If already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
		css_put(&mem->css);
		return;
	}
	pc->mem_cgroup = mem;
	smp_wmb();
	pc->flags = pcg_default_flags[ctype];

	mem_cgroup_charge_statistics(mem, pc, true);

	unlock_page_cgroup(pc);
}

/**
 * mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_page() is useful.)
 *
 * returns 0 at success,
 * returns -EBUSY when lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from old cgroup but doesn't do "charge" to
 * new cgroup. It should be done by a caller.
 */

static int mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to)
{
	struct mem_cgroup_per_zone *from_mz, *to_mz;
	int nid, zid;
	int ret = -EBUSY;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));

	nid = page_cgroup_nid(pc);
	zid = page_cgroup_zid(pc);
	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);

	if (!trylock_page_cgroup(pc))
		return ret;

	if (!PageCgroupUsed(pc))
		goto out;

	if (pc->mem_cgroup != from)
		goto out;

	css_put(&from->css);
	res_counter_uncharge(&from->res, PAGE_SIZE);
	mem_cgroup_charge_statistics(from, pc, false);
	if (do_swap_account)
		res_counter_uncharge(&from->memsw, PAGE_SIZE);
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
	css_get(&to->css);
	ret = 0;
out:
	unlock_page_cgroup(pc);
	return ret;
}

/*
 * move charges to its parent.
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;


	parent = mem_cgroup_from_cont(pcg);


	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret)
		return ret;

	if (!get_page_unless_zero(page))
		return -EBUSY;

	ret = isolate_lru_page(page);

	if (ret)
		goto cancel;

	ret = mem_cgroup_move_account(pc, child, parent);

	/* drop extra refcnt by try_charge() (move_account increment one) */
	css_put(&parent->css);
	putback_lru_page(page);
	if (!ret) {
		put_page(page);
		return 0;
	}
	/* uncharge if move fails */
cancel:
	res_counter_uncharge(&parent->res, PAGE_SIZE);
	if (do_swap_account)
		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
	put_page(page);
	return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has address_space.
	 * But page->mapping may hold a stale anon_vma pointer;
	 * detect that with a PageAnon() check: a newly mapped anon page's
	 * page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is called from add_to_page_cache()
	 * in usual. But some FS (shmem) precharges this page before calling it
	 * and call add_to_page_cache() with GFP_NOWAIT.
	 *
	 * For GFP_NOWAIT case, the page may be pre-charged before calling
	 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
	 * charge twice. (It works but has to pay a bit larger cost.)
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;


		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
	else
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
}

int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
				 struct page *page,
				 gfp_t mask, struct mem_cgroup **ptr)
{
	struct mem_cgroup *mem;
	swp_entry_t     ent;

	if (mem_cgroup_disabled())
		return 0;

	if (!do_swap_account)
		goto charge_cur_mm;

	/*
	 * A racing thread's fault, or swapoff, may have already updated
	 * the pte, and even removed page from swap cache: return success
	 * to go on to do_swap_page()'s pte_same() test, which should fail.
	 */
	if (!PageSwapCache(page))
		return 0;

	ent.val = page_private(page);

	mem = lookup_swap_cgroup(ent);
	if (!mem || mem->obsolete)
		goto charge_cur_mm;
	*ptr = mem;
	return __mem_cgroup_try_charge(NULL, mask, ptr, true);
charge_cur_mm:
	if (unlikely(!mm))
		mm = &init_mm;
	return __mem_cgroup_try_charge(mm, mask, ptr, true);
}

#ifdef CONFIG_SWAP

int mem_cgroup_cache_charge_swapin(struct page *page,
			struct mm_struct *mm, gfp_t mask, bool locked)
{
	int ret = 0;

	if (mem_cgroup_disabled())
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	if (!locked)
		lock_page(page);
	/*
	 * If not locked, the page can be dropped from SwapCache until
	 * we reach here.
	 */
	if (PageSwapCache(page)) {
		struct mem_cgroup *mem = NULL;
		swp_entry_t ent;

		ent.val = page_private(page);
		if (do_swap_account) {
			mem = lookup_swap_cgroup(ent);
			if (mem && mem->obsolete)
				mem = NULL;
			if (mem)
				mm = NULL;
		}
		ret = mem_cgroup_charge_common(page, mm, mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);

		if (!ret && do_swap_account) {
			/* avoid double counting */
			mem = swap_cgroup_record(ent, NULL);
			if (mem) {
				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
				mem_cgroup_put(mem);
			}
		}
	}
	if (!locked)
		unlock_page(page);
	/* add this page(page_cgroup) to the LRU we want. */
	mem_cgroup_lru_fixup(page);

	return ret;
}
#endif

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;
	if (!ptr)
		return;
	pc = lookup_page_cgroup(page);
	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
	/*
	 * Now swap is on-memory. This means this page may be
	 * counted both as mem and swap....double count.
	 * Fix it by uncharging from memsw. This SwapCache is stable
	 * because we're still under lock_page().
	 */
	if (do_swap_account) {
		swp_entry_t ent = {.val = page_private(page)};
		struct mem_cgroup *memcg;
		memcg = swap_cgroup_record(ent, NULL);
		if (memcg) {
			/* If memcg is obsolete, memcg can be != ptr */
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_put(memcg);
		}

	}
	/* add this page(page_cgroup) to the LRU we want. */
	mem_cgroup_lru_fixup(page);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled())
		return;
	if (!mem)
		return;
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (do_swap_account)
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
	css_put(&mem->css);
}


/*
 * uncharge if !page_mapped(page)
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	if (PageSwapCache(page))
		return NULL;

	/*
	 * Check if our page_cgroup is valid
	 */
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
		return NULL;

	lock_page_cgroup(pc);

	mem = pc->mem_cgroup;

	if (!PageCgroupUsed(pc))
		goto unlock_out;

	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		if (page_mapped(page))
			goto unlock_out;
		break;
	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
		if (!PageAnon(page)) {	/* Shared memory */
			if (page->mapping && !page_is_file_cache(page))
				goto unlock_out;
		} else if (page_mapped(page)) /* Anon */
				goto unlock_out;
		break;
	default:
		break;
	}

	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);

	mem_cgroup_charge_statistics(mem, pc, false);
	ClearPageCgroupUsed(pc);

	mz = page_cgroup_zoneinfo(pc);
	unlock_page_cgroup(pc);

	css_put(&mem->css);

	return mem;

unlock_out:
	unlock_page_cgroup(pc);
	return NULL;
}

void mem_cgroup_uncharge_page(struct page *page)
{
	/* early check. */
	if (page_mapped(page))
		return;
	if (page->mapping && !PageAnon(page))
		return;
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

/*
 * called from __delete_from_swap_cache() and drop "page" account.
 * memcg information is recorded to swap_cgroup of "ent"
 */
void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
{
	struct mem_cgroup *memcg;

	memcg = __mem_cgroup_uncharge_common(page,
					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
	/* record memcg information */
	if (do_swap_account && memcg) {
		swap_cgroup_record(ent, memcg);
		mem_cgroup_get(memcg);
	}
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * called from swap_entry_free(). remove record in swap_cgroup and
 * uncharge "memsw" account.
 */
void mem_cgroup_uncharge_swap(swp_entry_t ent)
{
	struct mem_cgroup *memcg;

	if (!do_swap_account)
		return;

	memcg = swap_cgroup_record(ent, NULL);
	if (memcg) {
		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_put(memcg);
	}
}
#endif

/*
 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
 * page belongs to.
 */
int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	int ret = 0;

	if (mem_cgroup_disabled())
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
	}
	unlock_page_cgroup(pc);

	if (mem) {
		ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem);
		css_put(&mem->css);
	}
	*ptr = mem;
	return ret;
}

/* remove redundant charge if migration failed */
void mem_cgroup_end_migration(struct mem_cgroup *mem,
		struct page *oldpage, struct page *newpage)
{
	struct page *target, *unused;
	struct page_cgroup *pc;
	enum charge_type ctype;

	if (!mem)
		return;

	/* at migration success, oldpage->mapping is NULL. */
	if (oldpage->mapping) {
		target = oldpage;
		unused = NULL;
	} else {
		target = newpage;
		unused = oldpage;
	}

	if (PageAnon(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

	/* unused page is not on radix-tree now. */
	if (unused)
		__mem_cgroup_uncharge_common(unused, ctype);

	pc = lookup_page_cgroup(target);
	/*
	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of page_cgroup.
	 * So, double-counting is effectively avoided.
	 */
	__mem_cgroup_commit_charge(mem, pc, ctype);

	/*
	 * Both of oldpage and newpage are still under lock_page().
	 * Then, we don't have to care about race in radix-tree.
	 * But we have to be careful that this page is unmapped or not.
	 *
	 * There is a case for !page_mapped(). At the start of
	 * migration, oldpage was mapped. But now, it's zapped.
	 * But we know *target* page is not freed/reused under us.
	 * mem_cgroup_uncharge_page() does all necessary checks.
	 */
	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
		mem_cgroup_uncharge_page(target);
}

/*
 * A call to try to shrink memory usage under specified resource controller.
 * This is typically used for page reclaiming for shmem for reducing side
 * effect of page allocation from shmem, which is used by some mem_cgroup.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
	struct mem_cgroup *mem;
	int progress = 0;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_disabled())
		return 0;
	if (!mm)
		return 0;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {
		rcu_read_unlock();
		return 0;
	}
	css_get(&mem->css);
	rcu_read_unlock();

	do {
		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
		progress += res_counter_check_under_limit(&mem->res);
	} while (!progress && --retry);

	css_put(&mem->css);
	if (!retry)
		return -ENOMEM;
	return 0;
}

static DEFINE_MUTEX(set_limit_mutex);

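/* Set a new limit on the "mem" counter, reclaiming and retrying while the
 * current usage is above the requested value. */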
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				unsigned long long val)
{

	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	int progress;
	u64 memswlimit;
	int ret = 0;

	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hide all in some function, I do this in
		 * open coded manner. You see what this really does.
		 * We have to guarantee mem->res.limit < mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		if (memswlimit < val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		ret = res_counter_set_limit(&memcg->res, val);
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		progress = try_to_free_mem_cgroup_pages(memcg,
				GFP_HIGHUSER_MOVABLE, false);
		if (!progress)
			retry_count--;
	}
	return ret;
}

int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
				unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	u64 memlimit, oldusage, curusage;
	int ret;

	if (!do_swap_account)
		return -EINVAL;

	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hide all in some function, I do this in
		 * open coded manner. You see what this really does.
		 * We have to guarantee mem->res.limit < mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
		if (memlimit > val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		ret = res_counter_set_limit(&memcg->memsw, val);
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true);
		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		if (curusage >= oldusage)
			retry_count--;
	}
	return ret;
}

/*
 * This routine traverse page_cgroup in given list and drop them all.
 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
				int node, int zid, enum lru_list lru)
{
	struct zone *zone;
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc, *busy;
	unsigned long flags, loop;
	struct list_head *list;
	int ret = 0;

	zone = &NODE_DATA(node)->node_zones[zid];
	mz = mem_cgroup_zoneinfo(mem, node, zid);
	list = &mz->lists[lru];

	loop = MEM_CGROUP_ZSTAT(mz, lru);
	/* give some margin against EBUSY etc...*/
	loop += 256;
	busy = NULL;
	while (loop--) {
		ret = 0;
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			break;
		}
		pc = list_entry(list->prev, struct page_cgroup, lru);
		if (busy == pc) {
			list_move(&pc->lru, list);
			busy = NULL;
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&zone->lru_lock, flags);

		ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
		if (ret == -ENOMEM)
			break;

		if (ret == -EBUSY || ret == -EINVAL) {
			/* found lock contention or "pc" is obsolete. */
			busy = pc;
			cond_resched();
		} else
			busy = NULL;
	}

	if (!ret && !list_empty(list))
		return -EBUSY;
	return ret;
}

/*
 * make mem_cgroup's charge to be 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
{
	int ret;
	int node, zid, shrink;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct cgroup *cgrp = mem->css.cgroup;

	css_get(&mem->css);

	shrink = 0;
	/* should free all ? */
	if (free_all)
		goto try_to_free;
move_account:
	while (mem->res.usage > 0) {
		ret = -EBUSY;
		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			goto out;
		ret = -EINTR;
		if (signal_pending(current))
			goto out;
		/* This is for making all *used* pages to be on LRU. */
		lru_add_drain_all();
		ret = 0;
		for_each_node_state(node, N_POSSIBLE) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
				enum lru_list l;
				for_each_lru(l) {
					ret = mem_cgroup_force_empty_list(mem,
							node, zid, l);
					if (ret)
						break;
				}
			}
			if (ret)
				break;
		}
		/* it seems parent cgroup doesn't have enough mem */
		if (ret == -ENOMEM)
			goto try_to_free;
		cond_resched();
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;

try_to_free:
	/* returns EBUSY if there is a task or if we come here twice. */
	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
		ret = -EBUSY;
		goto out;
	}
	/* we call try-to-free pages for make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
	shrink = 1;
	while (nr_retries && mem->res.usage > 0) {
		int progress;

		if (signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		progress = try_to_free_mem_cgroup_pages(mem,
						  GFP_HIGHUSER_MOVABLE, false);
		if (!progress) {
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(WRITE, HZ/10);
		}

	}
	lru_add_drain();
	/* try move_account...there may be some *locked* pages. */
	if (mem->res.usage)
		goto move_account;
	ret = 0;
	goto out;
}

int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}


static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	u64 val = 0;
	int type, name;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (type) {
	case _MEM:
		val = res_counter_read_u64(&mem->res, name);
		break;
	case _MEMSWAP:
		if (do_swap_account)
			val = res_counter_read_u64(&mem->memsw, name);
		break;
	default:
		BUG();
		break;
	}
	return val;
}
/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	int type, name;
	unsigned long long val;
	int ret;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (name) {
	case RES_LIMIT:
		/* This function does all necessary parse...reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (ret)
			break;
		if (type == _MEM)
			ret = mem_cgroup_resize_limit(memcg, val);
		else
			ret = mem_cgroup_resize_memsw_limit(memcg, val);
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
}

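/* Reset max_usage or failcnt of either the memory or the mem+swap counter. */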
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;
	int type, name;

	mem = mem_cgroup_from_cont(cont);
	type = MEMFILE_TYPE(event);
	name = MEMFILE_ATTR(event);
	switch (name) {
	case RES_MAX_USAGE:
		if (type == _MEM)
			res_counter_reset_max(&mem->res);
		else
			res_counter_reset_max(&mem->memsw);
		break;
	case RES_FAILCNT:
		if (type == _MEM)
			res_counter_reset_failcnt(&mem->res);
		else
			res_counter_reset_failcnt(&mem->memsw);
		break;
	}
	return 0;
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
};

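/* Report the per-memcg statistics and LRU sizes through the "stat" file. */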
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
	}
	/* showing # of active pages */
	{
		unsigned long active_anon, inactive_anon;
		unsigned long active_file, inactive_file;
		unsigned long unevictable;

		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_ANON);
		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_ANON);
		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_FILE);
		active_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_FILE);
		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
							LRU_UNEVICTABLE);

		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);

	}
	return 0;
}


static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "stat",
		.read_map = mem_control_stat_show,
	},
	{
		.name = "force_empty",
		.trigger = mem_cgroup_force_empty_write,
	},
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
};

static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	if (!do_swap_account)
		return 0;
	return cgroup_add_files(cont, ss, memsw_cgroup_files,
				ARRAY_SIZE(memsw_cgroup_files));
};
#else
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use memory hotplug callback
	 *       function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		for_each_lru(l)
			INIT_LIST_HEAD(&mz->lists[l]);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

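/* Allocation size of a mem_cgroup, including the per-cpu stat array at its end. */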
static int mem_cgroup_size(void)
{
	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
	return sizeof(struct mem_cgroup) + cpustat_size;
}

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	int size = mem_cgroup_size();

	if (size < PAGE_SIZE)
		mem = kmalloc(size, GFP_KERNEL);
	else
		mem = vmalloc(size);

	if (mem)
		memset(mem, 0, size);
	return mem;
}

/*
 * At destroying mem_cgroup, references from swap_cgroup can remain.
 * (scanning all at force_empty is too costly...)
 *
 * Instead of clearing all references at force_empty, we remember
 * the number of reference from swap_cgroup and free mem_cgroup when
 * it goes down to 0.
 *
 * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and
 * entry which points to this memcg will be ignore at swapin.
 *
 * Removal of cgroup itself succeeds regardless of refs from swap.
 */

static void mem_cgroup_free(struct mem_cgroup *mem)
{
	int node;

	if (atomic_read(&mem->refcnt) > 0)
		return;


	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	if (mem_cgroup_size() < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
}

static void mem_cgroup_get(struct mem_cgroup *mem)
{
	atomic_inc(&mem->refcnt);
}

static void mem_cgroup_put(struct mem_cgroup *mem)
{
	if (atomic_dec_and_test(&mem->refcnt)) {
		if (!mem->obsolete)
			return;
		mem_cgroup_free(mem);
	}
}


#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account)
		do_swap_account = 1;
}
#else
static void __init enable_swap_cgroup(void)
{
}
#endif

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem, *parent;
	int node;

	mem = mem_cgroup_alloc();
	if (!mem)
		return ERR_PTR(-ENOMEM);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;
	/* root ? */
	if (cont->parent == NULL) {
		enable_swap_cgroup();
		parent = NULL;
	} else
		parent = mem_cgroup_from_cont(cont->parent);

	res_counter_init(&mem->res, parent ? &parent->res : NULL);
	res_counter_init(&mem->memsw, parent ? &parent->memsw : NULL);


	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	mem_cgroup_free(mem);
	return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem->obsolete = 1;
	mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	mem_cgroup_free(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int ret;

	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
				ARRAY_SIZE(mem_cgroup_files));

	if (!ret)
		ret = register_memsw_files(cont, ss);
	return ret;
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader
	 */
	if (!thread_group_leader(p))
		goto out;

out:
	mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static int __init disable_swap_account(char *s)
{
	really_do_swap_account = 0;
	return 1;
}
__setup("noswapaccount", disable_swap_account);
#endif