#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>

#include <asm/elf.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include "internal.h"

void task_mem(struct seq_file *m, struct mm_struct *mm)
{
	unsigned long data, text, lib, swap, ptes, pmds;
	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

	/*
	 * Note: to minimize their overhead, mm maintains hiwater_vm and
	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
	 * collector of these hiwater stats must therefore get total_vm
	 * and rss too, which will usually be the higher.  Barriers? Not
	 * worth the effort; such snapshots can always be inconsistent.
	 */
	hiwater_vm = total_vm = mm->total_vm;
	if (hiwater_vm < mm->hiwater_vm)
		hiwater_vm = mm->hiwater_vm;
	hiwater_rss = total_rss = get_mm_rss(mm);
	if (hiwater_rss < mm->hiwater_rss)
		hiwater_rss = mm->hiwater_rss;

	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
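	/*
	 * The "<< (PAGE_SHIFT-10)" conversions here and below turn page
	 * counts into kB: with 4 KiB pages (PAGE_SHIFT == 12) that is a
	 * shift by 2, so for example 25 pages become 100 kB.
	 */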
	swap = get_mm_counter(mm, MM_SWAPENTS);
	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
	seq_printf(m,
		"VmPeak:\t%8lu kB\n"
		"VmSize:\t%8lu kB\n"
		"VmLck:\t%8lu kB\n"
		"VmPin:\t%8lu kB\n"
		"VmHWM:\t%8lu kB\n"
		"VmRSS:\t%8lu kB\n"
		"VmData:\t%8lu kB\n"
		"VmStk:\t%8lu kB\n"
		"VmExe:\t%8lu kB\n"
		"VmLib:\t%8lu kB\n"
		"VmPTE:\t%8lu kB\n"
		"VmPMD:\t%8lu kB\n"
		"VmSwap:\t%8lu kB\n",
		hiwater_vm << (PAGE_SHIFT-10),
		total_vm << (PAGE_SHIFT-10),
		mm->locked_vm << (PAGE_SHIFT-10),
		mm->pinned_vm << (PAGE_SHIFT-10),
		hiwater_rss << (PAGE_SHIFT-10),
		total_rss << (PAGE_SHIFT-10),
		data << (PAGE_SHIFT-10),
		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
		ptes >> 10,
		pmds >> 10,
		swap << (PAGE_SHIFT-10));
	hugetlb_report_usage(m, mm);
}

unsigned long task_vsize(struct mm_struct *mm)
{
	return PAGE_SIZE * mm->total_vm;
}

unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter(mm, MM_FILEPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->total_vm - mm->shared_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
	return mm->total_vm;
}

#ifdef CONFIG_NUMA
/*
 * Save get_task_policy() for show_numa_map().
 */
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
	struct task_struct *task = priv->task;

	task_lock(task);
	priv->task_mempolicy = get_task_policy(task);
	mpol_get(priv->task_mempolicy);
	task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
	mpol_put(priv->task_mempolicy);
}
#else
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
}
#endif

static void vma_stop(struct proc_maps_private *priv)
{
	struct mm_struct *mm = priv->mm;

	release_task_mempolicy(priv);
	up_read(&mm->mmap_sem);
	mmput(mm);
}

static struct vm_area_struct *
m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
	if (vma == priv->tail_vma)
		return NULL;
	return vma->vm_next ?: priv->tail_vma;
}

static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
{
	if (m->count < m->size)	/* vma is copied successfully */
		m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
}
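
/*
 * m->version caches the start address of the vma just emitted, so that
 * a later read() can find_vma() it and resume with its successor
 * instead of rewalking the vma list from mm->mmap; -1UL marks the end.
 */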

static void *m_start(struct seq_file *m, loff_t *ppos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = m->version;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned int pos = *ppos;

	/* See m_cache_vma(). Zero at the start or after lseek. */
	if (last_addr == -1UL)
		return NULL;

	priv->task = get_proc_task(priv->inode);
	if (!priv->task)
		return ERR_PTR(-ESRCH);

	mm = priv->mm;
	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
		return NULL;

	down_read(&mm->mmap_sem);
	hold_task_mempolicy(priv);
	priv->tail_vma = get_gate_vma(mm);

	if (last_addr) {
		vma = find_vma(mm, last_addr);
		if (vma && (vma = m_next_vma(priv, vma)))
			return vma;
	}

	m->version = 0;
	if (pos < mm->map_count) {
		for (vma = mm->mmap; pos; pos--) {
			m->version = vma->vm_start;
			vma = vma->vm_next;
		}
		return vma;
	}

	/* we do not bother to update m->version in this case */
	if (pos == mm->map_count && priv->tail_vma)
		return priv->tail_vma;

	vma_stop(priv);
	return NULL;
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *next;

	(*pos)++;
	next = m_next_vma(priv, v);
	if (!next)
		vma_stop(priv);
	return next;
}

static void m_stop(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;

	if (!IS_ERR_OR_NULL(v))
		vma_stop(priv);
	if (priv->task) {
		put_task_struct(priv->task);
		priv->task = NULL;
	}
}

static int proc_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops, int psize)
{
	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);

	if (!priv)
		return -ENOMEM;

	priv->inode = inode;
	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR(priv->mm)) {
		int err = PTR_ERR(priv->mm);

		seq_release_private(inode, file);
		return err;
	}

	return 0;
}
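
/*
 * proc_mem_open() hands back the mm with a reference on mm_count only,
 * so the address space itself is not pinned while the file stays open;
 * the matching release below is therefore mmdrop(), not mmput().  Each
 * walk re-takes mm_users with atomic_inc_not_zero() in m_start().
 */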

static int proc_map_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	if (priv->mm)
		mmdrop(priv->mm);

	return seq_release_private(inode, file);
}

static int do_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops)
{
	return proc_maps_open(inode, file, ops,
				sizeof(struct proc_maps_private));
}

static pid_t pid_of_stack(struct proc_maps_private *priv,
				struct vm_area_struct *vma, bool is_pid)
{
	struct inode *inode = priv->inode;
	struct task_struct *task;
	pid_t ret = 0;

	rcu_read_lock();
	task = pid_task(proc_pid(inode), PIDTYPE_PID);
	if (task) {
		task = task_of_stack(task, vma, is_pid);
		if (task)
			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
	}
	rcu_read_unlock();

	return ret;
}

static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	struct proc_maps_private *priv = m->private;
	vm_flags_t flags = vma->vm_flags;
	unsigned long ino = 0;
	unsigned long long pgoff = 0;
	unsigned long start, end;
	dev_t dev = 0;
	const char *name = NULL;

	if (file) {
		struct inode *inode = file_inode(vma->vm_file);
		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
	}

	/* We don't show the stack guard page in /proc/maps */
	start = vma->vm_start;
	if (stack_guard_page_start(vma, start))
		start += PAGE_SIZE;
	end = vma->vm_end;
	if (stack_guard_page_end(vma, end))
		end -= PAGE_SIZE;

	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
			start,
			end,
			flags & VM_READ ? 'r' : '-',
			flags & VM_WRITE ? 'w' : '-',
			flags & VM_EXEC ? 'x' : '-',
			flags & VM_MAYSHARE ? 's' : 'p',
			pgoff,
			MAJOR(dev), MINOR(dev), ino);

	/*
	 * Print the dentry name for named mappings, and a
	 * special [heap] marker for the heap:
	 */
	if (file) {
		seq_pad(m, ' ');
		seq_file_path(m, file, "\n");
		goto done;
	}

	if (vma->vm_ops && vma->vm_ops->name) {
		name = vma->vm_ops->name(vma);
		if (name)
			goto done;
	}

	name = arch_vma_name(vma);
	if (!name) {
		pid_t tid;

		if (!mm) {
			name = "[vdso]";
			goto done;
		}

		if (vma->vm_start <= mm->brk &&
		    vma->vm_end >= mm->start_brk) {
			name = "[heap]";
			goto done;
		}

		tid = pid_of_stack(priv, vma, is_pid);
		if (tid != 0) {
			/*
			 * Thread stack in /proc/PID/task/TID/maps or
			 * the main process stack.
			 */
			if (!is_pid || (vma->vm_start <= mm->start_stack &&
			    vma->vm_end >= mm->start_stack)) {
				name = "[stack]";
			} else {
				/* Thread stack in /proc/PID/maps */
				seq_pad(m, ' ');
				seq_printf(m, "[stack:%d]", tid);
			}
		}
	}

done:
	if (name) {
		seq_pad(m, ' ');
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
}

static int show_map(struct seq_file *m, void *v, int is_pid)
{
	show_map_vma(m, v, is_pid);
	m_cache_vma(m, v);
	return 0;
}

static int show_pid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 1);
}

static int show_tid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 0);
}

static const struct seq_operations proc_pid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_map
};

static const struct seq_operations proc_tid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_map
};

static int pid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_maps_op);
}

static int tid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_maps_op);
}

const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

const struct file_operations proc_tid_maps_operations = {
	.open		= tid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

/*
 * Proportional Set Size (PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it.  So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep (accumulated) division errors low, we adopt a 64bit
 * fixed-point pss counter to minimize division errors. So (pss >>
 * PSS_SHIFT) would be the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 * 	- 1M 3-user-pages add up to 8KB errors;
 * 	- supports mapcount up to 2^24, or 16M;
 * 	- supports PSS up to 2^52 bytes, or 4PB.
 */
#define PSS_SHIFT 12
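
/*
 * Worked example with 4 KiB pages (illustrative numbers): a page mapped
 * by three processes adds (4096 << PSS_SHIFT) / 3 == 5592405 to pss,
 * i.e. 5592405 >> PSS_SHIFT == 1365 bytes of PSS, so the per-page
 * rounding error stays below one byte.
 */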

#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
	unsigned long resident;
	unsigned long shared_clean;
	unsigned long shared_dirty;
	unsigned long private_clean;
	unsigned long private_dirty;
	unsigned long referenced;
	unsigned long anonymous;
	unsigned long anonymous_thp;
	unsigned long swap;
	unsigned long shared_hugetlb;
	unsigned long private_hugetlb;
	u64 pss;
	u64 swap_pss;
	bool check_shmem_swap;
};

static void smaps_account(struct mem_size_stats *mss, struct page *page,
		unsigned long size, bool young, bool dirty)
{
	int mapcount;

	if (PageAnon(page))
		mss->anonymous += size;

	mss->resident += size;
	/* Accumulate the size in pages that have been accessed. */
	if (young || page_is_young(page) || PageReferenced(page))
		mss->referenced += size;
	mapcount = page_mapcount(page);
	if (mapcount >= 2) {
		u64 pss_delta;

		if (dirty || PageDirty(page))
			mss->shared_dirty += size;
		else
			mss->shared_clean += size;
		pss_delta = (u64)size << PSS_SHIFT;
		do_div(pss_delta, mapcount);
		mss->pss += pss_delta;
	} else {
		if (dirty || PageDirty(page))
			mss->private_dirty += size;
		else
			mss->private_clean += size;
		mss->pss += (u64)size << PSS_SHIFT;
	}
}

#ifdef CONFIG_SHMEM
static unsigned long smaps_shmem_swap(struct vm_area_struct *vma,
		unsigned long addr)
{
	struct page *page;

	page = find_get_entry(vma->vm_file->f_mapping,
					linear_page_index(vma, addr));
	if (!page)
		return 0;
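
	/*
	 * A radix tree exceptional entry here is a swap entry stored in
	 * the shmem mapping: the page at this offset is swapped out.
	 */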

	if (radix_tree_exceptional_entry(page))
		return PAGE_SIZE;

	page_cache_release(page);
	return 0;

}

static int smaps_pte_hole(unsigned long addr, unsigned long end,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;

	while (addr < end) {
		mss->swap += smaps_shmem_swap(walk->vma, addr);
		addr += PAGE_SIZE;
	}

	return 0;
}
#else
static unsigned long smaps_shmem_swap(struct vm_area_struct *vma,
		unsigned long addr)
{
	return 0;
}
#endif

static void smaps_pte_entry(pte_t *pte, unsigned long addr,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct page *page = NULL;

	if (pte_present(*pte)) {
		page = vm_normal_page(vma, addr, *pte);
	} else if (is_swap_pte(*pte)) {
		swp_entry_t swpent = pte_to_swp_entry(*pte);

		if (!non_swap_entry(swpent)) {
			int mapcount;

			mss->swap += PAGE_SIZE;
			mapcount = swp_swapcount(swpent);
			if (mapcount >= 2) {
				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;

				do_div(pss_delta, mapcount);
				mss->swap_pss += pss_delta;
			} else {
				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
			}
		} else if (is_migration_entry(swpent))
			page = migration_entry_to_page(swpent);
	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
							&& pte_none(*pte))) {
		mss->swap += smaps_shmem_swap(vma, addr);
	}

	if (!page)
		return;
	smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct page *page;

	/* FOLL_DUMP will return -EFAULT on huge zero page */
	page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
	if (IS_ERR_OR_NULL(page))
		return;
	mss->anonymous_thp += HPAGE_PMD_SIZE;
	smaps_account(mss, page, HPAGE_PMD_SIZE,
			pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
		struct mm_walk *walk)
{
}
#endif

static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		smaps_pmd_entry(pmd, addr, walk);
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	/*
	 * The mmap_sem held all the way back in m_start() is what
	 * keeps khugepaged out of here and from collapsing things
	 * in here.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		smaps_pte_entry(pte, addr, walk);
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
	/*
	 * Don't forget to update Documentation/ on changes.
	 */
	static const char mnemonics[BITS_PER_LONG][2] = {
		/*
		 * In case we meet a flag we don't know about.
		 */
		[0 ... (BITS_PER_LONG-1)] = "??",

		[ilog2(VM_READ)]	= "rd",
		[ilog2(VM_WRITE)]	= "wr",
		[ilog2(VM_EXEC)]	= "ex",
		[ilog2(VM_SHARED)]	= "sh",
		[ilog2(VM_MAYREAD)]	= "mr",
		[ilog2(VM_MAYWRITE)]	= "mw",
		[ilog2(VM_MAYEXEC)]	= "me",
		[ilog2(VM_MAYSHARE)]	= "ms",
		[ilog2(VM_GROWSDOWN)]	= "gd",
		[ilog2(VM_PFNMAP)]	= "pf",
		[ilog2(VM_DENYWRITE)]	= "dw",
#ifdef CONFIG_X86_INTEL_MPX
		[ilog2(VM_MPX)]		= "mp",
#endif
		[ilog2(VM_LOCKED)]	= "lo",
		[ilog2(VM_IO)]		= "io",
		[ilog2(VM_SEQ_READ)]	= "sr",
		[ilog2(VM_RAND_READ)]	= "rr",
		[ilog2(VM_DONTCOPY)]	= "dc",
		[ilog2(VM_DONTEXPAND)]	= "de",
		[ilog2(VM_ACCOUNT)]	= "ac",
		[ilog2(VM_NORESERVE)]	= "nr",
		[ilog2(VM_HUGETLB)]	= "ht",
		[ilog2(VM_ARCH_1)]	= "ar",
		[ilog2(VM_DONTDUMP)]	= "dd",
#ifdef CONFIG_MEM_SOFT_DIRTY
		[ilog2(VM_SOFTDIRTY)]	= "sd",
#endif
		[ilog2(VM_MIXEDMAP)]	= "mm",
		[ilog2(VM_HUGEPAGE)]	= "hg",
		[ilog2(VM_NOHUGEPAGE)]	= "nh",
		[ilog2(VM_MERGEABLE)]	= "mg",
		[ilog2(VM_UFFD_MISSING)]= "um",
		[ilog2(VM_UFFD_WP)]	= "uw",
	};
	size_t i;

	seq_puts(m, "VmFlags: ");
	for (i = 0; i < BITS_PER_LONG; i++) {
		if (vma->vm_flags & (1UL << i)) {
			seq_printf(m, "%c%c ",
				   mnemonics[i][0], mnemonics[i][1]);
		}
	}
	seq_putc(m, '\n');
}
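
/*
 * Example of the resulting line (illustrative only; the exact flags
 * depend on the mapping): "VmFlags: rd ex mr mw me dw".
 */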

#ifdef CONFIG_HUGETLB_PAGE
static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct page *page = NULL;

	if (pte_present(*pte)) {
		page = vm_normal_page(vma, addr, *pte);
	} else if (is_swap_pte(*pte)) {
		swp_entry_t swpent = pte_to_swp_entry(*pte);

		if (is_migration_entry(swpent))
			page = migration_entry_to_page(swpent);
	}
	if (page) {
		int mapcount = page_mapcount(page);

		if (mapcount >= 2)
			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
		else
			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
	}
	return 0;
}
#endif /* HUGETLB_PAGE */

static int show_smap(struct seq_file *m, void *v, int is_pid)
{
	struct vm_area_struct *vma = v;
	struct mem_size_stats mss;
	struct mm_walk smaps_walk = {
		.pmd_entry = smaps_pte_range,
#ifdef CONFIG_HUGETLB_PAGE
		.hugetlb_entry = smaps_hugetlb_range,
#endif
		.mm = vma->vm_mm,
		.private = &mss,
	};

	memset(&mss, 0, sizeof mss);

#ifdef CONFIG_SHMEM
	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
		mss.check_shmem_swap = true;
		smaps_walk.pte_hole = smaps_pte_hole;
	}
#endif
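
	/*
	 * For a shmem mapping, pages may live only in the page cache or
	 * in swap with no ptes present at all; the pte_hole callback
	 * installed above lets the walk account that swap as well.
	 */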

	/* mmap_sem is held in m_start */
	walk_page_vma(vma, &smaps_walk);

	show_map_vma(m, vma, is_pid);

	seq_printf(m,
		   "Size:           %8lu kB\n"
		   "Rss:            %8lu kB\n"
		   "Pss:            %8lu kB\n"
		   "Shared_Clean:   %8lu kB\n"
		   "Shared_Dirty:   %8lu kB\n"
		   "Private_Clean:  %8lu kB\n"
		   "Private_Dirty:  %8lu kB\n"
		   "Referenced:     %8lu kB\n"
		   "Anonymous:      %8lu kB\n"
		   "AnonHugePages:  %8lu kB\n"
		   "Shared_Hugetlb: %8lu kB\n"
		   "Private_Hugetlb: %7lu kB\n"
		   "Swap:           %8lu kB\n"
		   "SwapPss:        %8lu kB\n"
		   "KernelPageSize: %8lu kB\n"
		   "MMUPageSize:    %8lu kB\n"
		   "Locked:         %8lu kB\n",
		   (vma->vm_end - vma->vm_start) >> 10,
		   mss.resident >> 10,
		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
		   mss.shared_clean  >> 10,
		   mss.shared_dirty  >> 10,
		   mss.private_clean >> 10,
		   mss.private_dirty >> 10,
		   mss.referenced >> 10,
		   mss.anonymous >> 10,
		   mss.anonymous_thp >> 10,
		   mss.shared_hugetlb >> 10,
		   mss.private_hugetlb >> 10,
		   mss.swap >> 10,
		   (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
		   vma_kernel_pagesize(vma) >> 10,
		   vma_mmu_pagesize(vma) >> 10,
		   (vma->vm_flags & VM_LOCKED) ?
			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);

	show_smap_vma_flags(m, vma);
	m_cache_vma(m, vma);
	return 0;
}

static int show_pid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 1);
}

static int show_tid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 0);
}

static const struct seq_operations proc_pid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_smap
};

static const struct seq_operations proc_tid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_smap
};

static int pid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_smaps_op);
}

static int tid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_smaps_op);
}

const struct file_operations proc_pid_smaps_operations = {
	.open		= pid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

const struct file_operations proc_tid_smaps_operations = {
	.open		= tid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

enum clear_refs_types {
	CLEAR_REFS_ALL = 1,
	CLEAR_REFS_ANON,
	CLEAR_REFS_MAPPED,
	CLEAR_REFS_SOFT_DIRTY,
	CLEAR_REFS_MM_HIWATER_RSS,
	CLEAR_REFS_LAST,
};

struct clear_refs_private {
	enum clear_refs_types type;
};

#ifdef CONFIG_MEM_SOFT_DIRTY
static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
	/*
	 * The soft-dirty tracker uses #PF-s to catch writes
	 * to pages, so write-protect the pte as well. See the
	 * Documentation/vm/soft-dirty.txt for full description
	 * of how soft-dirty works.
	 */
	pte_t ptent = *pte;

	if (pte_present(ptent)) {
		ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
		ptent = pte_wrprotect(ptent);
		ptent = pte_clear_soft_dirty(ptent);
		ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_clear_soft_dirty(ptent);
		set_pte_at(vma->vm_mm, addr, pte, ptent);
	}
}
#else
static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
}
#endif

#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmdp)
{
	pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);

	pmd = pmd_wrprotect(pmd);
	pmd = pmd_clear_soft_dirty(pmd);

	if (vma->vm_flags & VM_SOFTDIRTY)
		vma->vm_flags &= ~VM_SOFTDIRTY;

	set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
#else
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmdp)
{
}
#endif

static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct clear_refs_private *cp = walk->private;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte, ptent;
	spinlock_t *ptl;
	struct page *page;

	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty_pmd(vma, addr, pmd);
			goto out;
		}

		page = pmd_page(*pmd);

		/* Clear accessed and referenced bits. */
		pmdp_test_and_clear_young(vma, addr, pmd);
		test_and_clear_page_young(page);
		ClearPageReferenced(page);
out:
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty(vma, addr, pte);
			continue;
		}

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/* Clear accessed and referenced bits. */
		ptep_test_and_clear_young(vma, addr, pte);
		test_and_clear_page_young(page);
		ClearPageReferenced(page);
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static int clear_refs_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct clear_refs_private *cp = walk->private;
	struct vm_area_struct *vma = walk->vma;

	if (vma->vm_flags & VM_PFNMAP)
		return 1;

	/*
	 * Writing 1 to /proc/pid/clear_refs affects all pages.
	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
	 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
	 * Writing 4 to /proc/pid/clear_refs affects all pages.
	 */
	if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
		return 1;
	if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
		return 1;
	return 0;
}
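
/*
 * Typical use from userspace (a sketch): clear the referenced bits,
 * let the task run for an interval, then sample them again, e.g.
 *
 *	echo 1 > /proc/$pid/clear_refs
 *	...
 *	grep -i referenced /proc/$pid/smaps
 */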

static ssize_t clear_refs_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF];
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	enum clear_refs_types type;
	int itype;
	int rv;

	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	rv = kstrtoint(strstrip(buffer), 10, &itype);
	if (rv < 0)
		return rv;
	type = (enum clear_refs_types)itype;
	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
		return -EINVAL;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
	mm = get_task_mm(task);
	if (mm) {
		struct clear_refs_private cp = {
			.type = type,
		};
		struct mm_walk clear_refs_walk = {
			.pmd_entry = clear_refs_pte_range,
			.test_walk = clear_refs_test_walk,
			.mm = mm,
			.private = &cp,
		};

		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
			/*
			 * Writing 5 to /proc/pid/clear_refs resets the peak
			 * resident set size to this mm's current rss value.
			 */
			down_write(&mm->mmap_sem);
			reset_mm_hiwater_rss(mm);
			up_write(&mm->mmap_sem);
			goto out_mm;
		}

		down_read(&mm->mmap_sem);
		if (type == CLEAR_REFS_SOFT_DIRTY) {
			for (vma = mm->mmap; vma; vma = vma->vm_next) {
				if (!(vma->vm_flags & VM_SOFTDIRTY))
					continue;
				up_read(&mm->mmap_sem);
				down_write(&mm->mmap_sem);
				for (vma = mm->mmap; vma; vma = vma->vm_next) {
					vma->vm_flags &= ~VM_SOFTDIRTY;
					vma_set_page_prot(vma);
				}
				downgrade_write(&mm->mmap_sem);
				break;
			}
			mmu_notifier_invalidate_range_start(mm, 0, -1);
		}
		walk_page_range(0, ~0UL, &clear_refs_walk);
		if (type == CLEAR_REFS_SOFT_DIRTY)
			mmu_notifier_invalidate_range_end(mm, 0, -1);
		flush_tlb_mm(mm);
		up_read(&mm->mmap_sem);
out_mm:
		mmput(mm);
	}
	put_task_struct(task);

	return count;
}

const struct file_operations proc_clear_refs_operations = {
	.write		= clear_refs_write,
	.llseek		= noop_llseek,
};

typedef struct {
	u64 pme;
} pagemap_entry_t;

struct pagemapread {
	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
	pagemap_entry_t *buffer;
	bool show_pfn;
};

#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
#define PAGEMAP_WALK_MASK	(PMD_MASK)

#define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
#define PM_PFRAME_BITS		55
#define PM_PFRAME_MASK		GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
#define PM_SOFT_DIRTY		BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
#define PM_FILE			BIT_ULL(61)
#define PM_SWAP			BIT_ULL(62)
#define PM_PRESENT		BIT_ULL(63)

#define PM_END_OF_BUFFER    1

static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
{
	return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}

static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
			  struct pagemapread *pm)
{
	pm->buffer[pm->pos++] = *pme;
	if (pm->pos >= pm->len)
		return PM_END_OF_BUFFER;
	return 0;
}

static int pagemap_pte_hole(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	unsigned long addr = start;
	int err = 0;

	while (addr < end) {
		struct vm_area_struct *vma = find_vma(walk->mm, addr);
		pagemap_entry_t pme = make_pme(0, 0);
		/* End of address space hole, which we mark as non-present. */
		unsigned long hole_end;

		if (vma)
			hole_end = min(end, vma->vm_start);
		else
			hole_end = end;

		for (; addr < hole_end; addr += PAGE_SIZE) {
			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				goto out;
		}

		if (!vma)
			break;

		/* Addresses in the VMA. */
		if (vma->vm_flags & VM_SOFTDIRTY)
			pme = make_pme(0, PM_SOFT_DIRTY);
		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				goto out;
		}
	}
out:
	return err;
}

static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	u64 frame = 0, flags = 0;
	struct page *page = NULL;

	if (pte_present(pte)) {
		if (pm->show_pfn)
			frame = pte_pfn(pte);
		flags |= PM_PRESENT;
		page = vm_normal_page(vma, addr, pte);
		if (pte_soft_dirty(pte))
			flags |= PM_SOFT_DIRTY;
	} else if (is_swap_pte(pte)) {
		swp_entry_t entry;
		if (pte_swp_soft_dirty(pte))
			flags |= PM_SOFT_DIRTY;
		entry = pte_to_swp_entry(pte);
		frame = swp_type(entry) |
			(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
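		/*
		 * Illustrative encoding: with MAX_SWAPFILES_SHIFT == 5,
		 * swap type 1 at offset 0x20 yields (0x20 << 5) | 1 == 0x401.
		 */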
		flags |= PM_SWAP;
		if (is_migration_entry(entry))
			page = migration_entry_to_page(entry);
	}

	if (page && !PageAnon(page))
		flags |= PM_FILE;
	if (page && page_mapcount(page) == 1)
		flags |= PM_MMAP_EXCLUSIVE;
	if (vma->vm_flags & VM_SOFTDIRTY)
		flags |= PM_SOFT_DIRTY;

	return make_pme(frame, flags);
}

static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct pagemapread *pm = walk->private;
	spinlock_t *ptl;
	pte_t *pte, *orig_pte;
	int err = 0;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
		u64 flags = 0, frame = 0;
		pmd_t pmd = *pmdp;

		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
			flags |= PM_SOFT_DIRTY;

		/*
		 * Currently the pmd for a thp is always present because a thp
		 * cannot be swapped out, migrated, or HWPOISONed (it is split
		 * in such cases instead).  This if-check just prepares for a
		 * future implementation.
		 */
		if (pmd_present(pmd)) {
			struct page *page = pmd_page(pmd);

			if (page_mapcount(page) == 1)
				flags |= PM_MMAP_EXCLUSIVE;

			flags |= PM_PRESENT;
			if (pm->show_pfn)
				frame = pmd_pfn(pmd) +
					((addr & ~PMD_MASK) >> PAGE_SHIFT);
		}

		for (; addr != end; addr += PAGE_SIZE) {
			pagemap_entry_t pme = make_pme(frame, flags);

			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				break;
			if (pm->show_pfn && (flags & PM_PRESENT))
				frame++;
		}
		spin_unlock(ptl);
		return err;
	}

	if (pmd_trans_unstable(pmdp))
		return 0;
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

	/*
	 * We can assume that @vma always points to a valid VMA and @end never
	 * goes beyond vma->vm_end.
	 */
	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		pagemap_entry_t pme;

		pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			break;
	}
	pte_unmap_unlock(orig_pte, ptl);

	cond_resched();

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
/* This function walks within one hugetlb entry in a single call */
static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	struct vm_area_struct *vma = walk->vma;
	u64 flags = 0, frame = 0;
	int err = 0;
	pte_t pte;

	if (vma->vm_flags & VM_SOFTDIRTY)
		flags |= PM_SOFT_DIRTY;

	pte = huge_ptep_get(ptep);
	if (pte_present(pte)) {
		struct page *page = pte_page(pte);

		if (!PageAnon(page))
			flags |= PM_FILE;

		if (page_mapcount(page) == 1)
			flags |= PM_MMAP_EXCLUSIVE;

		flags |= PM_PRESENT;
		if (pm->show_pfn)
			frame = pte_pfn(pte) +
				((addr & ~hmask) >> PAGE_SHIFT);
	}

	for (; addr != end; addr += PAGE_SIZE) {
		pagemap_entry_t pme = make_pme(frame, flags);

		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			return err;
		if (pm->show_pfn && (flags & PM_PRESENT))
			frame++;
	}

	cond_resched();

	return err;
}
#endif /* HUGETLB_PAGE */

/*
 * /proc/pid/pagemap - an array mapping virtual pages to pfns
 *
 * For each page in the address space, this file contains one 64-bit entry
 * consisting of the following:
 *
 * Bits 0-54  page frame number (PFN) if present
 * Bits 0-4   swap type if swapped
 * Bits 5-54  swap offset if swapped
 * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
 * Bit  56    page exclusively mapped
 * Bits 57-60 zero
 * Bit  61    page is file-page or shared-anon
 * Bit  62    page swapped
 * Bit  63    page present
 *
 * If the page is not present but in swap, then the PFN contains an
 * encoding of the swap file number and the page's offset into the
 * swap. Unmapped pages return a null PFN. This allows determining
 * precisely which pages are mapped (or in swap) and comparing mapped
 * pages between processes.
 *
 * Efficient users of this interface will use /proc/pid/maps to
 * determine which areas of memory are actually mapped and llseek to
 * skip over unmapped regions.
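 *
 * Example entry (illustrative): a present, file-backed, exclusively
 * mapped page at pfn 0x1234 reads back as 0xa100000000001234
 * (bit 63 present, bit 61 file-page, bit 56 exclusively mapped).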
 */
static ssize_t pagemap_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct mm_struct *mm = file->private_data;
	struct pagemapread pm;
	struct mm_walk pagemap_walk = {};
	unsigned long src;
	unsigned long svpfn;
	unsigned long start_vaddr;
	unsigned long end_vaddr;
	int ret = 0, copied = 0;

	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
		goto out;

	ret = -EINVAL;
	/* file position must be aligned */
	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
		goto out_mm;

	ret = 0;
	if (!count)
		goto out_mm;

	/* do not disclose physical addresses: attack vector */
	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);

	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
	pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
	ret = -ENOMEM;
	if (!pm.buffer)
		goto out_mm;

	pagemap_walk.pmd_entry = pagemap_pmd_range;
	pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
#endif
	pagemap_walk.mm = mm;
	pagemap_walk.private = &pm;

	src = *ppos;
	svpfn = src / PM_ENTRY_BYTES;
	start_vaddr = svpfn << PAGE_SHIFT;
	end_vaddr = mm->task_size;

	/* watch out for wraparound */
	if (svpfn > mm->task_size >> PAGE_SHIFT)
		start_vaddr = end_vaddr;

	/*
	 * The odds are that this will stop walking way
	 * before end_vaddr, because the length of the
	 * user buffer is tracked in "pm", and the walk
	 * will stop when we hit the end of the buffer.
	 */
	ret = 0;
	while (count && (start_vaddr < end_vaddr)) {
		int len;
		unsigned long end;

		pm.pos = 0;
		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
		/* overflow ? */
		if (end < start_vaddr || end > end_vaddr)
			end = end_vaddr;
		down_read(&mm->mmap_sem);
		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
		up_read(&mm->mmap_sem);
		start_vaddr = end;

		len = min(count, PM_ENTRY_BYTES * pm.pos);
		if (copy_to_user(buf, pm.buffer, len)) {
			ret = -EFAULT;
			goto out_free;
		}
		copied += len;
		buf += len;
		count -= len;
	}
	*ppos += copied;
	if (!ret || ret == PM_END_OF_BUFFER)
		ret = copied;

out_free:
	kfree(pm.buffer);
out_mm:
	mmput(mm);
out:
	return ret;
}

static int pagemap_open(struct inode *inode, struct file *file)
{
	struct mm_struct *mm;

	mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR(mm))
		return PTR_ERR(mm);
	file->private_data = mm;
	return 0;
}

static int pagemap_release(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = file->private_data;

	if (mm)
		mmdrop(mm);
	return 0;
}

const struct file_operations proc_pagemap_operations = {
	.llseek		= mem_lseek, /* borrow this */
	.read		= pagemap_read,
	.open		= pagemap_open,
	.release	= pagemap_release,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */

#ifdef CONFIG_NUMA

struct numa_maps {
	unsigned long pages;
	unsigned long anon;
	unsigned long active;
	unsigned long writeback;
	unsigned long mapcount_max;
	unsigned long dirty;
	unsigned long swapcache;
	unsigned long node[MAX_NUMNODES];
};

struct numa_maps_private {
	struct proc_maps_private proc_maps;
	struct numa_maps md;
};

static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
			unsigned long nr_pages)
{
	int count = page_mapcount(page);

	md->pages += nr_pages;
	if (pte_dirty || PageDirty(page))
		md->dirty += nr_pages;

	if (PageSwapCache(page))
		md->swapcache += nr_pages;

	if (PageActive(page) || PageUnevictable(page))
		md->active += nr_pages;

	if (PageWriteback(page))
		md->writeback += nr_pages;

	if (PageAnon(page))
		md->anon += nr_pages;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->node[page_to_nid(page)] += nr_pages;
}

static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
		unsigned long addr)
{
	struct page *page;
	int nid;

	if (!pte_present(pte))
		return NULL;

	page = vm_normal_page(vma, addr, pte);
	if (!page)
		return NULL;

	if (PageReserved(page))
		return NULL;

	nid = page_to_nid(page);
	if (!node_isset(nid, node_states[N_MEMORY]))
		return NULL;

	return page;
}

static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
		unsigned long end, struct mm_walk *walk)
{
	struct numa_maps *md = walk->private;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte;
	pte_t *pte;

	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		pte_t huge_pte = *(pte_t *)pmd;
		struct page *page;

		page = can_gather_numa_stats(huge_pte, vma, addr);
		if (page)
			gather_stats(page, md, pte_dirty(huge_pte),
				     HPAGE_PMD_SIZE/PAGE_SIZE);
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	do {
		struct page *page = can_gather_numa_stats(*pte, vma, addr);
		if (!page)
			continue;
		gather_stats(page, md, pte_dirty(*pte), 1);

	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
		unsigned long addr, unsigned long end, struct mm_walk *walk)
{
	struct numa_maps *md;
	struct page *page;

	if (!pte_present(*pte))
		return 0;

	page = pte_page(*pte);
	if (!page)
		return 0;

	md = walk->private;
	gather_stats(page, md, pte_dirty(*pte), 1);
	return 0;
}

#else
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
		unsigned long addr, unsigned long end, struct mm_walk *walk)
{
	return 0;
}
#endif

/*
 * Display pages allocated per node and memory policy via /proc.
 */
1553
static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1554
{
1555 1556
	struct numa_maps_private *numa_priv = m->private;
	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
1557
	struct vm_area_struct *vma = v;
1558
	struct numa_maps *md = &numa_priv->md;
1559 1560
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
1561 1562 1563 1564 1565 1566
	struct mm_walk walk = {
		.hugetlb_entry = gather_hugetlb_stats,
		.pmd_entry = gather_pte_stats,
		.private = md,
		.mm = mm,
	};
1567
	struct mempolicy *pol;
1568 1569
	char buffer[64];
	int nid;
1570 1571 1572 1573

	if (!mm)
		return 0;

1574 1575
	/* Ensure we start with an empty set of numa_maps statistics. */
	memset(md, 0, sizeof(*md));
1576

1577 1578 1579 1580 1581 1582 1583
	pol = __get_vma_policy(vma, vma->vm_start);
	if (pol) {
		mpol_to_str(buffer, sizeof(buffer), pol);
		mpol_cond_put(pol);
	} else {
		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
	}
1584 1585 1586 1587

	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

	if (file) {
1588
		seq_puts(m, " file=");
M
1590
	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1591
		seq_puts(m, " heap");
1592
	} else {
1593
		pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
1594 1595 1596 1597 1598 1599 1600
		if (tid != 0) {
			/*
			 * Thread stack in /proc/PID/task/TID/maps or
			 * the main process stack.
			 */
			if (!is_pid || (vma->vm_start <= mm->start_stack &&
			    vma->vm_end >= mm->start_stack))
1601
				seq_puts(m, " stack");
1602 1603 1604
			else
				seq_printf(m, " stack:%d", tid);
		}
1605 1606
	}

1607
	if (is_vm_hugetlb_page(vma))
1608
		seq_puts(m, " huge");
1609

1610 1611
	/* mmap_sem is held by m_start */
	walk_page_vma(vma, &walk);
1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636

	if (!md->pages)
		goto out;

	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);

	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);

	if (md->pages != md->anon && md->pages != md->dirty)
		seq_printf(m, " mapped=%lu", md->pages);

	if (md->mapcount_max > 1)
		seq_printf(m, " mapmax=%lu", md->mapcount_max);

	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);

	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);

	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);

	for_each_node_state(nid, N_MEMORY)
		if (md->node[nid])
			seq_printf(m, " N%d=%lu", nid, md->node[nid]);

	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
	seq_putc(m, '\n');
	m_cache_vma(m, vma);
	return 0;
}

static int show_pid_numa_map(struct seq_file *m, void *v)
{
	return show_numa_map(m, v, 1);
}

static int show_tid_numa_map(struct seq_file *m, void *v)
{
	return show_numa_map(m, v, 0);
}

static const struct seq_operations proc_pid_numa_maps_op = {
	.start  = m_start,
	.next   = m_next,
	.stop   = m_stop,
	.show   = show_pid_numa_map,
};

static const struct seq_operations proc_tid_numa_maps_op = {
	.start  = m_start,
	.next   = m_next,
	.stop   = m_stop,
	.show   = show_tid_numa_map,
};

static int numa_maps_open(struct inode *inode, struct file *file,
			  const struct seq_operations *ops)
{
	return proc_maps_open(inode, file, ops,
				sizeof(struct numa_maps_private));
}

static int pid_numa_maps_open(struct inode *inode, struct file *file)
{
	return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
}

static int tid_numa_maps_open(struct inode *inode, struct file *file)
{
	return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
}

const struct file_operations proc_pid_numa_maps_operations = {
	.open		= pid_numa_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

const struct file_operations proc_tid_numa_maps_operations = {
	.open		= tid_numa_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};
#endif /* CONFIG_NUMA */