/*
 *	linux/mm/mincore.c
 *
 * Copyright (C) 1994-2006  Linus Torvalds
 */

/*
 * The mincore() system call.
 */
#include <linux/pagemap.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

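/*
 * Fill the mincore() vector for a hugetlb VMA.  Residency is sampled
 * once per huge page from its pte and then replicated into one vector
 * byte for every PAGE_SIZE piece of the huge page, so user space sees
 * the usual small-page granularity.
 */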
static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
				unsigned long addr, unsigned long end,
				unsigned char *vec)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct hstate *h;

	h = hstate_vma(vma);
	while (1) {
		unsigned char present;
		pte_t *ptep;
		/*
		 * Huge pages are always in RAM for now, but
		 * theoretically it needs to be checked.
		 */
		ptep = huge_pte_offset(current->mm,
				       addr & huge_page_mask(h));
		present = ptep && !huge_pte_none(huge_ptep_get(ptep));
		while (1) {
			*vec = present;
			vec++;
			addr += PAGE_SIZE;
			if (addr == end)
				return;
			/* check hugepage border */
			if (!(addr & ~huge_page_mask(h)))
				break;
		}
	}
#else
	BUG();
#endif
}

/*
 * Later we can get more picky about what "in core" means precisely.
 * For now, simply check to see if the page is in the page cache,
 * and is up to date; i.e. that no page-in operation would be required
 * at this time if an application were to map and access this page.
 */
static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
{
	unsigned char present = 0;
	struct page *page;

	/*
	 * When tmpfs swaps out a page from a file, any process mapping that
	 * file will not get a swp_entry_t in its pte, but rather it is like
	 * any other file mapping (ie. marked !present and faulted in with
	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
	 *
	 * However when tmpfs moves the page from pagecache and into swapcache,
	 * it is still in core, but the find_get_page below won't find it.
	 * No big deal, but make a note of it.
	 */
	page = find_get_page(mapping, pgoff);
	if (page) {
		present = PageUptodate(page);
		page_cache_release(page);
	}

	return present;
}

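/*
 * Report residency for a range that has no page table entries.  Pages of
 * a file-backed VMA may still be resident in the page cache, so each
 * offset is looked up in the backing mapping; for anonymous memory an
 * absent pte simply means "not resident".
 */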
static void mincore_unmapped_range(struct vm_area_struct *vma,
				unsigned long addr, unsigned long end,
				unsigned char *vec)
{
	unsigned long nr = (end - addr) >> PAGE_SHIFT;
	int i;

	if (vma->vm_file) {
		pgoff_t pgoff;

		pgoff = linear_page_index(vma, addr);
		for (i = 0; i < nr; i++, pgoff++)
			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
	} else {
		for (i = 0; i < nr; i++)
			vec[i] = 0;
	}
}

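/*
 * Walk the ptes under one pmd, filling in one vector byte per page:
 * present ptes are resident, file ptes (nonlinear mappings) are looked
 * up in the file's page cache, swap ptes in the swap cache, and empty
 * ptes fall back to mincore_unmapped_range().
 */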
static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned char *vec)
{
	unsigned long next;
	spinlock_t *ptl;
	pte_t *ptep;

	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		pte_t pte = *ptep;
		pgoff_t pgoff;

		next = addr + PAGE_SIZE;
		if (pte_none(pte))
			mincore_unmapped_range(vma, addr, next, vec);
		else if (pte_present(pte))
			*vec = 1;
		else if (pte_file(pte)) {
			pgoff = pte_to_pgoff(pte);
			*vec = mincore_page(vma->vm_file->f_mapping, pgoff);
		} else { /* pte is a swap entry */
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (is_migration_entry(entry)) {
				/* migration entries are always uptodate */
				*vec = 1;
			} else {
#ifdef CONFIG_SWAP
				pgoff = entry.val;
				*vec = mincore_page(&swapper_space, pgoff);
#else
				WARN_ON(1);
				*vec = 1;
#endif
			}
		}
		vec++;
	} while (ptep++, addr = next, addr != end);
	pte_unmap_unlock(ptep - 1, ptl);
}

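/*
 * The walkers below descend the page table levels (pgd, pud, pmd) for a
 * range within one VMA, sending mapped sub-ranges to mincore_pte_range()
 * and empty ones to mincore_unmapped_range(), advancing the output
 * vector by one byte per page as they go.
 */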
static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned char *vec)
{
	unsigned long next;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			mincore_unmapped_range(vma, addr, next, vec);
		else
			mincore_pte_range(vma, pmd, addr, next, vec);
		vec += (next - addr) >> PAGE_SHIFT;
	} while (pmd++, addr = next, addr != end);
}

static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned char *vec)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			mincore_unmapped_range(vma, addr, next, vec);
		else
			mincore_pmd_range(vma, pud, addr, next, vec);
		vec += (next - addr) >> PAGE_SHIFT;
	} while (pud++, addr = next, addr != end);
}

static void mincore_page_range(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end,
			unsigned char *vec)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			mincore_unmapped_range(vma, addr, next, vec);
		else
			mincore_pud_range(vma, pgd, addr, next, vec);
		vec += (next - addr) >> PAGE_SHIFT;
	} while (pgd++, addr = next, addr != end);
}

/*
 * Do a chunk of "sys_mincore()". We've already checked
 * all the arguments, we hold the mmap semaphore: we should
 * just return the amount of info we're asked for.
 */
static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
{
	struct vm_area_struct *vma;
	unsigned long end;

	vma = find_vma(current->mm, addr);
	if (!vma || addr < vma->vm_start)
		return -ENOMEM;

	end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));

	if (is_vm_hugetlb_page(vma)) {
		mincore_hugetlb_page_range(vma, addr, end, vec);
		return (end - addr) >> PAGE_SHIFT;
	}

	end = pmd_addr_end(addr, end);

	mincore_page_range(vma, addr, end, vec);

	return (end - addr) >> PAGE_SHIFT;
}

/*
 * The mincore(2) system call.
 *
 * mincore() returns the memory residency status of the pages in the
 * current process's address space specified by [addr, addr + len).
 * The status is returned in a vector of bytes.  The least significant
 * bit of each byte is 1 if the referenced page is in memory, otherwise
 * it is zero.
 *
 * Because the status of a page can change after mincore() checks it
 * but before it returns to the application, the returned vector may
 * contain stale information.  Only locked pages are guaranteed to
 * remain in memory.
 *
 * return values:
 *  zero    - success
 *  -EFAULT - vec points to an illegal address
 *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
 *  -ENOMEM - Addresses in the range [addr, addr + len] are
 *		invalid for the address space of this process, or
 *		specify one or more pages which are not currently
 *		mapped
 *  -EAGAIN - A kernel resource was temporarily unavailable.
 */
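/*
 * A minimal user-space sketch of a typical mincore() call, shown here for
 * illustration only (it is not part of this file).  The mapping size, the
 * anonymous mapping and the omitted error handling are assumptions of the
 * example, not requirements of the syscall.
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		long page = sysconf(_SC_PAGESIZE);
 *		size_t len = 16 * page;
 *		unsigned char *vec = malloc(len / page);
 *		unsigned char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *					  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		buf[0] = 1;			// touch only the first page
 *		if (mincore(buf, len, vec) == 0)
 *			printf("page 0: %d  page 1: %d\n",
 *			       vec[0] & 1, vec[1] & 1);
 *		return 0;
 *	}
 *
 * On success the first byte of the vector has its low bit set (that page
 * was just faulted in), while the second page, never touched, is normally
 * reported as not resident.
 */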
SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
		unsigned char __user *, vec)
{
	long retval;
	unsigned long pages;
	unsigned char *tmp;

	/* Check the start address: needs to be page-aligned.. */
	if (start & ~PAGE_CACHE_MASK)
		return -EINVAL;

	/* ..and we need to be passed a valid user-space range */
	if (!access_ok(VERIFY_READ, (void __user *) start, len))
		return -ENOMEM;

	/* This also avoids any overflows on PAGE_CACHE_ALIGN */
	pages = len >> PAGE_SHIFT;
	pages += (len & ~PAGE_MASK) != 0;

	if (!access_ok(VERIFY_WRITE, vec, pages))
		return -EFAULT;

	tmp = (void *) __get_free_page(GFP_USER);
	if (!tmp)
		return -EAGAIN;

	retval = 0;
	while (pages) {
		/*
		 * Do at most PAGE_SIZE entries per iteration, due to
		 * the temporary buffer size.
		 */
		down_read(&current->mm->mmap_sem);
		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
		up_read(&current->mm->mmap_sem);

		if (retval <= 0)
			break;
		if (copy_to_user(vec, tmp, retval)) {
			retval = -EFAULT;
			break;
		}
		pages -= retval;
		vec += retval;
		start += retval << PAGE_SHIFT;
		retval = 0;
	}
	free_page((unsigned long) tmp);
	return retval;
}