gfp.h 25.8 KB
Newer Older
1
/* SPDX-License-Identifier: GPL-2.0 */
L
Linus Torvalds 已提交
2 3 4
#ifndef __LINUX_GFP_H
#define __LINUX_GFP_H

5
#include <linux/mmdebug.h>
L
Linus Torvalds 已提交
6 7 8
#include <linux/mmzone.h>
#include <linux/stddef.h>
#include <linux/linkage.h>
9
#include <linux/topology.h>
L
Linus Torvalds 已提交
10

11 12 13 14 15 16 17 18 19 20 21 22 23 24
/* The typedef is in types.h but we want the documentation here */
#if 0
/**
 * typedef gfp_t - Memory allocation flags.
 *
 * GFP flags are commonly used throughout Linux to indicate how memory
 * should be allocated.  The GFP acronym stands for get_free_pages(),
 * the underlying memory allocation function.  Not every GFP flag is
 * supported by every function which may allocate memory.  Most users
 * will want to use a plain ``GFP_KERNEL``.
 */
typedef unsigned int __bitwise gfp_t;
#endif

L
Linus Torvalds 已提交
25 26
struct vm_area_struct;

27 28
/*
 * In case of changes, please don't forget to update
29
 * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
30 31
 */

32 33 34 35 36
/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA		0x01u
#define ___GFP_HIGHMEM		0x02u
#define ___GFP_DMA32		0x04u
#define ___GFP_MOVABLE		0x08u
37
#define ___GFP_RECLAIMABLE	0x10u
38 39 40
#define ___GFP_HIGH		0x20u
#define ___GFP_IO		0x40u
#define ___GFP_FS		0x80u
A
Alexey Dobriyan 已提交
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
#define ___GFP_ZERO		0x100u
#define ___GFP_ATOMIC		0x200u
#define ___GFP_DIRECT_RECLAIM	0x400u
#define ___GFP_KSWAPD_RECLAIM	0x800u
#define ___GFP_WRITE		0x1000u
#define ___GFP_NOWARN		0x2000u
#define ___GFP_RETRY_MAYFAIL	0x4000u
#define ___GFP_NOFAIL		0x8000u
#define ___GFP_NORETRY		0x10000u
#define ___GFP_MEMALLOC		0x20000u
#define ___GFP_COMP		0x40000u
#define ___GFP_NOMEMALLOC	0x80000u
#define ___GFP_HARDWALL		0x100000u
#define ___GFP_THISNODE		0x200000u
#define ___GFP_ACCOUNT		0x400000u
56
#define ___GFP_ZEROTAGS		0x800000u
57
#define ___GFP_SKIP_KASAN_POISON	0x1000000u
58
#ifdef CONFIG_LOCKDEP
59
#define ___GFP_NOLOCKDEP	0x2000000u
60 61 62
#else
#define ___GFP_NOLOCKDEP	0
#endif
63
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
64

L
Linus Torvalds 已提交
65
/*
66
 * Physical address zone modifiers (see linux/mmzone.h - low four bits)
67 68
 *
 * Do not put any conditional on these. If necessary modify the definitions
69
 * without the underscores and use them consistently. The definitions here may
70
 * be used in bit comparisons.
L
Linus Torvalds 已提交
71
 */
72 73 74
#define __GFP_DMA	((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM	((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32	((__force gfp_t)___GFP_DMA32)
75
#define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
76
#define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
77

78 79 80
/**
 * DOC: Page mobility and placement hints
 *
81
 * Page mobility and placement hints
82
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
L
Linus Torvalds 已提交
83
 *
84 85 86
 * These flags provide hints about how mobile the page is. Pages with similar
 * mobility are placed within the same pageblocks to minimise problems due
 * to external fragmentation.
L
Linus Torvalds 已提交
87
 *
88 89
 * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be
 * moved by page migration during memory compaction or can be reclaimed.
L
Linus Torvalds 已提交
90
 *
91 92
 * %__GFP_RECLAIMABLE is used for slab allocations that specify
 * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
93
 *
94 95 96
 * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible,
 * these pages will be spread between local zones to avoid all the dirty
 * pages being in one zone (fair zone allocation policy).
97
 *
98
 * %__GFP_HARDWALL enforces the cpuset memory allocation policy.
99
 *
K
Kyle Spiers 已提交
100
 * %__GFP_THISNODE forces the allocation to be satisfied from the requested
101
 * node with no fallbacks or placement policy enforcements.
102
 *
103
 * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
L
Linus Torvalds 已提交
104
 */
105 106 107 108
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
#define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
#define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
109
#define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
A
Andrea Arcangeli 已提交
110

111 112 113
/**
 * DOC: Watermark modifiers
 *
114
 * Watermark modifiers -- controls access to emergency reserves
115
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
116
 *
117 118 119
 * %__GFP_HIGH indicates that the caller is high-priority and that granting
 * the request is necessary before the system can make forward progress.
 * For example, creating an IO context to clean pages.
120
 *
121 122 123
 * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
 * high priority. Users are typically interrupt handlers. This may be
 * used in conjunction with %__GFP_HIGH
124
 *
125 126 127 128
 * %__GFP_MEMALLOC allows access to all memory. This should only be used when
 * the caller guarantees the allocation will allow more memory to be freed
 * very shortly e.g. process exiting or swapping. Users either should
 * be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
M
Michal Hocko 已提交
129 130 131 132 133
 * Users of this flag have to be extremely careful to not deplete the reserve
 * completely and implement a throttling mechanism which controls the
 * consumption of the reserve based on the amount of freed memory.
 * Usage of a pre-allocated pool (e.g. mempool) should be always considered
 * before using this flag.
134
 *
135 136
 * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
 * This takes precedence over the %__GFP_MEMALLOC flag if both are set.
137
 */
138 139 140 141 142
#define __GFP_ATOMIC	((__force gfp_t)___GFP_ATOMIC)
#define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)
#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)

143 144 145
/**
 * DOC: Reclaim modifiers
 *
146
 * Reclaim modifiers
147
 * ~~~~~~~~~~~~~~~~~
148 149
 * Please note that all the following flags are only applicable to sleepable
 * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
150
 *
151
 * %__GFP_IO can start physical IO.
152
 *
153 154 155
 * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the
 * allocator recursing into the filesystem which might already be holding
 * locks.
156
 *
157 158 159
 * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
 * This flag can be cleared to avoid unnecessary delays when a fallback
 * option is available.
160
 *
161 162 163 164 165 166
 * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
 * the low watermark is reached and have it reclaim pages until the high
 * watermark is reached. A caller may wish to clear this flag when fallback
 * options are available and the reclaim is likely to disrupt the system. The
 * canonical example is THP allocation where a fallback is cheap but
 * reclaim/compaction may cause indirect stalls.
167
 *
168
 * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
169
 *
170
 * The default allocator behavior depends on the request size. We have a concept
171
 * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
172 173 174 175 176 177 178
 * !costly allocations are too essential to fail so they are implicitly
 * non-failing by default (with some exceptions like OOM victims might fail so
 * the caller still has to check for failures) while costly requests try to be
 * not disruptive and back off even without invoking the OOM killer.
 * The following three modifiers might be used to override some of these
 * implicit rules
 *
179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
 * %__GFP_NORETRY: The VM implementation will try only very lightweight
 * memory direct reclaim to get some memory under memory pressure (thus
 * it can sleep). It will avoid disruptive actions like OOM killer. The
 * caller must handle the failure which is quite likely to happen under
 * heavy memory pressure. The flag is suitable when failure can easily be
 * handled at small cost, such as reduced throughput
 *
 * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
 * procedures that have previously failed if there is some indication
 * that progress has been made elsewhere.  It can wait for other
 * tasks to attempt high level approaches to freeing memory such as
 * compaction (which removes fragmentation) and page-out.
 * There is still a definite limit to the number of retries, but it is
 * a larger limit than with %__GFP_NORETRY.
 * Allocations with this flag may fail, but only when there is
 * genuinely little unused memory. While these allocations do not
 * directly trigger the OOM killer, their failure indicates that
 * the system is likely to need to use the OOM killer soon.  The
 * caller must handle failure, but can reasonably do so by failing
 * a higher-level request, or completing it only in a much less
 * efficient manner.
 * If the allocation does fail, and the caller is in a position to
 * free some non-essential memory, doing so could benefit the system
 * as a whole.
 *
 * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
 * cannot handle allocation failures. The allocation could block
 * indefinitely but will never return with failure. Testing for
 * failure is pointless.
 * New users should be evaluated carefully (and the flag should be
 * used only when there is no reasonable failure policy) but it is
 * definitely preferable to use the flag rather than opencode endless
 * loop around allocator.
 * Using this flag for costly allocations is _highly_ discouraged.
213 214 215
 */
#define __GFP_IO	((__force gfp_t)___GFP_IO)
#define __GFP_FS	((__force gfp_t)___GFP_FS)
216 217
#define __GFP_DIRECT_RECLAIM	((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
#define __GFP_KSWAPD_RECLAIM	((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
218
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
219
#define __GFP_RETRY_MAYFAIL	((__force gfp_t)___GFP_RETRY_MAYFAIL)
220 221
#define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)
#define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY)
222

223 224 225
/**
 * DOC: Action modifiers
 *
226
 * Action modifiers
227
 * ~~~~~~~~~~~~~~~~
228
 *
229
 * %__GFP_NOWARN suppresses allocation failure reports.
230
 *
231
 * %__GFP_COMP addresses compound page metadata.
232
 *
233
 * %__GFP_ZERO returns a zeroed page on success.
234 235 236
 *
 * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if
 * __GFP_ZERO is set.
237 238 239 240
 *
 * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned
 * on deallocation. Typically used for userspace pages. Currently only has an
 * effect in HW tags mode.
V
Vegard Nossum 已提交
241
 */
242 243 244
#define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
#define __GFP_COMP	((__force gfp_t)___GFP_COMP)
#define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
245
#define __GFP_ZEROTAGS	((__force gfp_t)___GFP_ZEROTAGS)
246
#define __GFP_SKIP_KASAN_POISON	((__force gfp_t)___GFP_SKIP_KASAN_POISON)
V
Vegard Nossum 已提交
247

248 249 250
/* Disable lockdep for GFP context tracking */
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)

251
/* Room for N __GFP_FOO bits */
252
#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
A
Al Viro 已提交
253
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
L
Linus Torvalds 已提交
254

255 256 257 258 259 260
/**
 * DOC: Useful GFP flag combinations
 *
 * Useful GFP flag combinations
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
261 262
 * Useful GFP flag combinations that are commonly used. It is recommended
 * that subsystems start with one of these combinations and then set/clear
263 264 265
 * %__GFP_FOO flags as necessary.
 *
 * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
266 267 268
 * watermark is applied to allow access to "atomic reserves".
 * The current implementation doesn't support NMI and few other strict
 * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
 *
 * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
 * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
 *
 * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
 * accounted to kmemcg.
 *
 * %GFP_NOWAIT is for kernel allocations that should not stall for direct
 * reclaim, start physical IO or use any filesystem callback.
 *
 * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
 * that do not require the starting of any physical IO.
 * Please try to avoid using this flag directly and instead use
 * memalloc_noio_{save,restore} to mark the whole scope which cannot
 * perform any IO with a short explanation why. All allocation requests
 * will inherit GFP_NOIO implicitly.
 *
 * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
 * Please try to avoid using this flag directly and instead use
 * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
 * recurse into the FS layer with a short explanation why. All allocation
 * requests will inherit GFP_NOFS implicitly.
 *
 * %GFP_USER is for userspace allocations that also need to be directly
 * accessible by the kernel or hardware. It is typically used by hardware
 * for buffers that are mapped to userspace (e.g. graphics) that hardware
 * still must DMA to. cpuset limits are enforced for these allocations.
 *
 * %GFP_DMA exists for historical reasons and should be avoided where possible.
 * The flag indicates that the caller requires that the lowest zone be
 * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
 * it would require careful auditing as some users really require it and
 * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the
 * lowest zone as a type of emergency reserve.
 *
 * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
 * address.
 *
 * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
 * do not need to be directly accessible by the kernel but that cannot
 * move once in use. An example may be a hardware allocation that maps
 * data directly into userspace but has no addressing limitations.
 *
 * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
 * need direct access to but can use kmap() when access is required. They
 * are expected to be movable via page reclaim or page migration. Typically,
 * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE.
 *
 * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They
 * are compound allocations that will generally fail quickly if memory is not
 * available and will not wake kswapd/kcompactd on failure. The _LIGHT
 * version does not attempt reclaim/compaction at all and is by default used
 * in page fault path, while the non-light is used by khugepaged.
322 323
 */
#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
324
#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
325
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
326
#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
327 328 329
#define GFP_NOIO	(__GFP_RECLAIM)
#define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)
#define GFP_USER	(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
330 331
#define GFP_DMA		__GFP_DMA
#define GFP_DMA32	__GFP_DMA32
332
#define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
333 334
#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE | \
			 __GFP_SKIP_KASAN_POISON)
335 336 337
#define GFP_TRANSHUGE_LIGHT	((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
#define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
L
Linus Torvalds 已提交
338

339
/* Convert GFP flags to their corresponding migrate type */
340
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
341
#define GFP_MOVABLE_SHIFT 3
C
Christoph Lameter 已提交
342

343
static inline int gfp_migratetype(const gfp_t gfp_flags)
344
{
345 346 347
	VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
	BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
	BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
348 349 350 351 352

	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* Group based on mobility */
353
	return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
354
}
355 356
#undef GFP_MOVABLE_MASK
#undef GFP_MOVABLE_SHIFT
A
Andi Kleen 已提交
357

358 359
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
360
	return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
361 362
}

363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385
/**
 * gfpflags_normal_context - is gfp_flags a normal sleepable context?
 * @gfp_flags: gfp_flags to test
 *
 * Check whether @gfp_flags describes an allocation that is made from the
 * %current context and that is permitted to sleep.
 *
 * Being allowed to block is not the same as owning the %current context.
 * An allocation issued from the direct reclaim path is nested inside
 * whatever %current was doing when the original allocation was made.  Such
 * a nested allocation may still block, but touching state owned by %current
 * can break the assumptions of the outer context.
 *
 * Return: %true when the allocation context can sleep and may use anything
 * associated with %current, i.e. %__GFP_DIRECT_RECLAIM is set and
 * %__GFP_MEMALLOC is clear.
 */
static inline bool gfpflags_normal_context(const gfp_t gfp_flags)
{
	const gfp_t relevant = __GFP_DIRECT_RECLAIM | __GFP_MEMALLOC;

	return (gfp_flags & relevant) == __GFP_DIRECT_RECLAIM;
}

386 387 388 389 390 391
/*
 * OPT_ZONE_HIGHMEM is ZONE_HIGHMEM only when CONFIG_HIGHMEM is enabled;
 * otherwise it resolves to ZONE_NORMAL. It is the value stored in the
 * __GFP_HIGHMEM entry of GFP_ZONE_TABLE.
 */
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
#endif

392
#ifdef CONFIG_ZONE_DMA
393 394 395
#define OPT_ZONE_DMA ZONE_DMA
#else
#define OPT_ZONE_DMA ZONE_NORMAL
396
#endif
397

398
#ifdef CONFIG_ZONE_DMA32
399 400 401
#define OPT_ZONE_DMA32 ZONE_DMA32
#else
#define OPT_ZONE_DMA32 ZONE_NORMAL
402
#endif
403 404 405

/*
 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
H
Hao Lee 已提交
406 407
 * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
 * bits long and there are 16 of them to cover all possible combinations of
408
 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
409 410 411 412
 *
 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
 * But GFP_MOVABLE is not only a zone specifier but also an allocation
 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
413
 * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
414 415 416 417 418 419 420
 *
 *       bit       result
 *       =================
 *       0x0    => NORMAL
 *       0x1    => DMA or NORMAL
 *       0x2    => HIGHMEM or NORMAL
 *       0x3    => BAD (DMA+HIGHMEM)
421
 *       0x4    => DMA32 or NORMAL
422 423 424 425 426 427 428
 *       0x5    => BAD (DMA+DMA32)
 *       0x6    => BAD (HIGHMEM+DMA32)
 *       0x7    => BAD (HIGHMEM+DMA32+DMA)
 *       0x8    => NORMAL (MOVABLE+0)
 *       0x9    => DMA or NORMAL (MOVABLE+DMA)
 *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
 *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
429
 *       0xc    => DMA32 or NORMAL (MOVABLE+DMA32)
430 431 432 433
 *       0xd    => BAD (MOVABLE+DMA32+DMA)
 *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
 *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
 *
434
 * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
435 436
 */

437 438 439 440 441 442 443 444 445
/*
 * GFP_ZONES_SHIFT is the width, in bits, of one GFP_ZONE_TABLE entry.
 * It normally equals ZONES_SHIFT, but is pinned to 2 when ZONE_DEVICE is
 * configured and MAX_NR_ZONES - 1 is at most 4.
 */
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif

#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
446 447 448
#endif

#define GFP_ZONE_TABLE ( \
449 450 451 452 453 454 455 456
	(ZONE_NORMAL << 0 * GFP_ZONES_SHIFT)				       \
	| (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT)		       \
	| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT)	       \
	| (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT)		       \
	| (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT)		       \
	| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)    \
	| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
	| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
457 458 459
)

/*
460
 * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32
461 462 463 464 465
 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
 * entry starting with bit 0. Bit is set if the combination is not
 * allowed.
 */
#define GFP_ZONE_BAD ( \
466 467 468 469 470 471 472 473
	1 << (___GFP_DMA | ___GFP_HIGHMEM)				      \
	| 1 << (___GFP_DMA | ___GFP_DMA32)				      \
	| 1 << (___GFP_DMA32 | ___GFP_HIGHMEM)				      \
	| 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM)		      \
	| 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA)		      \
	| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA)		      \
	| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM)		      \
	| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM)  \
474 475 476 477 478
)

/*
 * Look up the zone selected by the zone-modifier bits of @flags.
 *
 * The masked bits index into GFP_ZONE_TABLE, whose entries are
 * GFP_ZONES_SHIFT bits wide. Combinations marked in GFP_ZONE_BAD trip a
 * VM_BUG_ON().
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
	const int bit = (__force int) (flags & GFP_ZONEMASK);
	const int entry_mask = (1 << GFP_ZONES_SHIFT) - 1;
	enum zone_type z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & entry_mask;

	VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
	return z;
}

L
Linus Torvalds 已提交
487 488 489 490 491 492 493
/*
 * There is only one page-allocator function, and two main namespaces to
 * it. The alloc_page*() variants return 'struct page *' and as such
 * can allocate highmem pages, the *get*page*() variants return
 * virtual kernel addresses to the allocated page(s).
 */

494 495
/*
 * Pick the zonelist index for @flags: ZONELIST_NOFALLBACK when CONFIG_NUMA
 * is enabled and __GFP_THISNODE is set, ZONELIST_FALLBACK otherwise.
 */
static inline int gfp_zonelist(gfp_t flags)
{
	int idx = ZONELIST_FALLBACK;

#ifdef CONFIG_NUMA
	if (unlikely(flags & __GFP_THISNODE))
		idx = ZONELIST_NOFALLBACK;
#endif
	return idx;
}

L
Linus Torvalds 已提交
503 504
/*
 * We get the zone list from the current node and the gfp_mask.
505
 * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
506 507
 * There are two zonelists per node, one for all zones with memory and
 * one containing just zones from the node the zonelist belongs to.
L
Linus Torvalds 已提交
508 509 510 511
 *
 * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
 * optimized to &contig_page_data at compile-time.
 */
512 513
/* Return node @nid's zonelist at the index gfp_zonelist() selects for @flags. */
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	return &NODE_DATA(nid)->node_zonelists[gfp_zonelist(flags)];
}
L
Linus Torvalds 已提交
516 517 518 519

#ifndef HAVE_ARCH_FREE_PAGE
/* Fallback no-op, compiled only when HAVE_ARCH_FREE_PAGE is not defined. */
static inline void arch_free_page(struct page *page, int order)
{
}
#endif
N
Nick Piggin 已提交
520 521 522
#ifndef HAVE_ARCH_ALLOC_PAGE
/* Fallback no-op, compiled only when HAVE_ARCH_ALLOC_PAGE is not defined. */
static inline void arch_alloc_page(struct page *page, int order)
{
}
#endif
523 524 525 526 527 528
#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
/*
 * Fallback compiled only when HAVE_ARCH_MAKE_PAGE_ACCESSIBLE is not defined:
 * does nothing with @page and always returns 0.
 */
static inline int arch_make_page_accessible(struct page *page)
{
	const int ret = 0;

	return ret;
}
#endif
L
Linus Torvalds 已提交
529

530 531
struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
		nodemask_t *nodemask);
532

533 534
unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
				nodemask_t *nodemask, int nr_pages,
535 536
				struct list_head *page_list,
				struct page **page_array);
537 538 539

/* Bulk allocate order-0 pages */
static inline unsigned long
540
alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
541
{
542 543 544 545 546 547 548
	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL);
}

/*
 * Bulk allocate pages into @page_array.
 *
 * Forwards to __alloc_pages_bulk() with the node from numa_mem_id() as the
 * preferred node, no nodemask and no page list, and returns its result.
 */
static inline unsigned long alloc_pages_bulk_array(gfp_t gfp,
						   unsigned long nr_pages,
						   struct page **page_array)
{
	const int preferred_nid = numa_mem_id();

	return __alloc_pages_bulk(gfp, preferred_nid, NULL, nr_pages, NULL, page_array);
}

551 552 553 554 555 556
/*
 * Allocate 2^order pages with node @nid as the preferred node. @nid has to
 * be a valid, online node; use alloc_pages_node() for the more general
 * interface that also accepts NUMA_NO_NODE.
 */
static inline struct page *__alloc_pages_node(int nid, gfp_t gfp_mask,
					      unsigned int order)
{
	/* The node id must lie inside [0, MAX_NUMNODES). */
	VM_BUG_ON(!(nid >= 0 && nid < MAX_NUMNODES));
	/* __GFP_THISNODE on a node that is not online is worth a warning. */
	VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));

	return __alloc_pages(gfp_mask, order, nid, NULL);
}

564 565
/*
 * Allocate pages with node @nid as the preferred node. Passing NUMA_NO_NODE
 * selects the node returned by numa_mem_id(), i.e. the one closest to the
 * current CPU. Any other value must name a valid, online node.
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
						unsigned int order)
{
	const int target = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

	return __alloc_pages_node(target, gfp_mask, order);
}

L
Linus Torvalds 已提交
578
#ifdef CONFIG_NUMA
579
struct page *alloc_pages(gfp_t gfp, unsigned int order);
580
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
581
			struct vm_area_struct *vma, unsigned long addr,
582 583 584
			int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
L
Linus Torvalds 已提交
585
#else
586 587 588 589
/*
 * !CONFIG_NUMA variant: allocate through alloc_pages_node() using the node
 * reported by numa_node_id().
 */
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	const int nid = numa_node_id();

	return alloc_pages_node(nid, gfp_mask, order);
}
590 591 592
/*
 * !CONFIG_NUMA variant: the vma, addr, node and hugepage arguments are
 * dropped and the request is forwarded to alloc_pages(). The last parameter
 * is named after the matching parameter of the CONFIG_NUMA prototype.
 */
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, hugepage)\
	alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
593
	alloc_pages(gfp_mask, order)
L
Linus Torvalds 已提交
594 595
#endif
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
596
#define alloc_page_vma(gfp_mask, vma, addr)			\
597
	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
L
Linus Torvalds 已提交
598

599 600
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
L
Linus Torvalds 已提交
601

602 603
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
void free_pages_exact(void *virt, size_t size);
604
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
605

L
Linus Torvalds 已提交
606
#define __get_free_page(gfp_mask) \
607
		__get_free_pages((gfp_mask), 0)
L
Linus Torvalds 已提交
608 609

#define __get_dma_pages(gfp_mask, order) \
610
		__get_free_pages((gfp_mask) | GFP_DMA, (order))
L
Linus Torvalds 已提交
611

612 613
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
L
Linus Torvalds 已提交
614

615
struct page_frag_cache;
616
extern void __page_frag_cache_drain(struct page *page, unsigned int count);
617 618 619 620 621 622 623 624 625 626
extern void *page_frag_alloc_align(struct page_frag_cache *nc,
				   unsigned int fragsz, gfp_t gfp_mask,
				   unsigned int align_mask);

/*
 * Convenience wrapper around page_frag_alloc_align() that passes an
 * all-ones align_mask.
 */
static inline void *page_frag_alloc(struct page_frag_cache *nc,
			     unsigned int fragsz, gfp_t gfp_mask)
{
	const unsigned int all_ones_mask = ~0u;

	return page_frag_alloc_align(nc, fragsz, gfp_mask, all_ones_mask);
}

627
extern void page_frag_free(void *addr);
628

L
Linus Torvalds 已提交
629
#define __free_page(page) __free_pages((page), 0)
630
#define free_page(addr) free_pages((addr), 0)
L
Linus Torvalds 已提交
631 632

void page_alloc_init(void);
633
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
634 635
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
L
Linus Torvalds 已提交
636

637 638
void page_alloc_init_late(void);

639 640 641 642 643 644 645
/*
 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
 * GFP flags are used before interrupts are enabled. Once interrupts are
 * enabled, it is set to __GFP_BITS_MASK while the system is running. During
 * hibernation, it is used by PM to avoid I/O during memory allocation while
 * devices are suspended.
 */
646 647
extern gfp_t gfp_allowed_mask;

648 649 650
/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);

651 652
extern void pm_restrict_gfp_mask(void);
extern void pm_restore_gfp_mask(void);
653

654 655
extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);

656 657 658 659 660 661 662 663 664
#ifndef CONFIG_PM_SLEEP
/* Stub used without CONFIG_PM_SLEEP: always reports false. */
static inline bool pm_suspended_storage(void)
{
	return false;
}
#else
extern bool pm_suspended_storage(void);
#endif /* !CONFIG_PM_SLEEP */

665
#ifdef CONFIG_CONTIG_ALLOC
666
/* The below functions must be run on a range from a single zone. */
667
extern int alloc_contig_range(unsigned long start, unsigned long end,
668
			      unsigned migratetype, gfp_t gfp_mask);
669 670
extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
				       int nid, nodemask_t *nodemask);
671
#endif
672
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
673

674
#ifdef CONFIG_CMA
675 676
/* CMA stuff */
extern void init_cma_reserved_pageblock(struct page *page);
677 678
#endif

L
Linus Torvalds 已提交
679
#endif /* __LINUX_GFP_H */