/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/notifier.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>

#include <trace/events/kmem.h>

#include "internal.h"

/*
 * Lock order:
 *   1. slab_mutex (Global Mutex)
 *   2. node->list_lock
 *   3. slab_lock(page) (Only on some arches and for debugging)
 *
 *   slab_mutex
 *
 *   The role of the slab_mutex is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *
 *   The slab_lock is only used for debugging and on arches that do not
 *   have the ability to do a cmpxchg_double. It only protects the second
 *   double word in the page struct. Meaning
 *	A. page->freelist	-> List of free objects in a page
 *	B. page->counters	-> Counters of objects
 *	C. page->frozen		-> frozen state
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list. The processor that froze the slab is the one who can
 *   perform list operations on the page. Other processors may put objects
 *   onto the freelist but the processor that froze the slab is the only
 *   one that can retrieve the objects from the page's freelist.
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no new slabs may be added or
 *   removed from the lists nor may the number of partial slabs be modified.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. F.e.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *   Interrupts are disabled during allocation and deallocation in order to
 *   make the slab allocator safe to use in the context of an irq. In addition
 *   interrupts are disabled to ensure that the processor does not change
 *   while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive 		The slab is frozen and exempt from list processing.
 * 			This means that the slab is dedicated to a purpose
 * 			such as satisfying allocations for a specific
 * 			processor. Objects may be freed in the slab while
 * 			it is frozen but slab_free will then skip the usual
 * 			list operations. It is up to the processor holding
 * 			the slab to integrate the slab into the slab lists
 * 			when the slab is no longer needed.
 *
 * 			One use of this flag is to mark slabs that are
 * 			used for allocations. Then such a slab becomes a cpu
 * 			slab. The cpu slab may be equipped with an additional
 * 			freelist that allows lockless access to
 * 			free objects in addition to the regular freelist
 * 			that requires the slab lock.
 *
 * PageError		Slab requires special handling due to debug
 * 			options set. This moves	slab handling out of
 * 			the fast path and disables lockless freelists.
 */

static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
	return 0;
#endif
}

static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	return !kmem_cache_debug(s);
#else
	return false;
#endif
}

/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Debugging flags that require metadata to be stored in the slab.  These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since page.objects is u15 */

/* Internal SLUB flags */
#define __OBJECT_POISON		0x80000000UL /* Poison object */
#define __CMPXCHG_DOUBLE	0x40000000UL /* Use cmpxchg_double */

#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif

/*
 * Tracking user of a slab.
 */
#define TRACK_ADDRS_COUNT 16
struct track {
	unsigned long addr;	/* Called from address */
#ifdef CONFIG_STACKTRACE
	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
#endif
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

enum track_item { TRACK_ALLOC, TRACK_FREE };

#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void memcg_propagate_slab_attrs(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
#endif

static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	/*
	 * The rmw is racy on a preemptible kernel but this is acceptable, so
	 * avoid this_cpu_add()'s irq-disable overhead.
	 */
	raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}

/********************************************************************
 * 			Core slab cache functions
 *******************************************************************/

/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
				struct page *page, const void *object)
{
	void *base;

	if (!object)
		return 1;

	base = page_address(page);
	if (object < base || object >= base + page->objects * s->size ||
		(object - base) % s->size) {
		return 0;
	}

	return 1;
}
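
/*
 * Illustrative example (not part of the original source): with s->size == 64,
 * check_valid_pointer() accepts only base, base + 64, base + 128, ... up to
 * base + (page->objects - 1) * 64; pointers outside the page or not a
 * multiple of s->size from the base are rejected.
 */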

static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	return *(void **)(object + s->offset);
}

static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
	prefetch(object + s->offset);
}

static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
	void *p;

#ifdef CONFIG_DEBUG_PAGEALLOC
	probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
#else
	p = get_freepointer(s, object);
#endif
	return p;
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	*(void **)(object + s->offset) = fp;
}

/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
			__p += (__s)->size)

#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
	for (__p = (__addr), __idx = 1; __idx <= __objects;\
			__p += (__s)->size, __idx++)
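
/*
 * Usage sketch (illustrative): these macros walk the objects of a slab page
 * by address, e.g. roughly the way __free_slab() later in this file checks
 * every object of a debug cache:
 *
 *	void *p;
 *
 *	for_each_object(p, s, page_address(page), page->objects)
 *		check_object(s, page, p, SLUB_RED_INACTIVE);
 */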

/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
{
	return (p - addr) / s->size;
}

static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	/*
	 * Debugging requires use of the padding between object
	 * and whatever may come after it.
	 */
	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
		return s->object_size;

#endif
	/*
	 * If we have the need to store the freelist pointer
	 * back there or track user information then we can
	 * only use the space before that information.
	 */
	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
		return s->inuse;
	/*
	 * Else we can use all the padding etc for the allocation
	 */
	return s->size;
}

static inline int order_objects(int order, unsigned long size, int reserved)
{
	return ((PAGE_SIZE << order) - reserved) / size;
}

static inline struct kmem_cache_order_objects oo_make(int order,
		unsigned long size, int reserved)
{
	struct kmem_cache_order_objects x = {
		(order << OO_SHIFT) + order_objects(order, size, reserved)
	};

	return x;
}

static inline int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

static inline int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}
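
/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096 and reserved == 0):
 * oo_make(3, 256, 0) computes (4096 << 3) / 256 = 128 objects and packs the
 * pair as x.x = (3 << OO_SHIFT) + 128, so that oo_order() recovers 3 and
 * oo_objects() recovers 128.
 */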

/*
 * Per slab locking using the pagelock
 */
static __always_inline void slab_lock(struct page *page)
{
	bit_spin_lock(PG_locked, &page->flags);
}

static __always_inline void slab_unlock(struct page *page)
{
	__bit_spin_unlock(PG_locked, &page->flags);
}

static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
{
	struct page tmp;
	tmp.counters = counters_new;
	/*
	 * page->counters can cover frozen/inuse/objects as well
	 * as page->_count.  If we assign to ->counters directly
	 * we run the risk of losing updates to page->_count, so
	 * be careful and only assign to the fields we need.
	 */
	page->frozen  = tmp.frozen;
	page->inuse   = tmp.inuse;
	page->objects = tmp.objects;
}

/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
	VM_BUG_ON(!irqs_disabled());
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&page->freelist, &page->counters,
				   freelist_old, counters_old,
				   freelist_new, counters_new))
			return true;
	} else
#endif
	{
		slab_lock(page);
		if (page->freelist == freelist_old &&
					page->counters == counters_old) {
			page->freelist = freelist_new;
			set_page_slub_counters(page, counters_new);
			slab_unlock(page);
			return true;
		}
		slab_unlock(page);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}

static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&page->freelist, &page->counters,
				   freelist_old, counters_old,
				   freelist_new, counters_new))
			return true;
	} else
#endif
	{
		unsigned long flags;

		local_irq_save(flags);
		slab_lock(page);
		if (page->freelist == freelist_old &&
					page->counters == counters_old) {
			page->freelist = freelist_new;
			set_page_slub_counters(page, counters_new);
			slab_unlock(page);
			local_irq_restore(flags);
			return true;
		}
		slab_unlock(page);
		local_irq_restore(flags);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}
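
/*
 * Usage sketch (illustrative): callers elsewhere in this file read the
 * current freelist/counters pair, build the desired new values and then
 * either give up or retry when the update races, roughly:
 *
 *	do {
 *		old.freelist = page->freelist;
 *		old.counters = page->counters;
 *		new.counters = old.counters;
 *		... compute new.freelist, new.inuse, new.frozen ...
 *	} while (!cmpxchg_double_slab(s, page, old.freelist, old.counters,
 *				      new.freelist, new.counters, "caller"));
 */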

#ifdef CONFIG_SLUB_DEBUG
/*
 * Determine a map of object in use on a page.
 *
 * Node list_lock must be held to guarantee that the page does
 * not vanish from under us.
 */
static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
{
	void *p;
	void *addr = page_address(page);

	for (p = page->freelist; p; p = get_freepointer(s, p))
		set_bit(slab_index(p, s, addr), map);
}

/*
 * Debug settings:
 */
#if defined(CONFIG_SLUB_DEBUG_ON)
static int slub_debug = DEBUG_DEFAULT_FLAGS;
#elif defined(CONFIG_KASAN)
static int slub_debug = SLAB_STORE_USER;
#else
static int slub_debug;
#endif

static char *slub_debug_slabs;
static int disable_higher_order_debug;

/*
 * slub is about to manipulate internal object metadata.  This memory lies
 * outside the range of the allocated object, so accessing it would normally
 * be reported by kasan as a bounds error.  metadata_access_enable() is used
 * to tell kasan that these accesses are OK.
 */
static inline void metadata_access_enable(void)
{
	kasan_disable_current();
}

static inline void metadata_access_disable(void)
{
	kasan_enable_current();
}

/*
 * Object debugging
 */
static void print_section(char *text, u8 *addr, unsigned int length)
{
	metadata_access_enable();
	print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
			length, 1);
	metadata_access_disable();
}

static struct track *get_track(struct kmem_cache *s, void *object,
	enum track_item alloc)
{
	struct track *p;

	if (s->offset)
		p = object + s->offset + sizeof(void *);
	else
		p = object + s->inuse;

	return p + alloc;
}

static void set_track(struct kmem_cache *s, void *object,
			enum track_item alloc, unsigned long addr)
{
	struct track *p = get_track(s, object, alloc);

	if (addr) {
#ifdef CONFIG_STACKTRACE
		struct stack_trace trace;
		int i;

		trace.nr_entries = 0;
		trace.max_entries = TRACK_ADDRS_COUNT;
		trace.entries = p->addrs;
		trace.skip = 3;
		metadata_access_enable();
		save_stack_trace(&trace);
		metadata_access_disable();

		/* See rant in lockdep.c */
		if (trace.nr_entries != 0 &&
		    trace.entries[trace.nr_entries - 1] == ULONG_MAX)
			trace.nr_entries--;

		for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
			p->addrs[i] = 0;
#endif
		p->addr = addr;
		p->cpu = smp_processor_id();
		p->pid = current->pid;
		p->when = jiffies;
	} else
		memset(p, 0, sizeof(struct track));
}

static void init_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	set_track(s, object, TRACK_FREE, 0UL);
	set_track(s, object, TRACK_ALLOC, 0UL);
}

static void print_track(const char *s, struct track *t)
{
	if (!t->addr)
		return;

	pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
	       s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
	{
		int i;
		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
			if (t->addrs[i])
				pr_err("\t%pS\n", (void *)t->addrs[i]);
			else
				break;
	}
#endif
}

static void print_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	print_track("Allocated", get_track(s, object, TRACK_ALLOC));
	print_track("Freed", get_track(s, object, TRACK_FREE));
}

static void print_page_info(struct page *page)
{
	pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
	       page, page->objects, page->inuse, page->freelist, page->flags);

}

static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("=============================================================================\n");
	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
	pr_err("-----------------------------------------------------------------------------\n\n");

	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
	va_end(args);
}

static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("FIX %s: %pV\n", s->name, &vaf);
	va_end(args);
}

static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned int off;	/* Offset of last byte */
	u8 *addr = page_address(page);

	print_tracking(s, p);

	print_page_info(page);

	pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
	       p, p - addr, get_freepointer(s, p));

	if (p > addr + 16)
		print_section("Bytes b4 ", p - 16, 16);

	print_section("Object ", p, min_t(unsigned long, s->object_size,
				PAGE_SIZE));
	if (s->flags & SLAB_RED_ZONE)
		print_section("Redzone ", p + s->object_size,
			s->inuse - s->object_size);

	if (s->offset)
		off = s->offset + sizeof(void *);
	else
		off = s->inuse;

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	if (off != s->size)
		/* Beginning of the filler is the free pointer */
		print_section("Padding ", p + off, s->size - off);

	dump_stack();
}

void object_err(struct kmem_cache *s, struct page *page,
			u8 *object, char *reason)
{
	slab_bug(s, "%s", reason);
	print_trailer(s, page, object);
}

static void slab_err(struct kmem_cache *s, struct page *page,
			const char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	slab_bug(s, "%s", buf);
	print_page_info(page);
	dump_stack();
}

static void init_object(struct kmem_cache *s, void *object, u8 val)
{
	u8 *p = object;

	if (s->flags & __OBJECT_POISON) {
		memset(p, POISON_FREE, s->object_size - 1);
		p[s->object_size - 1] = POISON_END;
	}

	if (s->flags & SLAB_RED_ZONE)
		memset(p + s->object_size, val, s->inuse - s->object_size);
}

static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
						void *from, void *to)
{
	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
	memset(from, data, to - from);
}

static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
			u8 *object, char *what,
			u8 *start, unsigned int value, unsigned int bytes)
{
	u8 *fault;
	u8 *end;

	metadata_access_enable();
	fault = memchr_inv(start, value, bytes);
	metadata_access_disable();
	if (!fault)
		return 1;

	end = start + bytes;
	while (end > fault && end[-1] == value)
		end--;

	slab_bug(s, "%s overwritten", what);
	pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
					fault, end - 1, fault[0], value);
	print_trailer(s, page, object);

	restore_bytes(s, what, value, fault, end);
	return 0;
}

/*
 * Object layout:
 *
 * object address
 * 	Bytes of the object to be managed.
 * 	If the freepointer may overlay the object then the free
 * 	pointer is the first word of the object.
 *
 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 * 	0xa5 (POISON_END)
 *
 * object + s->object_size
 * 	Padding to reach word boundary. This is also used for Redzoning.
 * 	Padding is extended by another word if Redzoning is enabled and
 * 	object_size == inuse.
 *
 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 * 	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 * 	Meta data starts here.
 *
 * 	A. Free pointer (if we cannot overwrite object on free)
 * 	B. Tracking data for SLAB_STORE_USER
 * 	C. Padding to reach required alignment boundary or at minimum
 * 		one word if debugging is on to be able to detect writes
 * 		before the word boundary.
 *
 *	Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 * 	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the object_size and inuse boundaries are mostly
 * ignored. And therefore no slab options that rely on these boundaries
 * may be used with merged slabcaches.
 */
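
/*
 * Rough numeric sketch (illustrative only): for a hypothetical cache with
 * object_size == 24 and red zoning enabled, the word-aligned red zone brings
 * inuse up to 32; with SLAB_STORE_USER two struct track records follow at
 * object + 32, and any bytes left before s->size are POISON_INUSE padding
 * that check_pad_bytes() below verifies.
 */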

static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned long off = s->inuse;	/* The end of info */

	if (s->offset)
		/* Freepointer is placed after the object. */
		off += sizeof(void *);

	if (s->flags & SLAB_STORE_USER)
		/* We also have user information there */
		off += 2 * sizeof(struct track);

	if (s->size == off)
		return 1;

	return check_bytes_and_report(s, page, p, "Object padding",
				p + off, POISON_INUSE, s->size - off);
}

/* Check the pad bytes at the end of a slab page */
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return 1;

	start = page_address(page);
	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
	end = start + length;
	remainder = length % s->size;
	if (!remainder)
		return 1;

	metadata_access_enable();
	fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
	metadata_access_disable();
	if (!fault)
		return 1;
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
	print_section("Padding ", end - remainder, remainder);

	restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
	return 0;
}

static int check_object(struct kmem_cache *s, struct page *page,
					void *object, u8 val)
{
	u8 *p = object;
	u8 *endobject = object + s->object_size;

	if (s->flags & SLAB_RED_ZONE) {
		if (!check_bytes_and_report(s, page, object, "Redzone",
			endobject, val, s->inuse - s->object_size))
			return 0;
	} else {
		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
			check_bytes_and_report(s, page, p, "Alignment padding",
				endobject, POISON_INUSE,
				s->inuse - s->object_size);
		}
	}

	if (s->flags & SLAB_POISON) {
		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
			(!check_bytes_and_report(s, page, p, "Poison", p,
					POISON_FREE, s->object_size - 1) ||
			 !check_bytes_and_report(s, page, p, "Poison",
				p + s->object_size - 1, POISON_END, 1)))
			return 0;
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		check_pad_bytes(s, page, p);
	}

	if (!s->offset && val == SLUB_RED_ACTIVE)
		/*
		 * Object and freepointer overlap. Cannot check
		 * freepointer while object is allocated.
		 */
		return 1;

	/* Check free pointer validity */
	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
		object_err(s, page, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		return 0;
	}
	return 1;
}

static int check_slab(struct kmem_cache *s, struct page *page)
{
	int maxobj;

	VM_BUG_ON(!irqs_disabled());

	if (!PageSlab(page)) {
		slab_err(s, page, "Not a valid slab page");
		return 0;
	}

	maxobj = order_objects(compound_order(page), s->size, s->reserved);
	if (page->objects > maxobj) {
		slab_err(s, page, "objects %u > max %u",
			page->objects, maxobj);
		return 0;
	}
	if (page->inuse > page->objects) {
		slab_err(s, page, "inuse %u > max %u",
			page->inuse, page->objects);
		return 0;
	}
	/* Slab_pad_check fixes things up after itself */
	slab_pad_check(s, page);
	return 1;
}

/*
 * Determine if a certain object on a page is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 */
static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
{
	int nr = 0;
	void *fp;
	void *object = NULL;
	int max_objects;

	fp = page->freelist;
	while (fp && nr <= page->objects) {
		if (fp == search)
			return 1;
		if (!check_valid_pointer(s, page, fp)) {
			if (object) {
				object_err(s, page, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
			} else {
				slab_err(s, page, "Freepointer corrupt");
				page->freelist = NULL;
				page->inuse = page->objects;
				slab_fix(s, "Freelist cleared");
				return 0;
			}
			break;
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	max_objects = order_objects(compound_order(page), s->size, s->reserved);
	if (max_objects > MAX_OBJS_PER_PAGE)
		max_objects = MAX_OBJS_PER_PAGE;

	if (page->objects != max_objects) {
		slab_err(s, page, "Wrong number of objects. Found %d but "
			"should be %d", page->objects, max_objects);
		page->objects = max_objects;
		slab_fix(s, "Number of objects adjusted.");
	}
	if (page->inuse != page->objects - nr) {
		slab_err(s, page, "Wrong object count. Counter is %d but "
			"counted were %d", page->inuse, page->objects - nr);
		page->inuse = page->objects - nr;
		slab_fix(s, "Object count adjusted.");
	}
	return search == NULL;
}

static void trace(struct kmem_cache *s, struct page *page, void *object,
								int alloc)
{
	if (s->flags & SLAB_TRACE) {
		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
			s->name,
			alloc ? "alloc" : "free",
			object, page->inuse,
			page->freelist);

		if (!alloc)
			print_section("Object ", (void *)object,
					s->object_size);

		dump_stack();
	}
}

963
/*
C
Christoph Lameter 已提交
964
 * Tracking of fully allocated slabs for debugging purposes.
965
 */
966 967
static void add_full(struct kmem_cache *s,
	struct kmem_cache_node *n, struct page *page)
968
{
969 970 971
	if (!(s->flags & SLAB_STORE_USER))
		return;

972
	lockdep_assert_held(&n->list_lock);
973 974 975
	list_add(&page->lru, &n->full);
}

P
Peter Zijlstra 已提交
976
static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
977 978 979 980
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

981
	lockdep_assert_held(&n->list_lock);
982 983 984
	list_del(&page->lru);
}

985 986 987 988 989 990 991 992
/* Tracking of the number of slabs for debugging purposes */
static inline unsigned long slabs_node(struct kmem_cache *s, int node)
{
	struct kmem_cache_node *n = get_node(s, node);

	return atomic_long_read(&n->nr_slabs);
}

993 994 995 996 997
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->nr_slabs);
}

998
static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
999 1000 1001 1002 1003 1004 1005 1006 1007
{
	struct kmem_cache_node *n = get_node(s, node);

	/*
	 * May be called early in order to allocate a slab for the
	 * kmem_cache_node structure. Solve the chicken-egg
	 * dilemma by deferring the increment of the count during
	 * bootstrap (see early_kmem_cache_node_alloc).
	 */
	if (likely(n)) {
		atomic_long_inc(&n->nr_slabs);
		atomic_long_add(objects, &n->total_objects);
	}
}
static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	atomic_long_dec(&n->nr_slabs);
	atomic_long_sub(objects, &n->total_objects);
}

/* Object debug checks for alloc/free paths */
static void setup_object_debug(struct kmem_cache *s, struct page *page,
								void *object)
{
	if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
		return;

	init_object(s, object, SLUB_RED_INACTIVE);
	init_tracking(s, object);
}

static noinline int alloc_debug_processing(struct kmem_cache *s,
					struct page *page,
					void *object, unsigned long addr)
{
	if (!check_slab(s, page))
		goto bad;

	if (!check_valid_pointer(s, page, object)) {
		object_err(s, page, object, "Freelist Pointer check fails");
		goto bad;
	}

	if (!check_object(s, page, object, SLUB_RED_INACTIVE))
		goto bad;

	/* Success perform special debug activities for allocs */
	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_ALLOC, addr);
	trace(s, page, object, 1);
	init_object(s, object, SLUB_RED_ACTIVE);
	return 1;

bad:
	if (PageSlab(page)) {
		/*
		 * If this is a slab page then lets do the best we can
		 * to avoid issues in the future. Marking all objects
		 * as used avoids touching the remaining objects.
		 */
		slab_fix(s, "Marking all objects used");
		page->inuse = page->objects;
		page->freelist = NULL;
	}
	return 0;
}

static noinline struct kmem_cache_node *free_debug_processing(
	struct kmem_cache *s, struct page *page, void *object,
	unsigned long addr, unsigned long *flags)
{
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));

	spin_lock_irqsave(&n->list_lock, *flags);
	slab_lock(page);

	if (!check_slab(s, page))
		goto fail;

	if (!check_valid_pointer(s, page, object)) {
		slab_err(s, page, "Invalid object pointer 0x%p", object);
		goto fail;
	}

	if (on_freelist(s, page, object)) {
		object_err(s, page, object, "Object already free");
		goto fail;
	}

	if (!check_object(s, page, object, SLUB_RED_ACTIVE))
		goto out;

	if (unlikely(s != page->slab_cache)) {
		if (!PageSlab(page)) {
			slab_err(s, page, "Attempt to free object(0x%p) "
				"outside of slab", object);
		} else if (!page->slab_cache) {
			pr_err("SLUB <none>: no slab for object 0x%p.\n",
			       object);
			dump_stack();
		} else
			object_err(s, page, object,
					"page slab pointer corrupt.");
		goto fail;
	}

	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_FREE, addr);
	trace(s, page, object, 0);
	init_object(s, object, SLUB_RED_INACTIVE);
out:
	slab_unlock(page);
	/*
	 * Keep node_lock to preserve integrity
	 * until the object is actually freed
	 */
	return n;

fail:
	slab_unlock(page);
	spin_unlock_irqrestore(&n->list_lock, *flags);
	slab_fix(s, "Object at 0x%p not freed", object);
	return NULL;
}

static int __init setup_slub_debug(char *str)
{
	slub_debug = DEBUG_DEFAULT_FLAGS;
	if (*str++ != '=' || !*str)
		/*
		 * No options specified. Switch on full debugging.
		 */
		goto out;

	if (*str == ',')
		/*
		 * No options but restriction on slabs. This means full
		 * debugging for slabs matching a pattern.
		 */
		goto check_slabs;

	slub_debug = 0;
	if (*str == '-')
		/*
		 * Switch off all debugging measures.
		 */
		goto out;

	/*
	 * Determine which debug features should be switched on
	 */
	for (; *str && *str != ','; str++) {
		switch (tolower(*str)) {
		case 'f':
			slub_debug |= SLAB_DEBUG_FREE;
			break;
		case 'z':
			slub_debug |= SLAB_RED_ZONE;
			break;
		case 'p':
			slub_debug |= SLAB_POISON;
			break;
		case 'u':
			slub_debug |= SLAB_STORE_USER;
			break;
		case 't':
			slub_debug |= SLAB_TRACE;
			break;
		case 'a':
			slub_debug |= SLAB_FAILSLAB;
			break;
		case 'o':
			/*
			 * Avoid enabling debugging on caches if its minimum
			 * order would increase as a result.
			 */
			disable_higher_order_debug = 1;
			break;
		default:
			pr_err("slub_debug option '%c' unknown. skipped\n",
			       *str);
		}
	}

check_slabs:
	if (*str == ',')
		slub_debug_slabs = str + 1;
out:
	return 1;
}

__setup("slub_debug", setup_slub_debug);
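
/*
 * Usage example (illustrative): booting with
 *
 *	slub_debug=FZP,kmalloc-64
 *
 * enables sanity checks (F), red zoning (Z) and poisoning (P) only for the
 * kmalloc-64 cache, while a plain "slub_debug" switches on
 * DEBUG_DEFAULT_FLAGS for all caches and "slub_debug=-" disables debugging
 * entirely.
 */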

unsigned long kmem_cache_flags(unsigned long object_size,
	unsigned long flags, const char *name,
	void (*ctor)(void *))
{
	/*
	 * Enable debugging if selected on the kernel commandline.
	 */
	if (slub_debug && (!slub_debug_slabs || (name &&
		!strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
		flags |= slub_debug;

	return flags;
}
#else
static inline void setup_object_debug(struct kmem_cache *s,
			struct page *page, void *object) {}

static inline int alloc_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, unsigned long addr) { return 0; }

static inline struct kmem_cache_node *free_debug_processing(
	struct kmem_cache *s, struct page *page, void *object,
	unsigned long addr, unsigned long *flags) { return NULL; }

static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
			{ return 1; }
static inline int check_object(struct kmem_cache *s, struct page *page,
			void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct page *page) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct page *page) {}
unsigned long kmem_cache_flags(unsigned long object_size,
	unsigned long flags, const char *name,
	void (*ctor)(void *))
{
	return flags;
}
#define slub_debug 0

#define disable_higher_order_debug 0

static inline unsigned long slabs_node(struct kmem_cache *s, int node)
							{ return 0; }
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
							{ return 0; }
static inline void inc_slabs_node(struct kmem_cache *s, int node,
							int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
							int objects) {}

#endif /* CONFIG_SLUB_DEBUG */

/*
 * Hooks for other subsystems that check memory allocations. In a typical
 * production configuration these hooks all should produce no code at all.
 */
static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
	kmemleak_alloc(ptr, size, 1, flags);
	kasan_kmalloc_large(ptr, size);
}

static inline void kfree_hook(const void *x)
{
	kmemleak_free(x);
	kasan_kfree_large(x);
}

static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
						     gfp_t flags)
{
	flags &= gfp_allowed_mask;
	lockdep_trace_alloc(flags);
	might_sleep_if(gfpflags_allow_blocking(flags));

	if (should_failslab(s->object_size, flags, s->flags))
		return NULL;

	return memcg_kmem_get_cache(s, flags);
}

static inline void slab_post_alloc_hook(struct kmem_cache *s,
					gfp_t flags, void *object)
{
	flags &= gfp_allowed_mask;
	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
	memcg_kmem_put_cache(s);
	kasan_slab_alloc(s, object);
}

static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
	kmemleak_free_recursive(x, s->flags);

	/*
	 * Trouble is that we may no longer disable interrupts in the fast path
	 * So in order to make the debug calls that expect irqs to be
	 * disabled we need to disable interrupts temporarily.
	 */
#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
	{
		unsigned long flags;

		local_irq_save(flags);
		kmemcheck_slab_free(s, x, s->object_size);
		debug_check_no_locks_freed(x, s->object_size);
		local_irq_restore(flags);
	}
#endif
	if (!(s->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(x, s->object_size);

	kasan_slab_free(s, x);
}

static void setup_object(struct kmem_cache *s, struct page *page,
				void *object)
{
	setup_object_debug(s, page, object);
	if (unlikely(s->ctor)) {
		kasan_unpoison_object_data(s, object);
		s->ctor(object);
		kasan_poison_object_data(s, object);
	}
}

/*
 * Slab allocation and freeing
 */
static inline struct page *alloc_slab_page(struct kmem_cache *s,
		gfp_t flags, int node, struct kmem_cache_order_objects oo)
{
	struct page *page;
	int order = oo_order(oo);

	flags |= __GFP_NOTRACK;

	if (node == NUMA_NO_NODE)
		page = alloc_pages(flags, order);
	else
		page = __alloc_pages_node(node, flags, order);

	if (page && memcg_charge_slab(page, flags, order, s)) {
		__free_pages(page, order);
		page = NULL;
	}

	return page;
}

static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	struct kmem_cache_order_objects oo = s->oo;
	gfp_t alloc_gfp;
	void *start, *p;
	int idx, order;

	flags &= gfp_allowed_mask;

	if (gfpflags_allow_blocking(flags))
		local_irq_enable();

	flags |= s->allocflags;

	/*
	 * Let the initial higher-order allocation fail under memory pressure
	 * so we fall-back to the minimum order allocation.
	 */
	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;

	page = alloc_slab_page(s, alloc_gfp, node, oo);
	if (unlikely(!page)) {
		oo = s->min;
		alloc_gfp = flags;
		/*
		 * Allocation may have failed due to fragmentation.
		 * Try a lower order alloc if possible
		 */
		page = alloc_slab_page(s, alloc_gfp, node, oo);
		if (unlikely(!page))
			goto out;
		stat(s, ORDER_FALLBACK);
	}

	if (kmemcheck_enabled &&
	    !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
		int pages = 1 << oo_order(oo);

		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);

		/*
		 * Objects from caches that have a constructor don't get
		 * cleared when they're allocated, so we need to do it here.
		 */
		if (s->ctor)
			kmemcheck_mark_uninitialized_pages(page, pages);
		else
			kmemcheck_mark_unallocated_pages(page, pages);
	}

	page->objects = oo_objects(oo);

	order = compound_order(page);
	page->slab_cache = s;
	__SetPageSlab(page);
	if (page_is_pfmemalloc(page))
		SetPageSlabPfmemalloc(page);

	start = page_address(page);

	if (unlikely(s->flags & SLAB_POISON))
		memset(start, POISON_INUSE, PAGE_SIZE << order);

	kasan_poison_slab(page);

	for_each_object_idx(p, idx, s, start, page->objects) {
		setup_object(s, page, p);
		if (likely(idx < page->objects))
			set_freepointer(s, p, p + s->size);
		else
			set_freepointer(s, p, NULL);
	}

	page->freelist = start;
	page->inuse = page->objects;
	page->frozen = 1;

out:
	if (gfpflags_allow_blocking(flags))
		local_irq_disable();
	if (!page)
		return NULL;

	mod_zone_page_state(page_zone(page),
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		1 << oo_order(oo));

	inc_slabs_node(s, page_to_nid(page), page->objects);

C
}

static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
		pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
		BUG();
	}

	return allocate_slab(s,
		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}

static void __free_slab(struct kmem_cache *s, struct page *page)
{
	int order = compound_order(page);
	int pages = 1 << order;

	if (kmem_cache_debug(s)) {
		void *p;

		slab_pad_check(s, page);
		for_each_object(p, s, page_address(page),
						page->objects)
			check_object(s, page, p, SLUB_RED_INACTIVE);
	}

	kmemcheck_free_shadow(page, compound_order(page));

	mod_zone_page_state(page_zone(page),
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		-pages);

	__ClearPageSlabPfmemalloc(page);
	__ClearPageSlab(page);

	page_mapcount_reset(page);
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += pages;
	__free_kmem_pages(page, order);
}

#define need_reserve_slab_rcu						\
	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))

static void rcu_free_slab(struct rcu_head *h)
{
	struct page *page;

	if (need_reserve_slab_rcu)
		page = virt_to_head_page(h);
	else
		page = container_of((struct list_head *)h, struct page, lru);

	__free_slab(page->slab_cache, page);
}

static void free_slab(struct kmem_cache *s, struct page *page)
{
	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
		struct rcu_head *head;

		if (need_reserve_slab_rcu) {
			int order = compound_order(page);
			int offset = (PAGE_SIZE << order) - s->reserved;

			VM_BUG_ON(s->reserved != sizeof(*head));
			head = page_address(page) + offset;
		} else {
			head = &page->rcu_head;
		}

		call_rcu(head, rcu_free_slab);
	} else
		__free_slab(s, page);
}

static void discard_slab(struct kmem_cache *s, struct page *page)
{
	dec_slabs_node(s, page_to_nid(page), page->objects);
	free_slab(s, page);
}

/*
 * Management of partially allocated slabs.
 */
static inline void
__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
{
	n->nr_partial++;
	if (tail == DEACTIVATE_TO_TAIL)
		list_add_tail(&page->lru, &n->partial);
	else
		list_add(&page->lru, &n->partial);
}

static inline void add_partial(struct kmem_cache_node *n,
				struct page *page, int tail)
{
	lockdep_assert_held(&n->list_lock);
	__add_partial(n, page, tail);
}

static inline void
__remove_partial(struct kmem_cache_node *n, struct page *page)
{
	list_del(&page->lru);
	n->nr_partial--;
}

static inline void remove_partial(struct kmem_cache_node *n,
					struct page *page)
{
	lockdep_assert_held(&n->list_lock);
	__remove_partial(n, page);
}

/*
 * Remove slab from the partial list, freeze it and
 * return the pointer to the freelist.
 *
 * Returns a list of objects or NULL if it fails.
 */
static inline void *acquire_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct page *page,
		int mode, int *objects)
{
	void *freelist;
	unsigned long counters;
	struct page new;

	lockdep_assert_held(&n->list_lock);

	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
	freelist = page->freelist;
	counters = page->counters;
	new.counters = counters;
	*objects = new.objects - new.inuse;
	if (mode) {
		new.inuse = page->objects;
		new.freelist = NULL;
	} else {
		new.freelist = freelist;
	}

	VM_BUG_ON(new.frozen);
	new.frozen = 1;

	if (!__cmpxchg_double_slab(s, page,
			freelist, counters,
			new.freelist, new.counters,
			"acquire_slab"))
		return NULL;

	remove_partial(n, page);
	WARN_ON(!freelist);
	return freelist;
}

static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);

/*
 * Try to allocate a partial slab from a specific node.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
				struct kmem_cache_cpu *c, gfp_t flags)
{
	struct page *page, *page2;
	void *object = NULL;
	int available = 0;
	int objects;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partials()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock(&n->list_lock);
	list_for_each_entry_safe(page, page2, &n->partial, lru) {
		void *t;

		if (!pfmemalloc_match(page, flags))
			continue;

		t = acquire_slab(s, n, page, object == NULL, &objects);
		if (!t)
			break;

		available += objects;
		if (!object) {
			c->page = page;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
			put_cpu_partial(s, page, 0);
			stat(s, CPU_PARTIAL_NODE);
		}
		if (!kmem_cache_has_cpu_partial(s)
			|| available > s->cpu_partial / 2)
			break;

	}
	spin_unlock(&n->list_lock);
	return object;
}

/*
 * Get a page from somewhere. Search in increasing NUMA distances.
 */
static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
		struct kmem_cache_cpu *c)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type high_zoneidx = gfp_zone(flags);
	void *object;
	unsigned int cpuset_mems_cookie;

	/*
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
	 *
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
	 *
	 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
	 * defrag_ratio = 1000) then every (well almost) allocation will
	 * first attempt to defrag slab caches on other nodes. This means
	 * scanning over all nodes to look for partial slabs which may be
	 * expensive if we do it every time we are trying to find a slab
	 * with available objects.
	 */
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return NULL;

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		zonelist = node_zonelist(mempolicy_slab_node(), flags);
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			if (n && cpuset_zone_allowed(zone, flags) &&
					n->nr_partial > s->min_partial) {
				object = get_partial_node(s, n, c, flags);
				if (object) {
					/*
					 * Don't check read_mems_allowed_retry()
					 * here - if mems_allowed was updated in
					 * parallel, that was a harmless race
					 * between allocation and the cpuset
					 * update
1710 1711 1712
					 */
					return object;
				}
1713
			}
C
Christoph Lameter 已提交
1714
		}
1715
	} while (read_mems_allowed_retry(cpuset_mems_cookie));
C
Christoph Lameter 已提交
1716 1717 1718 1719 1720 1721 1722
#endif
	return NULL;
}
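
/*
 * Illustrative tuning sketch (not code from this file): the ratio checked
 * above is exposed through sysfs; the attribute name is assumed here to be
 * remote_node_defrag_ratio under /sys/kernel/slab/<cache>/. With the
 * default of 1000 (shown as 100 to userspace), an administrator could do:
 *
 *	# cat /sys/kernel/slab/kmalloc-64/remote_node_defrag_ratio
 *	100
 *	# echo 0 > /sys/kernel/slab/kmalloc-64/remote_node_defrag_ratio
 *
 * Writing 0 makes get_any_partial() bail out immediately so allocations
 * stay node local; higher values make more slow-path allocations consider
 * remote partial slabs first.
 */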

/*
 * Get a partial page, lock it and return it.
 */
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
		struct kmem_cache_cpu *c)
{
	void *object;
	int searchnode = node;

	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();
	else if (!node_present_pages(node))
		searchnode = node_to_mem_node(node);

	object = get_partial_node(s, get_node(s, searchnode), c, flags);
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, flags, c);
}

#ifdef CONFIG_PREEMPT
/*
 * Calculate the next globally unique transaction for disambiguation
 * during cmpxchg. The transactions start with the cpu number and are then
 * incremented by CONFIG_NR_CPUS.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * No preemption supported therefore also no need to check for
 * different cpus.
 */
#define TID_STEP 1
#endif

static inline unsigned long next_tid(unsigned long tid)
{
	return tid + TID_STEP;
}

static inline unsigned int tid_to_cpu(unsigned long tid)
{
	return tid % TID_STEP;
}

static inline unsigned long tid_to_event(unsigned long tid)
{
	return tid / TID_STEP;
}

static inline unsigned int init_tid(int cpu)
{
	return cpu;
}
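
/*
 * Worked example (illustrative only): with CONFIG_PREEMPT and
 * CONFIG_NR_CPUS=4, TID_STEP is 4. cpu 2 then starts at tid 2 and
 * advances 2 -> 6 -> 10 -> ... through next_tid(). For any of those
 * values tid_to_cpu() yields 2 and tid_to_event() counts the number of
 * completed operations, which is what note_cmpxchg_failure() below
 * reports when a cmpxchg_double has to be retried.
 */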

static inline void note_cmpxchg_failure(const char *n,
		const struct kmem_cache *s, unsigned long tid)
{
#ifdef SLUB_DEBUG_CMPXCHG
	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);

	pr_info("%s %s: cmpxchg redo ", n, s->name);

#ifdef CONFIG_PREEMPT
	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
		pr_warn("due to cpu change %d -> %d\n",
			tid_to_cpu(tid), tid_to_cpu(actual_tid));
	else
#endif
	if (tid_to_event(tid) != tid_to_event(actual_tid))
		pr_warn("due to cpu running other code. Event %ld->%ld\n",
			tid_to_event(tid), tid_to_event(actual_tid));
	else
		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
			actual_tid, tid, next_tid(tid));
#endif
	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}

static void init_kmem_cache_cpus(struct kmem_cache *s)
{
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
}

/*
 * Remove the cpu slab
 */
static void deactivate_slab(struct kmem_cache *s, struct page *page,
				void *freelist)
{
1814 1815 1816 1817 1818
	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
	int lock = 0;
	enum slab_modes l = M_NONE, m = M_NONE;
	void *nextfree;
1819
	int tail = DEACTIVATE_TO_HEAD;
1820 1821 1822 1823
	struct page new;
	struct page old;

	if (page->freelist) {
1824
		stat(s, DEACTIVATE_REMOTE_FREES);
1825
		tail = DEACTIVATE_TO_TAIL;
1826 1827
	}

1828
	/*
1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845
	 * Stage one: Free all available per cpu objects back
	 * to the page freelist while it is still frozen. Leave the
	 * last one.
	 *
	 * There is no need to take the list->lock because the page
	 * is still frozen.
	 */
	while (freelist && (nextfree = get_freepointer(s, freelist))) {
		void *prior;
		unsigned long counters;

		do {
			prior = page->freelist;
			counters = page->counters;
			set_freepointer(s, freelist, prior);
			new.counters = counters;
			new.inuse--;
1846
			VM_BUG_ON(!new.frozen);
1847

1848
		} while (!__cmpxchg_double_slab(s, page,
1849 1850 1851 1852 1853 1854 1855
			prior, counters,
			freelist, new.counters,
			"drain percpu freelist"));

		freelist = nextfree;
	}

1856
	/*
1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868
	 * Stage two: Ensure that the page is unfrozen while the
	 * list presence reflects the actual number of objects
	 * during unfreeze.
	 *
	 * We setup the list membership and then perform a cmpxchg
	 * with the count. If there is a mismatch then the page
	 * is not unfrozen but the page is on the wrong list.
	 *
	 * Then we restart the process which may have to remove
	 * the page from the list that we just put it on again
	 * because the number of objects in the slab may have
	 * changed.
1869
	 */
1870
redo:
1871

1872 1873
	old.freelist = page->freelist;
	old.counters = page->counters;
1874
	VM_BUG_ON(!old.frozen);
1875

1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886
	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist) {
		new.inuse--;
		set_freepointer(s, freelist, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

1887
	if (!new.inuse && n->nr_partial >= s->min_partial)
1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919
		m = M_FREE;
	else if (new.freelist) {
		m = M_PARTIAL;
		if (!lock) {
			lock = 1;
			/*
			 * Taking the spinlock removes the possibility
			 * that acquire_slab() will see a slab page that
			 * is frozen
			 */
			spin_lock(&n->list_lock);
		}
	} else {
		m = M_FULL;
		if (kmem_cache_debug(s) && !lock) {
			lock = 1;
			/*
			 * This also ensures that the scanning of full
			 * slabs from diagnostic functions will not see
			 * any frozen slabs.
			 */
			spin_lock(&n->list_lock);
		}
	}

	if (l != m) {

		if (l == M_PARTIAL)

			remove_partial(n, page);

		else if (l == M_FULL)

			remove_full(s, n, page);

		if (m == M_PARTIAL) {

			add_partial(n, page, tail);
1926
			stat(s, tail);
1927 1928

		} else if (m == M_FULL) {
1929

1930 1931 1932 1933 1934 1935 1936
			stat(s, DEACTIVATE_FULL);
			add_full(s, n, page);

		}
	}

	l = m;
1937
	if (!__cmpxchg_double_slab(s, page,
1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"))
		goto redo;

	if (lock)
		spin_unlock(&n->list_lock);

	if (m == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, page);
		stat(s, FREE_SLAB);
1950
	}
}

/*
 * Unfreeze all the cpu partial slabs.
 *
 * This function must be called with interrupts disabled
 * for the cpu using c (or the caller must otherwise guarantee
 * that there are no concurrent accesses).
 */
static void unfreeze_partials(struct kmem_cache *s,
		struct kmem_cache_cpu *c)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct kmem_cache_node *n = NULL, *n2 = NULL;
	struct page *page, *discard_page = NULL;

	while ((page = c->partial)) {
		struct page new;
		struct page old;

		c->partial = page->next;

		n2 = get_node(s, page_to_nid(page));
		if (n != n2) {
			if (n)
				spin_unlock(&n->list_lock);

			n = n2;
			spin_lock(&n->list_lock);
		}

		do {

			old.freelist = page->freelist;
			old.counters = page->counters;
			VM_BUG_ON(!old.frozen);

			new.counters = old.counters;
			new.freelist = old.freelist;

			new.frozen = 0;

		} while (!__cmpxchg_double_slab(s, page,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"));

		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
			page->next = discard_page;
			discard_page = page;
		} else {
			add_partial(n, page, DEACTIVATE_TO_TAIL);
			stat(s, FREE_ADD_PARTIAL);
		}
	}

	if (n)
		spin_unlock(&n->list_lock);

	while (discard_page) {
		page = discard_page;
		discard_page = discard_page->next;

		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, page);
		stat(s, FREE_SLAB);
	}
#endif
}

/*
 * Put a page that was just frozen (in __slab_free) into a partial page
 * slot if available. This is done without interrupts disabled and without
 * preemption disabled. The cmpxchg is racy and may put the partial page
 * onto a random cpu's partial slot.
 *
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
 */
2030
static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2031
{
2032
#ifdef CONFIG_SLUB_CPU_PARTIAL
2033 2034 2035 2036
	struct page *oldpage;
	int pages;
	int pobjects;

2037
	preempt_disable();
2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052
	do {
		pages = 0;
		pobjects = 0;
		oldpage = this_cpu_read(s->cpu_slab->partial);

		if (oldpage) {
			pobjects = oldpage->pobjects;
			pages = oldpage->pages;
			if (drain && pobjects > s->cpu_partial) {
				unsigned long flags;
				/*
				 * partial array is full. Move the existing
				 * set to the per node partial list.
				 */
				local_irq_save(flags);
2053
				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2054
				local_irq_restore(flags);
2055
				oldpage = NULL;
2056 2057
				pobjects = 0;
				pages = 0;
2058
				stat(s, CPU_PARTIAL_DRAIN);
2059 2060 2061 2062 2063 2064 2065 2066 2067 2068
			}
		}

		pages++;
		pobjects += page->objects - page->inuse;

		page->pages = pages;
		page->pobjects = pobjects;
		page->next = oldpage;

2069 2070
	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
								!= oldpage);
2071 2072 2073 2074 2075 2076 2077 2078
	if (unlikely(!s->cpu_partial)) {
		unsigned long flags;

		local_irq_save(flags);
		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
		local_irq_restore(flags);
	}
	preempt_enable();
2079
#endif
2080 2081
}

2082
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
C
Christoph Lameter 已提交
2083
{
2084
	stat(s, CPUSLAB_FLUSH);
2085 2086 2087 2088 2089
	deactivate_slab(s, c->page, c->freelist);

	c->tid = next_tid(c->tid);
	c->page = NULL;
	c->freelist = NULL;
C
Christoph Lameter 已提交
2090 2091 2092 2093
}

/*
 * Flush cpu slab.
C
Christoph Lameter 已提交
2094
 *
C
Christoph Lameter 已提交
2095 2096
 * Called from IPI handler with interrupts disabled.
 */
2097
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
C
Christoph Lameter 已提交
2098
{
2099
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
C
Christoph Lameter 已提交
2100

2101 2102 2103 2104
	if (likely(c)) {
		if (c->page)
			flush_slab(s, c);

2105
		unfreeze_partials(s, c);
2106
	}
C
Christoph Lameter 已提交
2107 2108 2109 2110 2111 2112
}

static void flush_cpu_slab(void *d)
{
	struct kmem_cache *s = d;

2113
	__flush_cpu_slab(s, smp_processor_id());
C
Christoph Lameter 已提交
2114 2115
}

2116 2117 2118 2119 2120
static bool has_cpu_slab(int cpu, void *info)
{
	struct kmem_cache *s = info;
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);

2121
	return c->page || c->partial;
2122 2123
}

C
Christoph Lameter 已提交
2124 2125
static void flush_all(struct kmem_cache *s)
{
2126
	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
C
Christoph Lameter 已提交
2127 2128
}

2129 2130 2131 2132
/*
 * Check if the objects in a per cpu structure fit numa
 * locality expectations.
 */
2133
static inline int node_match(struct page *page, int node)
2134 2135
{
#ifdef CONFIG_NUMA
2136
	if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
2137 2138 2139 2140 2141
		return 0;
#endif
	return 1;
}

2142
#ifdef CONFIG_SLUB_DEBUG
P
Pekka Enberg 已提交
2143 2144 2145 2146 2147
static int count_free(struct page *page)
{
	return page->objects - page->inuse;
}

2148 2149 2150 2151 2152 2153 2154
static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->total_objects);
}
#endif /* CONFIG_SLUB_DEBUG */

#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
P
Pekka Enberg 已提交
2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167
static unsigned long count_partial(struct kmem_cache_node *n,
					int (*get_count)(struct page *))
{
	unsigned long flags;
	unsigned long x = 0;
	struct page *page;

	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry(page, &n->partial, lru)
		x += get_count(page);
	spin_unlock_irqrestore(&n->list_lock, flags);
	return x;
}
2168
#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
2169

P
Pekka Enberg 已提交
2170 2171 2172
static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
2173 2174 2175
#ifdef CONFIG_SLUB_DEBUG
	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
P
Pekka Enberg 已提交
2176
	int node;
C
Christoph Lameter 已提交
2177
	struct kmem_cache_node *n;
P
Pekka Enberg 已提交
2178

2179 2180 2181
	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
		return;

2182
	pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
P
Pekka Enberg 已提交
2183
		nid, gfpflags);
2184 2185 2186
	pr_warn("  cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
		s->name, s->object_size, s->size, oo_order(s->oo),
		oo_order(s->min));
P
Pekka Enberg 已提交
2187

2188
	if (oo_order(s->min) > get_order(s->object_size))
2189 2190
		pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
			s->name);
2191

C
Christoph Lameter 已提交
2192
	for_each_kmem_cache_node(s, node, n) {
P
Pekka Enberg 已提交
2193 2194 2195 2196
		unsigned long nr_slabs;
		unsigned long nr_objs;
		unsigned long nr_free;

2197 2198 2199
		nr_free  = count_partial(n, count_free);
		nr_slabs = node_nr_slabs(n);
		nr_objs  = node_nr_objs(n);
P
Pekka Enberg 已提交
2200

2201
		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
P
Pekka Enberg 已提交
2202 2203
			node, nr_slabs, nr_objs, nr_free);
	}
2204
#endif
P
Pekka Enberg 已提交
2205 2206
}

2207 2208 2209
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
			int node, struct kmem_cache_cpu **pc)
{
2210
	void *freelist;
2211 2212
	struct kmem_cache_cpu *c = *pc;
	struct page *page;
2213

2214
	freelist = get_partial(s, flags, node, c);
2215

2216 2217 2218 2219
	if (freelist)
		return freelist;

	page = new_slab(s, flags, node);
2220
	if (page) {
2221
		c = raw_cpu_ptr(s->cpu_slab);
2222 2223 2224 2225 2226 2227 2228
		if (c->page)
			flush_slab(s, c);

		/*
		 * No other reference to the page yet so we can
		 * muck around with it freely without cmpxchg
		 */
2229
		freelist = page->freelist;
2230 2231 2232 2233 2234 2235
		page->freelist = NULL;

		stat(s, ALLOC_SLAB);
		c->page = page;
		*pc = c;
	} else
2236
		freelist = NULL;
2237

2238
	return freelist;
2239 2240
}

2241 2242 2243 2244 2245 2246 2247 2248
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
{
	if (unlikely(PageSlabPfmemalloc(page)))
		return gfp_pfmemalloc_allowed(gfpflags);

	return true;
}

/*
 * Check the page->freelist of a page and either transfer the freelist to the
 * per cpu freelist or deactivate the page.
 *
 * The page is still frozen if the return value is not NULL.
 *
 * If this function returns NULL then the page has been unfrozen.
 *
 * This function must be called with interrupts disabled.
 */
static inline void *get_freelist(struct kmem_cache *s, struct page *page)
{
	struct page new;
	unsigned long counters;
	void *freelist;

	do {
		freelist = page->freelist;
		counters = page->counters;
2268

2269
		new.counters = counters;
2270
		VM_BUG_ON(!new.frozen);
2271 2272 2273 2274

		new.inuse = page->objects;
		new.frozen = freelist != NULL;

2275
	} while (!__cmpxchg_double_slab(s, page,
2276 2277 2278 2279 2280 2281 2282
		freelist, counters,
		NULL, new.counters,
		"get_freelist"));

	return freelist;
}

/*
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
 *
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
 *
 * And if we were unable to get a new slab from the partial slab lists then
 * we need to allocate a new slab. This is the slowest path since it involves
 * a call to the page allocator and the setup of a new slab.
 *
 * Version of __slab_alloc to use when we know that interrupts are
 * already disabled (which is the case for bulk allocation).
 */
2302
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2303
			  unsigned long addr, struct kmem_cache_cpu *c)
C
Christoph Lameter 已提交
2304
{
2305
	void *freelist;
2306
	struct page *page;
C
Christoph Lameter 已提交
2307

2308 2309
	page = c->page;
	if (!page)
C
Christoph Lameter 已提交
2310
		goto new_slab;
2311
redo:
2312

2313
	if (unlikely(!node_match(page, node))) {
2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325
		int searchnode = node;

		if (node != NUMA_NO_NODE && !node_present_pages(node))
			searchnode = node_to_mem_node(node);

		if (unlikely(!node_match(page, searchnode))) {
			stat(s, ALLOC_NODE_MISMATCH);
			deactivate_slab(s, page, c->freelist);
			c->page = NULL;
			c->freelist = NULL;
			goto new_slab;
		}
2326
	}
C
Christoph Lameter 已提交
2327

2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339
	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
	if (unlikely(!pfmemalloc_match(page, gfpflags))) {
		deactivate_slab(s, page, c->freelist);
		c->page = NULL;
		c->freelist = NULL;
		goto new_slab;
	}

2340
	/* must check again c->freelist in case of cpu migration or IRQ */
2341 2342
	freelist = c->freelist;
	if (freelist)
2343
		goto load_freelist;
2344

2345
	freelist = get_freelist(s, page);
C
Christoph Lameter 已提交
2346

2347
	if (!freelist) {
2348 2349
		c->page = NULL;
		stat(s, DEACTIVATE_BYPASS);
2350
		goto new_slab;
2351
	}
C
Christoph Lameter 已提交
2352

2353
	stat(s, ALLOC_REFILL);
C
Christoph Lameter 已提交
2354

2355
load_freelist:
2356 2357 2358 2359 2360
	/*
	 * freelist is pointing to the list of objects to be used.
	 * page is pointing to the page from which the objects are obtained.
	 * That page must be frozen for per cpu allocations to work.
	 */
2361
	VM_BUG_ON(!c->page->frozen);
2362
	c->freelist = get_freepointer(s, freelist);
2363
	c->tid = next_tid(c->tid);
2364
	return freelist;
C
Christoph Lameter 已提交
2365 2366

new_slab:
2367

2368
	if (c->partial) {
2369 2370
		page = c->page = c->partial;
		c->partial = page->next;
2371 2372 2373
		stat(s, CPU_PARTIAL_ALLOC);
		c->freelist = NULL;
		goto redo;
C
Christoph Lameter 已提交
2374 2375
	}

2376
	freelist = new_slab_objects(s, gfpflags, node, &c);
2377

2378
	if (unlikely(!freelist)) {
2379
		slab_out_of_memory(s, gfpflags, node);
2380
		return NULL;
C
Christoph Lameter 已提交
2381
	}
2382

2383
	page = c->page;
2384
	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2385
		goto load_freelist;
2386

2387
	/* Only entered in the debug case */
2388 2389
	if (kmem_cache_debug(s) &&
			!alloc_debug_processing(s, page, freelist, addr))
2390
		goto new_slab;	/* Slab failed checks. Next slab needed */
2391

2392
	deactivate_slab(s, page, get_freepointer(s, freelist));
2393 2394
	c->page = NULL;
	c->freelist = NULL;
2395
	return freelist;
2396 2397
}

2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422
/*
 * Another variant that disables interrupts and compensates for possible
 * cpu changes by refetching the per cpu area pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *p;
	unsigned long flags;

	local_irq_save(flags);
#ifdef CONFIG_PREEMPT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling interrupts. Need to reload cpu area
	 * pointer.
	 */
	c = this_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c);
	local_irq_restore(flags);
	return p;
}

2423 2424 2425 2426 2427 2428 2429 2430 2431 2432
/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
2433
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2434
		gfp_t gfpflags, int node, unsigned long addr)
2435 2436
{
	void **object;
2437
	struct kmem_cache_cpu *c;
2438
	struct page *page;
2439
	unsigned long tid;
2440

2441 2442
	s = slab_pre_alloc_hook(s, gfpflags);
	if (!s)
A
Akinobu Mita 已提交
2443
		return NULL;
2444 2445 2446 2447 2448 2449
redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
2450
	 *
2451 2452 2453
	 * We should guarantee that tid and kmem_cache are retrieved on
	 * the same cpu. It could be different if CONFIG_PREEMPT so we need
	 * to check if it is matched or not.
2454
	 */
2455 2456 2457
	do {
		tid = this_cpu_read(s->cpu_slab->tid);
		c = raw_cpu_ptr(s->cpu_slab);
2458 2459
	} while (IS_ENABLED(CONFIG_PREEMPT) &&
		 unlikely(tid != READ_ONCE(c->tid)));
2460 2461 2462 2463 2464 2465 2466 2467 2468 2469

	/*
	 * Irqless object alloc/free algorithm used here depends on sequence
	 * of fetching cpu_slab's data. tid should be fetched before anything
	 * on c to guarantee that object and page associated with previous tid
	 * won't be used with current tid. If we fetch tid first, object and
	 * page could be one associated with next tid and our alloc/free
	 * request will be failed. In this case, we will retry. So, no problem.
	 */
	barrier();
2470 2471 2472 2473 2474 2475 2476 2477

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they can guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */

2478
	object = c->freelist;
2479
	page = c->page;
D
Dave Hansen 已提交
2480
	if (unlikely(!object || !node_match(page, node))) {
2481
		object = __slab_alloc(s, gfpflags, node, addr, c);
D
Dave Hansen 已提交
2482 2483
		stat(s, ALLOC_SLOWPATH);
	} else {
2484 2485
		void *next_object = get_freepointer_safe(s, object);

2486
		/*
L
Lucas De Marchi 已提交
2487
		 * The cmpxchg will only match if there was no additional
2488 2489
		 * operation and if we are on the right processor.
		 *
2490 2491
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
2492 2493 2494 2495
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
2496 2497 2498
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
2499
		 */
2500
		if (unlikely(!this_cpu_cmpxchg_double(
2501 2502
				s->cpu_slab->freelist, s->cpu_slab->tid,
				object, tid,
2503
				next_object, next_tid(tid)))) {
2504 2505 2506 2507

			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
2508
		prefetch_freepointer(s, next_object);
2509
		stat(s, ALLOC_FASTPATH);
2510
	}
2511

2512
	if (unlikely(gfpflags & __GFP_ZERO) && object)
2513
		memset(object, 0, s->object_size);
2514

2515
	slab_post_alloc_hook(s, gfpflags, object);
V
Vegard Nossum 已提交
2516

2517
	return object;
C
Christoph Lameter 已提交
2518 2519
}

2520 2521 2522 2523 2524 2525
static __always_inline void *slab_alloc(struct kmem_cache *s,
		gfp_t gfpflags, unsigned long addr)
{
	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
}

C
Christoph Lameter 已提交
2526 2527
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
2528
	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
E
Eduard - Gabriel Munteanu 已提交
2529

2530 2531
	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
				s->size, gfpflags);
E
Eduard - Gabriel Munteanu 已提交
2532 2533

	return ret;
C
Christoph Lameter 已提交
2534 2535 2536
}
EXPORT_SYMBOL(kmem_cache_alloc);
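
/*
 * Minimal usage sketch (illustrative, not part of this file): a typical
 * caller creates a cache once and then allocates and frees objects from
 * it. The cache name and object size below are made up for the example.
 *
 *	struct kmem_cache *demo_cache;
 *
 *	demo_cache = kmem_cache_create("demo_cache", 128, 0,
 *				       SLAB_HWCACHE_ALIGN, NULL);
 *	if (demo_cache) {
 *		void *obj = kmem_cache_alloc(demo_cache, GFP_KERNEL);
 *
 *		if (obj)
 *			kmem_cache_free(demo_cache, obj);
 *		kmem_cache_destroy(demo_cache);
 *	}
 */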

2537
#ifdef CONFIG_TRACING
2538 2539
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
2540
	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2541
	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2542
	kasan_kmalloc(s, ret, size);
2543 2544 2545
	return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
E
Eduard - Gabriel Munteanu 已提交
2546 2547
#endif

C
Christoph Lameter 已提交
2548 2549 2550
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
2551
	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
E
Eduard - Gabriel Munteanu 已提交
2552

2553
	trace_kmem_cache_alloc_node(_RET_IP_, ret,
2554
				    s->object_size, s->size, gfpflags, node);
E
Eduard - Gabriel Munteanu 已提交
2555 2556

	return ret;
C
Christoph Lameter 已提交
2557 2558 2559
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
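
/*
 * Illustrative sketch: kmem_cache_alloc_node() behaves like
 * kmem_cache_alloc() but prefers a slab on the requested node, e.g.
 *
 *	void *obj = kmem_cache_alloc_node(s, GFP_KERNEL, numa_node_id());
 *
 * NUMA_NO_NODE may be passed instead when the caller has no preference.
 */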

2560
#ifdef CONFIG_TRACING
2561
void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
E
Eduard - Gabriel Munteanu 已提交
2562
				    gfp_t gfpflags,
2563
				    int node, size_t size)
E
Eduard - Gabriel Munteanu 已提交
2564
{
2565
	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2566 2567 2568

	trace_kmalloc_node(_RET_IP_, ret,
			   size, s->size, gfpflags, node);
2569 2570

	kasan_kmalloc(s, ret, size);
2571
	return ret;
E
Eduard - Gabriel Munteanu 已提交
2572
}
2573
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
E
Eduard - Gabriel Munteanu 已提交
2574
#endif
2575
#endif
E
Eduard - Gabriel Munteanu 已提交
2576

C
Christoph Lameter 已提交
2577
/*
K
Kim Phillips 已提交
2578
 * Slow path handling. This may still be called frequently since objects
2579
 * have a longer lifetime than the cpu slabs in most processing loads.
C
Christoph Lameter 已提交
2580
 *
2581 2582 2583
 * So we still attempt to reduce cache line usage. Just take the slab
 * lock and free the item. If there is no additional partial page
 * handling required then we can return immediately.
C
Christoph Lameter 已提交
2584
 */
2585
static void __slab_free(struct kmem_cache *s, struct page *page,
2586
			void *x, unsigned long addr)
C
Christoph Lameter 已提交
2587 2588 2589
{
	void *prior;
	void **object = (void *)x;
2590 2591 2592 2593
	int was_frozen;
	struct page new;
	unsigned long counters;
	struct kmem_cache_node *n = NULL;
2594
	unsigned long uninitialized_var(flags);
C
Christoph Lameter 已提交
2595

2596
	stat(s, FREE_SLOWPATH);
C
Christoph Lameter 已提交
2597

2598 2599
	if (kmem_cache_debug(s) &&
		!(n = free_debug_processing(s, page, x, addr, &flags)))
2600
		return;
C
Christoph Lameter 已提交
2601

2602
	do {
2603 2604 2605 2606
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}
2607 2608 2609 2610 2611 2612
		prior = page->freelist;
		counters = page->counters;
		set_freepointer(s, object, prior);
		new.counters = counters;
		was_frozen = new.frozen;
		new.inuse--;
2613
		if ((!new.inuse || !prior) && !was_frozen) {
2614

P
Peter Zijlstra 已提交
2615
			if (kmem_cache_has_cpu_partial(s) && !prior) {
2616 2617

				/*
2618 2619 2620 2621
				 * Slab was on no list before and will be
				 * partially empty
				 * We can defer the list move and instead
				 * freeze it.
2622 2623 2624
				 */
				new.frozen = 1;

P
Peter Zijlstra 已提交
2625
			} else { /* Needs to be taken off a list */
2626

2627
				n = get_node(s, page_to_nid(page));
2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638
				/*
				 * Speculatively acquire the list_lock.
				 * If the cmpxchg does not succeed then we may
				 * drop the list_lock without any processing.
				 *
				 * Otherwise the list_lock will synchronize with
				 * other processors updating the list of slabs.
				 */
				spin_lock_irqsave(&n->list_lock, flags);

			}
2639
		}
C
Christoph Lameter 已提交
2640

2641 2642 2643 2644
	} while (!cmpxchg_double_slab(s, page,
		prior, counters,
		object, new.counters,
		"__slab_free"));
C
Christoph Lameter 已提交
2645

2646
	if (likely(!n)) {
2647 2648 2649 2650 2651

		/*
		 * If we just froze the page then put it onto the
		 * per cpu partial list.
		 */
2652
		if (new.frozen && !was_frozen) {
2653
			put_cpu_partial(s, page, 1);
2654 2655
			stat(s, CPU_PARTIAL_FREE);
		}
2656
		/*
2657 2658 2659
		 * The list lock was not taken therefore no list
		 * activity can be necessary.
		 */
2660 2661 2662 2663
		if (was_frozen)
			stat(s, FREE_FROZEN);
		return;
	}
C
Christoph Lameter 已提交
2664

2665
	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
2666 2667
		goto slab_empty;

C
Christoph Lameter 已提交
2668
	/*
2669 2670
	 * Objects left in the slab. If it was not on the partial list before
	 * then add it.
C
Christoph Lameter 已提交
2671
	 */
2672 2673
	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
		if (kmem_cache_debug(s))
P
Peter Zijlstra 已提交
2674
			remove_full(s, n, page);
2675 2676
		add_partial(n, page, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
2677
	}
2678
	spin_unlock_irqrestore(&n->list_lock, flags);
C
Christoph Lameter 已提交
2679 2680 2681
	return;

slab_empty:
2682
	if (prior) {
C
Christoph Lameter 已提交
2683
		/*
2684
		 * Slab on the partial list.
C
Christoph Lameter 已提交
2685
		 */
2686
		remove_partial(n, page);
2687
		stat(s, FREE_REMOVE_PARTIAL);
P
Peter Zijlstra 已提交
2688
	} else {
2689
		/* Slab must be on the full list */
P
Peter Zijlstra 已提交
2690 2691
		remove_full(s, n, page);
	}
2692

2693
	spin_unlock_irqrestore(&n->list_lock, flags);
2694
	stat(s, FREE_SLAB);
C
Christoph Lameter 已提交
2695 2696 2697
	discard_slab(s, page);
}

2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708
/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This is typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
 */
static __always_inline void slab_free(struct kmem_cache *s,
2710
			struct page *page, void *x, unsigned long addr)
2711 2712
{
	void **object = (void *)x;
2713
	struct kmem_cache_cpu *c;
2714
	unsigned long tid;
2715

2716 2717
	slab_free_hook(s, x);

2718 2719 2720 2721 2722
redo:
	/*
	 * Determine the current cpu's per cpu slab.
	 * The cpu may change afterward. However that does not matter since
	 * data is retrieved via this pointer. If we are on the same cpu
2723
	 * during the cmpxchg then the free will succeed.
2724
	 */
2725 2726 2727
	do {
		tid = this_cpu_read(s->cpu_slab->tid);
		c = raw_cpu_ptr(s->cpu_slab);
2728 2729
	} while (IS_ENABLED(CONFIG_PREEMPT) &&
		 unlikely(tid != READ_ONCE(c->tid)));
2730

2731 2732
	/* Same with comment on barrier() in slab_alloc_node() */
	barrier();
2733

2734
	if (likely(page == c->page)) {
2735
		set_freepointer(s, object, c->freelist);
2736

2737
		if (unlikely(!this_cpu_cmpxchg_double(
2738 2739 2740 2741 2742 2743 2744
				s->cpu_slab->freelist, s->cpu_slab->tid,
				c->freelist, tid,
				object, next_tid(tid)))) {

			note_cmpxchg_failure("slab_free", s, tid);
			goto redo;
		}
2745
		stat(s, FREE_FASTPATH);
2746
	} else
2747
		__slab_free(s, page, x, addr);
2748 2749 2750

}

C
Christoph Lameter 已提交
2751 2752
void kmem_cache_free(struct kmem_cache *s, void *x)
{
2753 2754
	s = cache_from_obj(s, x);
	if (!s)
2755
		return;
2756
	slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2757
	trace_kmem_cache_free(_RET_IP_, x);
C
Christoph Lameter 已提交
2758 2759 2760
}
EXPORT_SYMBOL(kmem_cache_free);

2761
/* Note that interrupts must be enabled when calling this function. */
2762 2763
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774
	struct kmem_cache_cpu *c;
	struct page *page;
	int i;

	local_irq_disable();
	c = this_cpu_ptr(s->cpu_slab);

	for (i = 0; i < size; i++) {
		void *object = p[i];

		BUG_ON(!object);
2775 2776 2777 2778 2779 2780
		/* kmem cache debug support */
		s = cache_from_obj(s, object);
		if (unlikely(!s))
			goto exit;
		slab_free_hook(s, object);

2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795
		page = virt_to_head_page(object);

		if (c->page == page) {
			/* Fastpath: local CPU free */
			set_freepointer(s, object, c->freelist);
			c->freelist = object;
		} else {
			c->tid = next_tid(c->tid);
			local_irq_enable();
			/* Slowpath: overhead locked cmpxchg_double_slab */
			__slab_free(s, page, object, _RET_IP_);
			local_irq_disable();
			c = this_cpu_ptr(s->cpu_slab);
		}
	}
2796
exit:
2797 2798
	c->tid = next_tid(c->tid);
	local_irq_enable();
2799 2800 2801
}
EXPORT_SYMBOL(kmem_cache_free_bulk);

2802
/* Note that interrupts must be enabled when calling this function. */
2803
bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
2804
			   void **p)
2805
{
2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819
	struct kmem_cache_cpu *c;
	int i;

	/*
	 * Drain objects in the per cpu slab, while disabling local
	 * IRQs, which protects against PREEMPT and interrupt
	 * handlers invoking the normal fastpath.
	 */
	local_irq_disable();
	c = this_cpu_ptr(s->cpu_slab);

	for (i = 0; i < size; i++) {
		void *object = c->freelist;

2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835
		if (unlikely(!object)) {
			local_irq_enable();
			/*
			 * Invoking the slow path likely has the side-effect
			 * of re-populating the per CPU c->freelist
			 */
			p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
					    _RET_IP_, c);
			if (unlikely(!p[i])) {
				__kmem_cache_free_bulk(s, i, p);
				return false;
			}
			local_irq_disable();
			c = this_cpu_ptr(s->cpu_slab);
			continue; /* goto for-loop */
		}
2836

2837 2838 2839 2840 2841 2842 2843 2844 2845
		/* kmem_cache debug support */
		s = slab_pre_alloc_hook(s, flags);
		if (unlikely(!s)) {
			__kmem_cache_free_bulk(s, i, p);
			c->tid = next_tid(c->tid);
			local_irq_enable();
			return false;
		}

2846 2847
		c->freelist = get_freepointer(s, object);
		p[i] = object;
2848 2849 2850

		/* kmem_cache debug support */
		slab_post_alloc_hook(s, flags, object);
2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863
	}
	c->tid = next_tid(c->tid);
	local_irq_enable();

	/* Clear memory outside IRQ disabled fastpath loop */
	if (unlikely(flags & __GFP_ZERO)) {
		int j;

		for (j = 0; j < i; j++)
			memset(p[j], 0, s->object_size);
	}

	return true;
2864 2865 2866 2867
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
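
/*
 * Usage sketch for the bulk API (illustrative): the array size and the
 * cache pointer are assumptions of the example, not values from this file.
 *
 *	void *objs[16];
 *
 *	if (kmem_cache_alloc_bulk(s, GFP_KERNEL, ARRAY_SIZE(objs), objs)) {
 *		// ... use the 16 objects ...
 *		kmem_cache_free_bulk(s, ARRAY_SIZE(objs), objs);
 *	}
 *
 * On failure kmem_cache_alloc_bulk() frees whatever it already allocated
 * and returns false, so the caller never sees a partially filled array.
 */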


C
Christoph Lameter 已提交
2868
/*
C
Christoph Lameter 已提交
2869 2870 2871 2872
 * Object placement in a slab is made very easy because we always start at
 * offset 0. If we tune the size of the object to the alignment then we can
 * get the required alignment by putting one properly sized object after
 * another.
C
Christoph Lameter 已提交
2873 2874 2875 2876
 *
 * Notice that the allocation order determines the sizes of the per cpu
 * caches. Each processor has always one slab available for allocations.
 * Increasing the allocation order reduces the number of times that slabs
C
Christoph Lameter 已提交
2877
 * must be moved on and off the partial lists and is therefore a factor in
C
Christoph Lameter 已提交
2878 2879 2880 2881 2882 2883 2884 2885 2886 2887
 * locking overhead.
 */

/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
static int slub_min_order;
2888
static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2889
static int slub_min_objects;
C
Christoph Lameter 已提交
2890 2891 2892 2893

/*
 * Calculate the order of allocation given a slab object size.
 *
C
Christoph Lameter 已提交
2894 2895 2896 2897
 * The order of allocation has significant impact on performance and other
 * system components. Generally order 0 allocations should be preferred since
 * order 0 does not cause fragmentation in the page allocator. Larger objects
 * can be problematic to put into order 0 slabs because there may be too much
 * unused space left. We go to a higher order if more than 1/16th of the slab
C
Christoph Lameter 已提交
2899 2900 2901 2902 2903 2904
 * would be wasted.
 *
 * In order to reach satisfactory performance we must ensure that a minimum
 * number of objects is in one slab. Otherwise we may generate too much
 * activity on the partial lists which requires taking the list_lock. This is
 * less a concern for large slabs though which are rarely used.
C
Christoph Lameter 已提交
2905
 *
C
Christoph Lameter 已提交
2906 2907 2908 2909
 * slub_max_order specifies the order where we begin to stop considering the
 * number of objects in a slab as critical. If we reach slub_max_order then
 * we try to keep the page order as low as possible. So we accept more waste
 * of space in favor of a small page order.
C
Christoph Lameter 已提交
2910
 *
C
Christoph Lameter 已提交
2911 2912 2913 2914
 * Higher order allocations also allow the placement of more objects in a
 * slab and thereby reduce object handling overhead. If the user has
 * requested a higher minimum order then we start with that one instead of
 * the smallest order which will fit the object.
C
Christoph Lameter 已提交
2915
 */
2916
static inline int slab_order(int size, int min_objects,
2917
				int max_order, int fract_leftover, int reserved)
C
Christoph Lameter 已提交
2918 2919 2920
{
	int order;
	int rem;
2921
	int min_order = slub_min_order;
C
Christoph Lameter 已提交
2922

2923
	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
2924
		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
2925

2926
	for (order = max(min_order, get_order(min_objects * size + reserved));
2927
			order <= max_order; order++) {
C
Christoph Lameter 已提交
2928

2929
		unsigned long slab_size = PAGE_SIZE << order;
C
Christoph Lameter 已提交
2930

2931
		rem = (slab_size - reserved) % size;
C
Christoph Lameter 已提交
2932

2933
		if (rem <= slab_size / fract_leftover)
C
Christoph Lameter 已提交
2934 2935
			break;
	}
C
Christoph Lameter 已提交
2936

C
Christoph Lameter 已提交
2937 2938 2939
	return order;
}

2940
static inline int calculate_order(int size, int reserved)
2941 2942 2943 2944
{
	int order;
	int min_objects;
	int fraction;
2945
	int max_objects;
2946 2947 2948 2949 2950 2951

	/*
	 * Attempt to find best configuration for a slab. This
	 * works by first attempting to generate a layout with
	 * the best configuration and backing off gradually.
	 *
2952
	 * First we increase the acceptable waste in a slab. Then
2953 2954 2955
	 * we reduce the minimum objects required in a slab.
	 */
	min_objects = slub_min_objects;
2956 2957
	if (!min_objects)
		min_objects = 4 * (fls(nr_cpu_ids) + 1);
2958
	max_objects = order_objects(slub_max_order, size, reserved);
2959 2960
	min_objects = min(min_objects, max_objects);

2961
	while (min_objects > 1) {
C
Christoph Lameter 已提交
2962
		fraction = 16;
2963 2964
		while (fraction >= 4) {
			order = slab_order(size, min_objects,
2965
					slub_max_order, fraction, reserved);
2966 2967 2968 2969
			if (order <= slub_max_order)
				return order;
			fraction /= 2;
		}
2970
		min_objects--;
2971 2972 2973 2974 2975 2976
	}

	/*
	 * We were unable to place multiple objects in a slab. Now
	 * lets see if we can place a single object there.
	 */
2977
	order = slab_order(size, 1, slub_max_order, 1, reserved);
2978 2979 2980 2981 2982 2983
	if (order <= slub_max_order)
		return order;

	/*
	 * Doh this slab cannot be placed using slub_max_order.
	 */
2984
	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
D
David Rientjes 已提交
2985
	if (order < MAX_ORDER)
2986 2987 2988 2989
		return order;
	return -ENOSYS;
}
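
/*
 * Worked example (illustrative, assuming 4K pages, reserved == 0, the
 * default slub_min_order/slub_max_order of 0/3, a single possible cpu so
 * that min_objects becomes 4 * (fls(1) + 1) == 8, and a 700 byte object):
 * slab_order() starts at get_order(8 * 700) == 1; an order-1 slab leaves
 * 8192 % 700 == 492 bytes unused, which is below 8192 / 16 == 512, so
 * calculate_order() settles on order 1 with 11 objects per slab.
 */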

2990
static void
2991
init_kmem_cache_node(struct kmem_cache_node *n)
C
Christoph Lameter 已提交
2992 2993 2994 2995
{
	n->nr_partial = 0;
	spin_lock_init(&n->list_lock);
	INIT_LIST_HEAD(&n->partial);
2996
#ifdef CONFIG_SLUB_DEBUG
2997
	atomic_long_set(&n->nr_slabs, 0);
2998
	atomic_long_set(&n->total_objects, 0);
2999
	INIT_LIST_HEAD(&n->full);
3000
#endif
C
Christoph Lameter 已提交
3001 3002
}

3003
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
3004
{
3005
	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
3006
			KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
3007

3008
	/*
3009 3010
	 * Must align to double word boundary for the double cmpxchg
	 * instructions to work; see __pcpu_double_call_return_bool().
3011
	 */
3012 3013
	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
				     2 * sizeof(void *));
3014 3015 3016 3017 3018

	if (!s->cpu_slab)
		return 0;

	init_kmem_cache_cpus(s);
3019

3020
	return 1;
3021 3022
}

3023 3024
static struct kmem_cache *kmem_cache_node;

C
Christoph Lameter 已提交
3025 3026 3027 3028 3029
/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
Z
Zhi Yong Wu 已提交
3030 3031
 * Note that this function only works on the kmem_cache_node
 * when allocating for the kmem_cache_node. This is used for bootstrapping
3032
 * memory on a fresh node that has no slab structures yet.
C
Christoph Lameter 已提交
3033
 */
3034
static void early_kmem_cache_node_alloc(int node)
C
Christoph Lameter 已提交
3035 3036 3037 3038
{
	struct page *page;
	struct kmem_cache_node *n;

3039
	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
C
Christoph Lameter 已提交
3040

3041
	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
C
Christoph Lameter 已提交
3042 3043

	BUG_ON(!page);
3044
	if (page_to_nid(page) != node) {
3045 3046
		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
3047 3048
	}

C
Christoph Lameter 已提交
3049 3050
	n = page->freelist;
	BUG_ON(!n);
3051
	page->freelist = get_freepointer(kmem_cache_node, n);
3052
	page->inuse = 1;
3053
	page->frozen = 0;
3054
	kmem_cache_node->node[node] = n;
3055
#ifdef CONFIG_SLUB_DEBUG
3056
	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
3057
	init_tracking(kmem_cache_node, n);
3058
#endif
3059
	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
3060
	init_kmem_cache_node(n);
3061
	inc_slabs_node(kmem_cache_node, node, page->objects);
C
Christoph Lameter 已提交
3062

3063
	/*
3064 3065
	 * No locks need to be taken here as it has just been
	 * initialized and there is no concurrent access.
3066
	 */
3067
	__add_partial(n, page, DEACTIVATE_TO_HEAD);
C
Christoph Lameter 已提交
3068 3069 3070 3071 3072
}

static void free_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;
C
Christoph Lameter 已提交
3073
	struct kmem_cache_node *n;
C
Christoph Lameter 已提交
3074

C
Christoph Lameter 已提交
3075 3076
	for_each_kmem_cache_node(s, node, n) {
		kmem_cache_free(kmem_cache_node, n);
C
Christoph Lameter 已提交
3077 3078 3079 3080
		s->node[node] = NULL;
	}
}

3081
static int init_kmem_cache_nodes(struct kmem_cache *s)
C
Christoph Lameter 已提交
3082 3083 3084
{
	int node;

C
Christoph Lameter 已提交
3085
	for_each_node_state(node, N_NORMAL_MEMORY) {
C
Christoph Lameter 已提交
3086 3087
		struct kmem_cache_node *n;

3088
		if (slab_state == DOWN) {
3089
			early_kmem_cache_node_alloc(node);
3090 3091
			continue;
		}
3092
		n = kmem_cache_alloc_node(kmem_cache_node,
3093
						GFP_KERNEL, node);
C
Christoph Lameter 已提交
3094

3095 3096 3097
		if (!n) {
			free_kmem_cache_nodes(s);
			return 0;
C
Christoph Lameter 已提交
3098
		}
3099

C
Christoph Lameter 已提交
3100
		s->node[node] = n;
3101
		init_kmem_cache_node(n);
C
Christoph Lameter 已提交
3102 3103 3104 3105
	}
	return 1;
}

3106
static void set_min_partial(struct kmem_cache *s, unsigned long min)
3107 3108 3109 3110 3111 3112 3113 3114
{
	if (min < MIN_PARTIAL)
		min = MIN_PARTIAL;
	else if (min > MAX_PARTIAL)
		min = MAX_PARTIAL;
	s->min_partial = min;
}

C
Christoph Lameter 已提交
3115 3116 3117 3118
/*
 * calculate_sizes() determines the order and the distribution of data within
 * a slab object.
 */
3119
static int calculate_sizes(struct kmem_cache *s, int forced_order)
C
Christoph Lameter 已提交
3120 3121
{
	unsigned long flags = s->flags;
3122
	unsigned long size = s->object_size;
3123
	int order;
C
Christoph Lameter 已提交
3124

3125 3126 3127 3128 3129 3130 3131 3132
	/*
	 * Round up object size to the next word boundary. We can only
	 * place the free pointer at word boundaries and this determines
	 * the possible location of the free pointer.
	 */
	size = ALIGN(size, sizeof(void *));

#ifdef CONFIG_SLUB_DEBUG
C
Christoph Lameter 已提交
3133 3134 3135 3136 3137 3138
	/*
	 * Determine if we can poison the object itself. If the user of
	 * the slab may touch the object after free or before allocation
	 * then we should never poison the object itself.
	 */
	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
3139
			!s->ctor)
C
Christoph Lameter 已提交
3140 3141 3142 3143 3144 3145
		s->flags |= __OBJECT_POISON;
	else
		s->flags &= ~__OBJECT_POISON;


	/*
C
Christoph Lameter 已提交
3146
	 * If we are Redzoning then check if there is some space between the
C
Christoph Lameter 已提交
3147
	 * end of the object and the free pointer. If not then add an
C
Christoph Lameter 已提交
3148
	 * additional word to have some bytes to store Redzone information.
C
Christoph Lameter 已提交
3149
	 */
3150
	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
C
Christoph Lameter 已提交
3151
		size += sizeof(void *);
C
Christoph Lameter 已提交
3152
#endif
C
Christoph Lameter 已提交
3153 3154

	/*
C
Christoph Lameter 已提交
3155 3156
	 * With that we have determined the number of bytes in actual use
	 * by the object. This is the potential offset to the free pointer.
C
Christoph Lameter 已提交
3157 3158 3159 3160
	 */
	s->inuse = size;

	if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
3161
		s->ctor)) {
C
Christoph Lameter 已提交
3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173
		/*
		 * Relocate free pointer after the object if it is not
		 * permitted to overwrite the first word of the object on
		 * kmem_cache_free.
		 *
		 * This is the case if we do RCU, have a constructor or
		 * destructor or are poisoning the objects.
		 */
		s->offset = size;
		size += sizeof(void *);
	}

3174
#ifdef CONFIG_SLUB_DEBUG
C
Christoph Lameter 已提交
3175 3176 3177 3178 3179 3180 3181
	if (flags & SLAB_STORE_USER)
		/*
		 * Need to store information about allocs and frees after
		 * the object.
		 */
		size += 2 * sizeof(struct track);

3182
	if (flags & SLAB_RED_ZONE)
C
Christoph Lameter 已提交
3183 3184 3185 3186
		/*
		 * Add some empty padding so that we can catch
		 * overwrites from earlier objects rather than let
		 * tracking information or the free pointer be
3187
		 * corrupted if a user writes before the start
C
Christoph Lameter 已提交
3188 3189 3190
		 * of the object.
		 */
		size += sizeof(void *);
C
Christoph Lameter 已提交
3191
#endif
C
Christoph Lameter 已提交
3192

C
Christoph Lameter 已提交
3193 3194 3195 3196 3197
	/*
	 * SLUB stores one object immediately after another beginning from
	 * offset 0. In order to align the objects we have to simply size
	 * each object to conform to the alignment.
	 */
3198
	size = ALIGN(size, s->align);
C
Christoph Lameter 已提交
3199
	s->size = size;
3200 3201 3202
	if (forced_order >= 0)
		order = forced_order;
	else
3203
		order = calculate_order(size, s->reserved);
C
Christoph Lameter 已提交
3204

3205
	if (order < 0)
C
Christoph Lameter 已提交
3206 3207
		return 0;

3208
	s->allocflags = 0;
3209
	if (order)
3210 3211 3212
		s->allocflags |= __GFP_COMP;

	if (s->flags & SLAB_CACHE_DMA)
3213
		s->allocflags |= GFP_DMA;
3214 3215 3216 3217

	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		s->allocflags |= __GFP_RECLAIMABLE;

C
Christoph Lameter 已提交
3218 3219 3220
	/*
	 * Determine the number of objects per slab
	 */
3221 3222
	s->oo = oo_make(order, size, s->reserved);
	s->min = oo_make(get_order(size), size, s->reserved);
3223 3224
	if (oo_objects(s->oo) > oo_objects(s->max))
		s->max = s->oo;
C
Christoph Lameter 已提交
3225

3226
	return !!oo_objects(s->oo);
C
Christoph Lameter 已提交
3227 3228
}
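
/*
 * Layout sketch (illustrative, for a 64-bit build): with object_size == 40,
 * align == 8 and SLAB_POISON | SLAB_RED_ZONE | SLAB_STORE_USER set, the
 * steps above give: size = ALIGN(40, 8) = 40, plus one word of red zone
 * (48), inuse = 48, the free pointer is relocated after the object because
 * of poisoning (offset = 48, size = 56), then two struct track records and
 * one word of padding follow, and the final size is rounded up to s->align.
 * The exact byte counts depend on sizeof(struct track) for the particular
 * config, so treat the numbers as an example rather than a guarantee.
 */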

3229
static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
C
Christoph Lameter 已提交
3230
{
3231
	s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3232
	s->reserved = 0;
C
Christoph Lameter 已提交
3233

3234 3235
	if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
		s->reserved = sizeof(struct rcu_head);
C
Christoph Lameter 已提交
3236

3237
	if (!calculate_sizes(s, -1))
C
Christoph Lameter 已提交
3238
		goto error;
3239 3240 3241 3242 3243
	if (disable_higher_order_debug) {
		/*
		 * Disable debugging flags that store metadata if the min slab
		 * order increased.
		 */
3244
		if (get_order(s->size) > get_order(s->object_size)) {
3245 3246 3247 3248 3249 3250
			s->flags &= ~DEBUG_METADATA_FLAGS;
			s->offset = 0;
			if (!calculate_sizes(s, -1))
				goto error;
		}
	}
C
Christoph Lameter 已提交
3251

3252 3253
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3254 3255 3256 3257 3258
	if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
		/* Enable fast mode */
		s->flags |= __CMPXCHG_DOUBLE;
#endif

3259 3260 3261 3262
	/*
	 * The larger the object size is, the more pages we want on the partial
	 * list to avoid pounding the page allocator excessively.
	 */
3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277
	set_min_partial(s, ilog2(s->size) / 2);

	/*
	 * cpu_partial determines the maximum number of objects kept in the
	 * per cpu partial lists of a processor.
	 *
	 * Per cpu partial lists mainly contain slabs that just have one
	 * object freed. If they are used for allocation then they can be
	 * filled up again with minimal effort. The slab will never hit the
	 * per node partial lists and therefore no locking will be required.
	 *
	 * This setting also determines
	 *
	 * A) The number of objects from per cpu partial slabs dumped to the
	 *    per node list when we reach the limit.
3278
	 * B) The number of objects in cpu partial slabs to extract from the
3279 3280
	 *    per node list when we run out of per cpu objects. We only fetch
	 *    50% to keep some capacity around for frees.
3281
	 */
3282
	if (!kmem_cache_has_cpu_partial(s))
3283 3284
		s->cpu_partial = 0;
	else if (s->size >= PAGE_SIZE)
3285 3286 3287 3288 3289 3290 3291 3292
		s->cpu_partial = 2;
	else if (s->size >= 1024)
		s->cpu_partial = 6;
	else if (s->size >= 256)
		s->cpu_partial = 13;
	else
		s->cpu_partial = 30;

C
Christoph Lameter 已提交
3293
#ifdef CONFIG_NUMA
3294
	s->remote_node_defrag_ratio = 1000;
C
Christoph Lameter 已提交
3295
#endif
3296
	if (!init_kmem_cache_nodes(s))
3297
		goto error;
C
Christoph Lameter 已提交
3298

3299
	if (alloc_kmem_cache_cpus(s))
3300
		return 0;
3301

3302
	free_kmem_cache_nodes(s);
C
Christoph Lameter 已提交
3303 3304 3305 3306
error:
	if (flags & SLAB_PANIC)
		panic("Cannot create slab %s size=%lu realsize=%u "
			"order=%u offset=%u flags=%lx\n",
3307 3308
			s->name, (unsigned long)s->size, s->size,
			oo_order(s->oo), s->offset, flags);
3309
	return -EINVAL;
C
Christoph Lameter 已提交
3310 3311
}

3312 3313 3314 3315 3316 3317
static void list_slab_objects(struct kmem_cache *s, struct page *page,
							const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
	void *addr = page_address(page);
	void *p;
N
Namhyung Kim 已提交
3318 3319
	unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
				     sizeof(long), GFP_ATOMIC);
E
Eric Dumazet 已提交
3320 3321
	if (!map)
		return;
3322
	slab_err(s, page, text, s->name);
3323 3324
	slab_lock(page);

3325
	get_map(s, page, map);
3326 3327 3328
	for_each_object(p, s, addr, page->objects) {

		if (!test_bit(slab_index(p, s, addr), map)) {
3329
			pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
3330 3331 3332 3333
			print_tracking(s, p);
		}
	}
	slab_unlock(page);
E
Eric Dumazet 已提交
3334
	kfree(map);
3335 3336 3337
#endif
}

C
Christoph Lameter 已提交
3338
/*
C
Christoph Lameter 已提交
3339
 * Attempt to free all partial slabs on a node.
3340 3341
 * This is called from kmem_cache_close(). We must be the last thread
 * using the cache and therefore we do not need to lock anymore.
C
Christoph Lameter 已提交
3342
 */
C
Christoph Lameter 已提交
3343
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
C
Christoph Lameter 已提交
3344 3345 3346
{
	struct page *page, *h;

3347
	list_for_each_entry_safe(page, h, &n->partial, lru) {
C
Christoph Lameter 已提交
3348
		if (!page->inuse) {
3349
			__remove_partial(n, page);
C
Christoph Lameter 已提交
3350
			discard_slab(s, page);
3351 3352
		} else {
			list_slab_objects(s, page,
3353
			"Objects remaining in %s on kmem_cache_close()");
C
Christoph Lameter 已提交
3354
		}
3355
	}
C
Christoph Lameter 已提交
3356 3357 3358
}

/*
C
Christoph Lameter 已提交
3359
 * Release all resources used by a slab cache.
C
Christoph Lameter 已提交
3360
 */
3361
static inline int kmem_cache_close(struct kmem_cache *s)
C
Christoph Lameter 已提交
3362 3363
{
	int node;
C
Christoph Lameter 已提交
3364
	struct kmem_cache_node *n;
C
Christoph Lameter 已提交
3365 3366 3367

	flush_all(s);
	/* Attempt to free all objects */
C
Christoph Lameter 已提交
3368
	for_each_kmem_cache_node(s, node, n) {
C
Christoph Lameter 已提交
3369 3370
		free_partial(s, n);
		if (n->nr_partial || slabs_node(s, node))
C
Christoph Lameter 已提交
3371 3372
			return 1;
	}
3373
	free_percpu(s->cpu_slab);
C
Christoph Lameter 已提交
3374 3375 3376 3377
	free_kmem_cache_nodes(s);
	return 0;
}

3378
int __kmem_cache_shutdown(struct kmem_cache *s)
C
Christoph Lameter 已提交
3379
{
3380
	return kmem_cache_close(s);
C
Christoph Lameter 已提交
3381 3382 3383 3384 3385 3386 3387 3388
}

/********************************************************************
 *		Kmalloc subsystem
 *******************************************************************/

static int __init setup_slub_min_order(char *str)
{
P
Pekka Enberg 已提交
3389
	get_option(&str, &slub_min_order);
C
Christoph Lameter 已提交
3390 3391 3392 3393 3394 3395 3396 3397

	return 1;
}

__setup("slub_min_order=", setup_slub_min_order);

static int __init setup_slub_max_order(char *str)
{
P
Pekka Enberg 已提交
3398
	get_option(&str, &slub_max_order);
D
David Rientjes 已提交
3399
	slub_max_order = min(slub_max_order, MAX_ORDER - 1);
C
Christoph Lameter 已提交
3400 3401 3402 3403 3404 3405 3406 3407

	return 1;
}

__setup("slub_max_order=", setup_slub_max_order);

static int __init setup_slub_min_objects(char *str)
{
P
Pekka Enberg 已提交
3408
	get_option(&str, &slub_min_objects);
C
Christoph Lameter 已提交
3409 3410 3411 3412 3413 3414 3415 3416

	return 1;
}

__setup("slub_min_objects=", setup_slub_min_objects);

void *__kmalloc(size_t size, gfp_t flags)
{
	struct kmem_cache *s;
	void *ret;

	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
		return kmalloc_large(size, flags);

	s = kmalloc_slab(size, flags);

	if (unlikely(ZERO_OR_NULL_PTR(s)))
		return s;

	ret = slab_alloc(s, flags, _RET_IP_);

	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);

	kasan_kmalloc(s, ret, size);

	return ret;
}
EXPORT_SYMBOL(__kmalloc);

#ifdef CONFIG_NUMA
static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
	struct page *page;
	void *ptr = NULL;

	flags |= __GFP_COMP | __GFP_NOTRACK;
	page = alloc_kmem_pages_node(node, flags, get_order(size));
	if (page)
		ptr = page_address(page);

	kmalloc_large_node_hook(ptr, size, flags);
	return ptr;
}

void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
	struct kmem_cache *s;
	void *ret;

	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
		ret = kmalloc_large_node(size, flags, node);

		trace_kmalloc_node(_RET_IP_, ret,
				   size, PAGE_SIZE << get_order(size),
				   flags, node);

		return ret;
	}

	s = kmalloc_slab(size, flags);

	if (unlikely(ZERO_OR_NULL_PTR(s)))
		return s;

	ret = slab_alloc_node(s, flags, node, _RET_IP_);

	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);

	kasan_kmalloc(s, ret, size);

	return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
#endif

static size_t __ksize(const void *object)
{
	struct page *page;

	if (unlikely(object == ZERO_SIZE_PTR))
		return 0;

	page = virt_to_head_page(object);

	if (unlikely(!PageSlab(page))) {
		WARN_ON(!PageCompound(page));
		return PAGE_SIZE << compound_order(page);
	}

	return slab_ksize(page->slab_cache);
}

size_t ksize(const void *object)
{
	size_t size = __ksize(object);
	/* We assume that ksize callers could use the whole allocated area,
	   so we need to unpoison this area. */
	kasan_krealloc(object, size);
	return size;
}
EXPORT_SYMBOL(ksize);
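/*
 * Illustrative note (editorial, not part of the original source): because
 * kmalloc rounds requests up to the next cache size, ksize() may report
 * more than was asked for; e.g. a hypothetical kmalloc(100, GFP_KERNEL)
 * would typically be served from the kmalloc-128 cache, so ksize() on that
 * pointer would return 128.
 */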

void kfree(const void *x)
{
	struct page *page;
	void *object = (void *)x;

	trace_kfree(_RET_IP_, x);

	if (unlikely(ZERO_OR_NULL_PTR(x)))
		return;

	page = virt_to_head_page(x);
	if (unlikely(!PageSlab(page))) {
		BUG_ON(!PageCompound(page));
		kfree_hook(x);
		__free_kmem_pages(page, compound_order(page));
		return;
	}
	slab_free(page->slab_cache, page, object, _RET_IP_);
}
EXPORT_SYMBOL(kfree);

#define SHRINK_PROMOTE_MAX 32

/*
 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
 * up most to the head of the partial lists. New allocations will then
 * fill those up and thus they can be removed from the partial lists.
 *
 * The slabs with the least items are placed last. This results in them
 * being allocated from last, increasing the chance that the last objects
 * are freed in them.
 */
int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
{
	int node;
	int i;
	struct kmem_cache_node *n;
	struct page *page;
	struct page *t;
	struct list_head discard;
	struct list_head promote[SHRINK_PROMOTE_MAX];
	unsigned long flags;
	int ret = 0;

	if (deactivate) {
		/*
		 * Disable empty slabs caching. Used to avoid pinning offline
		 * memory cgroups by kmem pages that can be freed.
		 */
		s->cpu_partial = 0;
		s->min_partial = 0;

		/*
		 * s->cpu_partial is checked locklessly (see put_cpu_partial),
		 * so we have to make sure the change is visible.
		 */
		kick_all_cpus_sync();
	}

	flush_all(s);
	for_each_kmem_cache_node(s, node, n) {
		INIT_LIST_HEAD(&discard);
		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
			INIT_LIST_HEAD(promote + i);

		spin_lock_irqsave(&n->list_lock, flags);

		/*
		 * Build lists of slabs to discard or promote.
		 *
		 * Note that concurrent frees may occur while we hold the
		 * list_lock. page->inuse here is the upper limit.
		 */
		list_for_each_entry_safe(page, t, &n->partial, lru) {
			int free = page->objects - page->inuse;

			/* Do not reread page->inuse */
			barrier();

			/* We do not keep full slabs on the list */
			BUG_ON(free <= 0);

			if (free == page->objects) {
				list_move(&page->lru, &discard);
				n->nr_partial--;
			} else if (free <= SHRINK_PROMOTE_MAX)
				list_move(&page->lru, promote + free - 1);
		}

		/*
		 * Promote the slabs filled up most to the head of the
		 * partial list.
		 */
		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
			list_splice(promote + i, &n->partial);

		spin_unlock_irqrestore(&n->list_lock, flags);

		/* Release empty slabs */
		list_for_each_entry_safe(page, t, &discard, lru)
			discard_slab(s, page);

		if (slabs_node(s, node))
			ret = 1;
	}

	return ret;
}
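/*
 * Editorial sketch (not from the original source) of how the promote
 * buckets above behave: a partial slab with 3 free objects is moved to
 * promote[2], one with a single free object to promote[0]; splicing the
 * buckets from SHRINK_PROMOTE_MAX - 1 down to 0 then leaves the fullest
 * slabs (fewest free objects) at the head of n->partial.
 */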

static int slab_mem_going_offline_callback(void *arg)
{
	struct kmem_cache *s;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list)
		__kmem_cache_shrink(s, false);
	mutex_unlock(&slab_mutex);

	return 0;
}

static void slab_mem_offline_callback(void *arg)
{
	struct kmem_cache_node *n;
	struct kmem_cache *s;
	struct memory_notify *marg = arg;
	int offline_node;

	offline_node = marg->status_change_nid_normal;

	/*
	 * If the node still has available memory, we still need its
	 * kmem_cache_node, so there is nothing to tear down here.
	 */
	if (offline_node < 0)
		return;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list) {
		n = get_node(s, offline_node);
		if (n) {
			/*
			 * if n->nr_slabs > 0, slabs still exist on the node
			 * that is going down. We were unable to free them,
			 * and offline_pages() function shouldn't call this
			 * callback. So, we must fail.
			 */
			BUG_ON(slabs_node(s, offline_node));

			s->node[offline_node] = NULL;
			kmem_cache_free(kmem_cache_node, n);
		}
	}
	mutex_unlock(&slab_mutex);
}

static int slab_mem_going_online_callback(void *arg)
{
	struct kmem_cache_node *n;
	struct kmem_cache *s;
	struct memory_notify *marg = arg;
	int nid = marg->status_change_nid_normal;
	int ret = 0;

	/*
	 * If the node's memory is already available, then kmem_cache_node is
	 * already created. Nothing to do.
	 */
	if (nid < 0)
		return 0;

	/*
	 * We are bringing a node online. No memory is available yet. We must
	 * allocate a kmem_cache_node structure in order to bring the node
	 * online.
	 */
	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list) {
		/*
		 * XXX: kmem_cache_alloc_node will fallback to other nodes
		 *      since memory is not yet available from the node that
		 *      is brought up.
		 */
		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
		if (!n) {
			ret = -ENOMEM;
			goto out;
		}
		init_kmem_cache_node(n);
		s->node[nid] = n;
	}
out:
	mutex_unlock(&slab_mutex);
	return ret;
}

static int slab_memory_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = slab_mem_going_online_callback(arg);
		break;
	case MEM_GOING_OFFLINE:
		ret = slab_mem_going_offline_callback(arg);
		break;
	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		slab_mem_offline_callback(arg);
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}
	if (ret)
		ret = notifier_from_errno(ret);
	else
		ret = NOTIFY_OK;
	return ret;
}

static struct notifier_block slab_memory_callback_nb = {
	.notifier_call = slab_memory_callback,
	.priority = SLAB_CALLBACK_PRI,
};

/********************************************************************
 *			Basic setup of slabs
 *******************************************************************/

/*
 * Used for early kmem_cache structures that were allocated using
 * the page allocator. Allocate them properly then fix up the pointers
 * that may be pointing to the wrong kmem_cache structure.
 */

static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
	int node;
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
	struct kmem_cache_node *n;

	memcpy(s, static_cache, kmem_cache->object_size);

	/*
	 * This runs very early, and only the boot processor is supposed to be
	 * up.  Even if it weren't true, IRQs are not up so we couldn't fire
	 * IPIs around.
	 */
	__flush_cpu_slab(s, smp_processor_id());
	for_each_kmem_cache_node(s, node, n) {
		struct page *p;

		list_for_each_entry(p, &n->partial, lru)
			p->slab_cache = s;

#ifdef CONFIG_SLUB_DEBUG
		list_for_each_entry(p, &n->full, lru)
			p->slab_cache = s;
#endif
	}
	slab_init_memcg_params(s);
	list_add(&s->list, &slab_caches);
	return s;
}

void __init kmem_cache_init(void)
{
	static __initdata struct kmem_cache boot_kmem_cache,
		boot_kmem_cache_node;

	if (debug_guardpage_minorder())
		slub_max_order = 0;

	kmem_cache_node = &boot_kmem_cache_node;
	kmem_cache = &boot_kmem_cache;

	create_boot_cache(kmem_cache_node, "kmem_cache_node",
		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);

	register_hotmemory_notifier(&slab_memory_callback_nb);

	/* Able to allocate the per node structures */
	slab_state = PARTIAL;

	create_boot_cache(kmem_cache, "kmem_cache",
			offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
		       SLAB_HWCACHE_ALIGN);

	kmem_cache = bootstrap(&boot_kmem_cache);

	/*
	 * Allocate kmem_cache_node properly from the kmem_cache slab.
	 * kmem_cache_node is separately allocated so no need to
	 * update any list pointers.
	 */
	kmem_cache_node = bootstrap(&boot_kmem_cache_node);

	/* Now we can use the kmem_cache to allocate kmalloc slabs */
	setup_kmalloc_cache_index_table();
	create_kmalloc_caches(0);

#ifdef CONFIG_SMP
	register_cpu_notifier(&slab_notifier);
#endif

	pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",
		cache_line_size(),
		slub_min_order, slub_max_order, slub_min_objects,
		nr_cpu_ids, nr_node_ids);
}
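/*
 * Editorial note (not part of the original source): the boot order above
 * matters - kmem_cache_node is set up before kmem_cache itself, because
 * creating any cache needs per-node structures; both static boot caches
 * are then re-allocated for real via bootstrap().
 */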

void __init kmem_cache_init_late(void)
{
}

struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
		   unsigned long flags, void (*ctor)(void *))
{
	struct kmem_cache *s, *c;

	s = find_mergeable(size, align, flags, name, ctor);
	if (s) {
		s->refcount++;

		/*
		 * Adjust the object sizes so that we clear
		 * the complete object on kzalloc.
		 */
		s->object_size = max(s->object_size, (int)size);
		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));

		for_each_memcg_cache(c, s) {
			c->object_size = s->object_size;
			c->inuse = max_t(int, c->inuse,
					 ALIGN(size, sizeof(void *)));
		}

		if (sysfs_slab_alias(s, name)) {
			s->refcount--;
			s = NULL;
		}
	}

	return s;
}

int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
{
	int err;

	err = kmem_cache_open(s, flags);
	if (err)
		return err;

	/* Mutex is not taken during early boot */
	if (slab_state <= UP)
		return 0;

	memcg_propagate_slab_attrs(s);
	err = sysfs_slab_add(s);
	if (err)
		kmem_cache_close(s);

	return err;
}

#ifdef CONFIG_SMP
/*
 * Use the cpu notifier to ensure that the cpu slabs are flushed when
 * necessary.
 */
static int slab_cpuup_callback(struct notifier_block *nfb,
		unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	struct kmem_cache *s;
	unsigned long flags;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		mutex_lock(&slab_mutex);
		list_for_each_entry(s, &slab_caches, list) {
			local_irq_save(flags);
			__flush_cpu_slab(s, cpu);
			local_irq_restore(flags);
		}
		mutex_unlock(&slab_mutex);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block slab_notifier = {
	.notifier_call = slab_cpuup_callback
};

#endif

void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
{
	struct kmem_cache *s;
	void *ret;

	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
		return kmalloc_large(size, gfpflags);

	s = kmalloc_slab(size, gfpflags);

	if (unlikely(ZERO_OR_NULL_PTR(s)))
		return s;

	ret = slab_alloc(s, gfpflags, caller);

	/* Honor the call site pointer we received. */
	trace_kmalloc(caller, ret, size, s->size, gfpflags);

	return ret;
}

#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
					int node, unsigned long caller)
{
	struct kmem_cache *s;
	void *ret;

	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
		ret = kmalloc_large_node(size, gfpflags, node);

		trace_kmalloc_node(caller, ret,
				   size, PAGE_SIZE << get_order(size),
				   gfpflags, node);

		return ret;
	}

	s = kmalloc_slab(size, gfpflags);

	if (unlikely(ZERO_OR_NULL_PTR(s)))
		return s;

	ret = slab_alloc_node(s, gfpflags, node, caller);

	/* Honor the call site pointer we received. */
	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);

	return ret;
}
#endif

#ifdef CONFIG_SYSFS
static int count_inuse(struct page *page)
{
	return page->inuse;
}

static int count_total(struct page *page)
{
	return page->objects;
}
#endif

#ifdef CONFIG_SLUB_DEBUG
static int validate_slab(struct kmem_cache *s, struct page *page,
						unsigned long *map)
{
	void *p;
	void *addr = page_address(page);

	if (!check_slab(s, page) ||
			!on_freelist(s, page, NULL))
		return 0;

	/* Now we know that a valid freelist exists */
	bitmap_zero(map, page->objects);

	get_map(s, page, map);
	for_each_object(p, s, addr, page->objects) {
		if (test_bit(slab_index(p, s, addr), map))
			if (!check_object(s, page, p, SLUB_RED_INACTIVE))
				return 0;
	}

	for_each_object(p, s, addr, page->objects)
		if (!test_bit(slab_index(p, s, addr), map))
			if (!check_object(s, page, p, SLUB_RED_ACTIVE))
				return 0;
	return 1;
}

static void validate_slab_slab(struct kmem_cache *s, struct page *page,
						unsigned long *map)
{
	slab_lock(page);
	validate_slab(s, page, map);
	slab_unlock(page);
}

static int validate_slab_node(struct kmem_cache *s,
		struct kmem_cache_node *n, unsigned long *map)
{
	unsigned long count = 0;
	struct page *page;
	unsigned long flags;

	spin_lock_irqsave(&n->list_lock, flags);

	list_for_each_entry(page, &n->partial, lru) {
		validate_slab_slab(s, page, map);
		count++;
	}
	if (count != n->nr_partial)
		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
		       s->name, count, n->nr_partial);

	if (!(s->flags & SLAB_STORE_USER))
		goto out;

	list_for_each_entry(page, &n->full, lru) {
		validate_slab_slab(s, page, map);
		count++;
	}
	if (count != atomic_long_read(&n->nr_slabs))
		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
		       s->name, count, atomic_long_read(&n->nr_slabs));

out:
	spin_unlock_irqrestore(&n->list_lock, flags);
	return count;
}

static long validate_slab_cache(struct kmem_cache *s)
{
	int node;
	unsigned long count = 0;
	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
				sizeof(unsigned long), GFP_KERNEL);
	struct kmem_cache_node *n;

	if (!map)
		return -ENOMEM;

	flush_all(s);
	for_each_kmem_cache_node(s, node, n)
		count += validate_slab_node(s, n, map);
	kfree(map);
	return count;
}
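/*
 * Illustrative usage (editorial, not part of the original source): with
 * CONFIG_SLUB_DEBUG enabled, the validation above can be triggered from
 * user space through the cache's sysfs directory, e.g.:
 *
 *	echo 1 > /sys/kernel/slab/kmalloc-64/validate
 */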
/*
 * Generate lists of code addresses where slabcache objects are allocated
 * and freed.
 */

struct location {
	unsigned long count;
	unsigned long addr;
	long long sum_time;
	long min_time;
	long max_time;
	long min_pid;
	long max_pid;
	DECLARE_BITMAP(cpus, NR_CPUS);
	nodemask_t nodes;
};

struct loc_track {
	unsigned long max;
	unsigned long count;
	struct location *loc;
};

static void free_loc_track(struct loc_track *t)
{
	if (t->max)
		free_pages((unsigned long)t->loc,
			get_order(sizeof(struct location) * t->max));
}

static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
{
	struct location *l;
	int order;

	order = get_order(sizeof(struct location) * max);

	l = (void *)__get_free_pages(flags, order);
	if (!l)
		return 0;

	if (t->count) {
		memcpy(l, t->loc, sizeof(struct location) * t->count);
		free_loc_track(t);
	}
	t->max = max;
	t->loc = l;
	return 1;
}

static int add_location(struct loc_track *t, struct kmem_cache *s,
				const struct track *track)
{
	long start, end, pos;
	struct location *l;
	unsigned long caddr;
	unsigned long age = jiffies - track->when;

	start = -1;
	end = t->count;

	for ( ; ; ) {
		pos = start + (end - start + 1) / 2;

		/*
		 * There is nothing at "end". If we end up there
		 * we need to add something to before end.
		 */
		if (pos == end)
			break;

		caddr = t->loc[pos].addr;
		if (track->addr == caddr) {

			l = &t->loc[pos];
			l->count++;
			if (track->when) {
				l->sum_time += age;
				if (age < l->min_time)
					l->min_time = age;
				if (age > l->max_time)
					l->max_time = age;

				if (track->pid < l->min_pid)
					l->min_pid = track->pid;
				if (track->pid > l->max_pid)
					l->max_pid = track->pid;

				cpumask_set_cpu(track->cpu,
						to_cpumask(l->cpus));
			}
			node_set(page_to_nid(virt_to_page(track)), l->nodes);
			return 1;
		}

		if (track->addr < caddr)
			end = pos;
		else
			start = pos;
	}

	/*
	 * Not found. Insert new tracking element.
	 */
	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
		return 0;

	l = t->loc + pos;
	if (pos < t->count)
		memmove(l + 1, l,
			(t->count - pos) * sizeof(struct location));
	t->count++;
	l->count = 1;
	l->addr = track->addr;
	l->sum_time = age;
	l->min_time = age;
	l->max_time = age;
	l->min_pid = track->pid;
	l->max_pid = track->pid;
	cpumask_clear(to_cpumask(l->cpus));
	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
	nodes_clear(l->nodes);
	node_set(page_to_nid(virt_to_page(track)), l->nodes);
	return 1;
}

static void process_slab(struct loc_track *t, struct kmem_cache *s,
		struct page *page, enum track_item alloc,
		unsigned long *map)
{
	void *addr = page_address(page);
	void *p;

	bitmap_zero(map, page->objects);
	get_map(s, page, map);

	for_each_object(p, s, addr, page->objects)
		if (!test_bit(slab_index(p, s, addr), map))
			add_location(t, s, get_track(s, p, alloc));
}

static int list_locations(struct kmem_cache *s, char *buf,
					enum track_item alloc)
{
	int len = 0;
	unsigned long i;
	struct loc_track t = { 0, 0, NULL };
	int node;
	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
				     sizeof(unsigned long), GFP_KERNEL);
	struct kmem_cache_node *n;

	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
				     GFP_TEMPORARY)) {
		kfree(map);
		return sprintf(buf, "Out of memory\n");
	}
	/* Push back cpu slabs */
	flush_all(s);

	for_each_kmem_cache_node(s, node, n) {
		unsigned long flags;
		struct page *page;

		if (!atomic_long_read(&n->nr_slabs))
			continue;

		spin_lock_irqsave(&n->list_lock, flags);
		list_for_each_entry(page, &n->partial, lru)
			process_slab(&t, s, page, alloc, map);
		list_for_each_entry(page, &n->full, lru)
			process_slab(&t, s, page, alloc, map);
		spin_unlock_irqrestore(&n->list_lock, flags);
	}

	for (i = 0; i < t.count; i++) {
		struct location *l = &t.loc[i];

		if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
			break;
		len += sprintf(buf + len, "%7ld ", l->count);

		if (l->addr)
			len += sprintf(buf + len, "%pS", (void *)l->addr);
		else
			len += sprintf(buf + len, "<not-available>");

		if (l->sum_time != l->min_time) {
			len += sprintf(buf + len, " age=%ld/%ld/%ld",
				l->min_time,
				(long)div_u64(l->sum_time, l->count),
				l->max_time);
		} else
			len += sprintf(buf + len, " age=%ld",
				l->min_time);

		if (l->min_pid != l->max_pid)
			len += sprintf(buf + len, " pid=%ld-%ld",
				l->min_pid, l->max_pid);
		else
			len += sprintf(buf + len, " pid=%ld",
				l->min_pid);

		if (num_online_cpus() > 1 &&
				!cpumask_empty(to_cpumask(l->cpus)) &&
				len < PAGE_SIZE - 60)
			len += scnprintf(buf + len, PAGE_SIZE - len - 50,
					 " cpus=%*pbl",
					 cpumask_pr_args(to_cpumask(l->cpus)));

		if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
				len < PAGE_SIZE - 60)
			len += scnprintf(buf + len, PAGE_SIZE - len - 50,
					 " nodes=%*pbl",
					 nodemask_pr_args(&l->nodes));

		len += sprintf(buf + len, "\n");
	}

	free_loc_track(&t);
	kfree(map);
	if (!t.count)
		len += sprintf(buf, "No data\n");
	return len;
}
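/*
 * Illustrative output line (editorial, not part of the original source) as
 * produced by the sprintf calls above when reading
 * /sys/kernel/slab/<cache>/alloc_calls; the symbol and numbers below are
 * made up:
 *
 *	   1234 some_driver_alloc+0x20/0x80 age=3/470/2600 pid=1-512 cpus=0-3
 */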
#endif

#ifdef SLUB_RESILIENCY_TEST
static void __init resiliency_test(void)
{
	u8 *p;

	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);

	pr_err("SLUB resiliency testing\n");
	pr_err("-----------------------\n");
	pr_err("A. Corruption after allocation\n");

	p = kzalloc(16, GFP_KERNEL);
	p[16] = 0x12;
	pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
	       p + 16);

	validate_slab_cache(kmalloc_caches[4]);

	/* Hmmm... The next two are dangerous */
	p = kzalloc(32, GFP_KERNEL);
	p[32 + sizeof(void *)] = 0x34;
	pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
	       p);
	pr_err("If allocated object is overwritten then not detectable\n\n");

	validate_slab_cache(kmalloc_caches[5]);
	p = kzalloc(64, GFP_KERNEL);
	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
	*p = 0x56;
	pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
	       p);
	pr_err("If allocated object is overwritten then not detectable\n\n");
	validate_slab_cache(kmalloc_caches[6]);

	pr_err("\nB. Corruption after free\n");
	p = kzalloc(128, GFP_KERNEL);
	kfree(p);
	*p = 0x78;
	pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
	validate_slab_cache(kmalloc_caches[7]);

	p = kzalloc(256, GFP_KERNEL);
	kfree(p);
	p[50] = 0x9a;
	pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
	validate_slab_cache(kmalloc_caches[8]);

	p = kzalloc(512, GFP_KERNEL);
	kfree(p);
	p[512] = 0xab;
	pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
	validate_slab_cache(kmalloc_caches[9]);
}
#else
#ifdef CONFIG_SYSFS
static void resiliency_test(void) {};
#endif
#endif

#ifdef CONFIG_SYSFS
enum slab_stat_type {
	SL_ALL,			/* All slabs */
	SL_PARTIAL,		/* Only partially allocated slabs */
	SL_CPU,			/* Only slabs used for cpu caches */
	SL_OBJECTS,		/* Determine allocated objects not slabs */
	SL_TOTAL		/* Determine object capacity not slabs */
};

#define SO_ALL		(1 << SL_ALL)
#define SO_PARTIAL	(1 << SL_PARTIAL)
#define SO_CPU		(1 << SL_CPU)
#define SO_OBJECTS	(1 << SL_OBJECTS)
#define SO_TOTAL	(1 << SL_TOTAL)

static ssize_t show_slab_objects(struct kmem_cache *s,
			    char *buf, unsigned long flags)
{
	unsigned long total = 0;
	int node;
	int x;
	unsigned long *nodes;

	nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	if (flags & SO_CPU) {
		int cpu;

		for_each_possible_cpu(cpu) {
			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
							       cpu);
			int node;
			struct page *page;

			page = READ_ONCE(c->page);
			if (!page)
				continue;

			node = page_to_nid(page);
			if (flags & SO_TOTAL)
				x = page->objects;
			else if (flags & SO_OBJECTS)
				x = page->inuse;
			else
				x = 1;

			total += x;
			nodes[node] += x;

			page = READ_ONCE(c->partial);
			if (page) {
				node = page_to_nid(page);
				if (flags & SO_TOTAL)
					WARN_ON_ONCE(1);
				else if (flags & SO_OBJECTS)
					WARN_ON_ONCE(1);
				else
					x = page->pages;
				total += x;
				nodes[node] += x;
			}
		}
	}

	get_online_mems();
#ifdef CONFIG_SLUB_DEBUG
	if (flags & SO_ALL) {
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n) {

			if (flags & SO_TOTAL)
				x = atomic_long_read(&n->total_objects);
			else if (flags & SO_OBJECTS)
				x = atomic_long_read(&n->total_objects) -
					count_partial(n, count_free);
			else
				x = atomic_long_read(&n->nr_slabs);
			total += x;
			nodes[node] += x;
		}

	} else
#endif
	if (flags & SO_PARTIAL) {
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n) {
			if (flags & SO_TOTAL)
				x = count_partial(n, count_total);
			else if (flags & SO_OBJECTS)
				x = count_partial(n, count_inuse);
			else
				x = n->nr_partial;
			total += x;
			nodes[node] += x;
		}
	}
	x = sprintf(buf, "%lu", total);
#ifdef CONFIG_NUMA
	for (node = 0; node < nr_node_ids; node++)
		if (nodes[node])
			x += sprintf(buf + x, " N%d=%lu",
					node, nodes[node]);
#endif
	put_online_mems();
	kfree(nodes);
	return x + sprintf(buf + x, "\n");
}

#ifdef CONFIG_SLUB_DEBUG
static int any_slab_objects(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n)
		if (atomic_long_read(&n->total_objects))
			return 1;

	return 0;
}
#endif

#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)

struct slab_attribute {
	struct attribute attr;
	ssize_t (*show)(struct kmem_cache *s, char *buf);
	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};

#define SLAB_ATTR_RO(_name) \
	static struct slab_attribute _name##_attr = \
	__ATTR(_name, 0400, _name##_show, NULL)

#define SLAB_ATTR(_name) \
	static struct slab_attribute _name##_attr =  \
	__ATTR(_name, 0600, _name##_show, _name##_store)
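/*
 * Illustrative expansion (editorial, not part of the original source):
 * SLAB_ATTR(order) defines order_attr wired to order_show()/order_store(),
 * which backs the writable file /sys/kernel/slab/<cache>/order, while
 * SLAB_ATTR_RO creates the read-only variant.
 */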

static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->size);
}
SLAB_ATTR_RO(slab_size);

static ssize_t align_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->align);
}
SLAB_ATTR_RO(align);

static ssize_t object_size_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->object_size);
}
SLAB_ATTR_RO(object_size);

static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", oo_objects(s->oo));
}
SLAB_ATTR_RO(objs_per_slab);

static ssize_t order_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	unsigned long order;
	int err;

	err = kstrtoul(buf, 10, &order);
	if (err)
		return err;

	if (order > slub_max_order || order < slub_min_order)
		return -EINVAL;

	calculate_sizes(s, order);
	return length;
}

static ssize_t order_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", oo_order(s->oo));
}
SLAB_ATTR(order);

static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%lu\n", s->min_partial);
}

static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
				 size_t length)
{
	unsigned long min;
	int err;

	err = kstrtoul(buf, 10, &min);
	if (err)
		return err;

	set_min_partial(s, min);
	return length;
}
SLAB_ATTR(min_partial);

static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%u\n", s->cpu_partial);
}

static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
				 size_t length)
{
	unsigned long objects;
	int err;

	err = kstrtoul(buf, 10, &objects);
	if (err)
		return err;
	if (objects && !kmem_cache_has_cpu_partial(s))
		return -EINVAL;

	s->cpu_partial = objects;
	flush_all(s);
	return length;
}
SLAB_ATTR(cpu_partial);

static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
	if (!s->ctor)
		return 0;
	return sprintf(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);

static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
}
SLAB_ATTR_RO(aliases);

static ssize_t partial_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_PARTIAL);
}
SLAB_ATTR_RO(partial);

static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_CPU);
}
SLAB_ATTR_RO(cpu_slabs);

static ssize_t objects_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects);

static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects_partial);

static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
{
	int objects = 0;
	int pages = 0;
	int cpu;
	int len;

	for_each_online_cpu(cpu) {
		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;

		if (page) {
			pages += page->pages;
			objects += page->pobjects;
		}
	}

	len = sprintf(buf, "%d(%d)", objects, pages);

#ifdef CONFIG_SMP
	for_each_online_cpu(cpu) {
		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;

		if (page && len < PAGE_SIZE - 20)
			len += sprintf(buf + len, " C%d=%d(%d)", cpu,
				page->pobjects, page->pages);
	}
#endif
	return len + sprintf(buf + len, "\n");
}
SLAB_ATTR_RO(slabs_cpu_partial);

static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}

static ssize_t reclaim_account_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
	if (buf[0] == '1')
		s->flags |= SLAB_RECLAIM_ACCOUNT;
	return length;
}
SLAB_ATTR(reclaim_account);

static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
}
SLAB_ATTR_RO(hwcache_align);

#ifdef CONFIG_ZONE_DMA
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
}
SLAB_ATTR_RO(cache_dma);
#endif

static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
}
SLAB_ATTR_RO(destroy_by_rcu);

static ssize_t reserved_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->reserved);
}
SLAB_ATTR_RO(reserved);

#ifdef CONFIG_SLUB_DEBUG
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL);
}
SLAB_ATTR_RO(slabs);

static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
}
SLAB_ATTR_RO(total_objects);

static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
}

static ssize_t sanity_checks_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	s->flags &= ~SLAB_DEBUG_FREE;
	if (buf[0] == '1') {
		s->flags &= ~__CMPXCHG_DOUBLE;
		s->flags |= SLAB_DEBUG_FREE;
	}
	return length;
}
SLAB_ATTR(sanity_checks);

static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}

static ssize_t trace_store(struct kmem_cache *s, const char *buf,
							size_t length)
{
	/*
	 * Tracing a merged cache is going to give confusing results
	 * as well as cause other issues like converting a mergeable
	 * cache into an unmergeable one.
	 */
	if (s->refcount > 1)
		return -EINVAL;

	s->flags &= ~SLAB_TRACE;
	if (buf[0] == '1') {
		s->flags &= ~__CMPXCHG_DOUBLE;
		s->flags |= SLAB_TRACE;
	}
	return length;
}
SLAB_ATTR(trace);

static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}

static ssize_t red_zone_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	if (any_slab_objects(s))
		return -EBUSY;

	s->flags &= ~SLAB_RED_ZONE;
	if (buf[0] == '1') {
		s->flags &= ~__CMPXCHG_DOUBLE;
		s->flags |= SLAB_RED_ZONE;
	}
	calculate_sizes(s, -1);
	return length;
}
SLAB_ATTR(red_zone);

static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
}

static ssize_t poison_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	if (any_slab_objects(s))
		return -EBUSY;

	s->flags &= ~SLAB_POISON;
	if (buf[0] == '1') {
		s->flags &= ~__CMPXCHG_DOUBLE;
		s->flags |= SLAB_POISON;
	}
	calculate_sizes(s, -1);
	return length;
}
SLAB_ATTR(poison);

static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}

static ssize_t store_user_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	if (any_slab_objects(s))
		return -EBUSY;

	s->flags &= ~SLAB_STORE_USER;
	if (buf[0] == '1') {
		s->flags &= ~__CMPXCHG_DOUBLE;
		s->flags |= SLAB_STORE_USER;
	}
	calculate_sizes(s, -1);
	return length;
}
SLAB_ATTR(store_user);

static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

static ssize_t validate_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	int ret = -EINVAL;

	if (buf[0] == '1') {
		ret = validate_slab_cache(s);
		if (ret >= 0)
			ret = length;
	}
	return ret;
}
SLAB_ATTR(validate);

static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
{
	if (!(s->flags & SLAB_STORE_USER))
		return -ENOSYS;
	return list_locations(s, buf, TRACK_ALLOC);
}
SLAB_ATTR_RO(alloc_calls);

static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
{
	if (!(s->flags & SLAB_STORE_USER))
		return -ENOSYS;
	return list_locations(s, buf, TRACK_FREE);
}
SLAB_ATTR_RO(free_calls);
#endif /* CONFIG_SLUB_DEBUG */

#ifdef CONFIG_FAILSLAB
static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}

static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
							size_t length)
{
	if (s->refcount > 1)
		return -EINVAL;

	s->flags &= ~SLAB_FAILSLAB;
	if (buf[0] == '1')
		s->flags |= SLAB_FAILSLAB;
	return length;
}
SLAB_ATTR(failslab);
#endif

static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

static ssize_t shrink_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	if (buf[0] == '1')
		kmem_cache_shrink(s);
	else
		return -EINVAL;
	return length;
}
SLAB_ATTR(shrink);
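/*
 * Illustrative usage (editorial, not part of the original source): writing
 * 1 to the shrink attribute asks SLUB to drop empty slabs for that cache,
 * e.g.:
 *
 *	echo 1 > /sys/kernel/slab/dentry/shrink
 */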

#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
}

static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	unsigned long ratio;
	int err;

	err = kstrtoul(buf, 10, &ratio);
	if (err)
		return err;

	if (ratio <= 100)
		s->remote_node_defrag_ratio = ratio * 10;

	return length;
}
SLAB_ATTR(remote_node_defrag_ratio);
#endif

#ifdef CONFIG_SLUB_STATS
static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
{
	unsigned long sum  = 0;
	int cpu;
	int len;
	int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);

	if (!data)
		return -ENOMEM;

	for_each_online_cpu(cpu) {
		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];

		data[cpu] = x;
		sum += x;
	}

	len = sprintf(buf, "%lu", sum);

#ifdef CONFIG_SMP
	for_each_online_cpu(cpu) {
		if (data[cpu] && len < PAGE_SIZE - 20)
			len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
	}
#endif
	kfree(data);
	return len + sprintf(buf + len, "\n");
}

static void clear_stat(struct kmem_cache *s, enum stat_item si)
{
	int cpu;

	for_each_online_cpu(cpu)
		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
}

#define STAT_ATTR(si, text) 					\
static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
{								\
	return show_stat(s, buf, si);				\
}								\
static ssize_t text##_store(struct kmem_cache *s,		\
				const char *buf, size_t length)	\
{								\
	if (buf[0] != '0')					\
		return -EINVAL;					\
	clear_stat(s, si);					\
	return length;						\
}								\
SLAB_ATTR(text);						\

STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif
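/*
 * Illustrative usage (editorial, not part of the original source): with
 * CONFIG_SLUB_STATS enabled, each counter above becomes a sysfs file whose
 * first field is the total followed by non-zero per-cpu counts, e.g.
 * "cat /sys/kernel/slab/kmalloc-64/alloc_fastpath" might print something
 * like "102934 C0=51002 C1=51932"; writing 0 to the file clears the counter.
 */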

static struct attribute *slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&min_partial_attr.attr,
	&cpu_partial_attr.attr,
	&objects_attr.attr,
	&objects_partial_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&shrink_attr.attr,
	&reserved_attr.attr,
	&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
	&total_objects_attr.attr,
	&slabs_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
	&alloc_calls_attr.attr,
	&free_calls_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
	&alloc_fastpath_attr.attr,
	&alloc_slowpath_attr.attr,
	&free_fastpath_attr.attr,
	&free_slowpath_attr.attr,
	&free_frozen_attr.attr,
	&free_add_partial_attr.attr,
	&free_remove_partial_attr.attr,
	&alloc_from_partial_attr.attr,
	&alloc_slab_attr.attr,
	&alloc_refill_attr.attr,
	&alloc_node_mismatch_attr.attr,
	&free_slab_attr.attr,
	&cpuslab_flush_attr.attr,
	&deactivate_full_attr.attr,
	&deactivate_empty_attr.attr,
	&deactivate_to_head_attr.attr,
	&deactivate_to_tail_attr.attr,
	&deactivate_remote_frees_attr.attr,
	&deactivate_bypass_attr.attr,
	&order_fallback_attr.attr,
	&cmpxchg_double_fail_attr.attr,
	&cmpxchg_double_cpu_fail_attr.attr,
	&cpu_partial_alloc_attr.attr,
	&cpu_partial_free_attr.attr,
	&cpu_partial_node_attr.attr,
	&cpu_partial_drain_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
	&failslab_attr.attr,
#endif

	NULL
};

static struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};

static ssize_t slab_attr_show(struct kobject *kobj,
				struct attribute *attr,
				char *buf)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;
	int err;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->show)
		return -EIO;

	err = attribute->show(s, buf);

	return err;
}

static ssize_t slab_attr_store(struct kobject *kobj,
				struct attribute *attr,
				const char *buf, size_t len)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;
	int err;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->store)
		return -EIO;

	err = attribute->store(s, buf, len);
#ifdef CONFIG_MEMCG_KMEM
	if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
		struct kmem_cache *c;

		mutex_lock(&slab_mutex);
		if (s->max_attr_size < len)
			s->max_attr_size = len;

		/*
		 * This is a best effort propagation, so this function's return
		 * value will be determined by the parent cache only. This is
		 * basically because not all attributes will have a well
		 * defined semantics for rollbacks - most of the actions will
		 * have permanent effects.
		 *
		 * Returning the error value of any of the children that fail
		 * is not 100 % defined, in the sense that users seeing the
		 * error code won't be able to know anything about the state of
		 * the cache.
		 *
		 * Only returning the error code for the parent cache at least
		 * has well defined semantics. The cache being written to
		 * directly either failed or succeeded, in which case we loop
		 * through the descendants with best-effort propagation.
		 */
		for_each_memcg_cache(c, s)
			attribute->store(c, buf, len);
		mutex_unlock(&slab_mutex);
	}
#endif
	return err;
}

static void memcg_propagate_slab_attrs(struct kmem_cache *s)
{
#ifdef CONFIG_MEMCG_KMEM
	int i;
	char *buffer = NULL;
	struct kmem_cache *root_cache;

	if (is_root_cache(s))
		return;

	root_cache = s->memcg_params.root_cache;

	/*
	 * This means this cache had no attribute written. Therefore, no point
	 * in copying default values around
	 */
	if (!root_cache->max_attr_size)
		return;

	for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
		char mbuf[64];
		char *buf;
		struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);

		if (!attr || !attr->store || !attr->show)
			continue;

		/*
		 * It is really bad that we have to allocate here, so we will
		 * do it only as a fallback. If we actually allocate, though,
		 * we can just use the allocated buffer until the end.
		 *
		 * Most of the slub attributes will tend to be very small in
		 * size, but sysfs allows buffers up to a page, so they can
		 * theoretically happen.
		 */
		if (buffer)
			buf = buffer;
		else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
			buf = mbuf;
		else {
			buffer = (char *) get_zeroed_page(GFP_KERNEL);
			if (WARN_ON(!buffer))
				continue;
			buf = buffer;
		}

		attr->show(root_cache, buf);
		attr->store(s, buf, strlen(buf));
	}

	if (buffer)
		free_page((unsigned long)buffer);
#endif
}

static void kmem_cache_release(struct kobject *k)
{
	slab_kmem_cache_release(to_slab(k));
}

static const struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};

static struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
	.release = kmem_cache_release,
};

static int uevent_filter(struct kset *kset, struct kobject *kobj)
{
	struct kobj_type *ktype = get_ktype(kobj);

	if (ktype == &slab_ktype)
		return 1;
	return 0;
}

static const struct kset_uevent_ops slab_uevent_ops = {
	.filter = uevent_filter,
};

static struct kset *slab_kset;

static inline struct kset *cache_kset(struct kmem_cache *s)
{
#ifdef CONFIG_MEMCG_KMEM
	if (!is_root_cache(s))
		return s->memcg_params.root_cache->memcg_kset;
#endif
	return slab_kset;
}

#define ID_STR_LENGTH 64

/* Create a unique string id for a slab cache:
 *
 * Format	:[flags-]size
 */
static char *create_unique_id(struct kmem_cache *s)
{
	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
	char *p = name;

	BUG_ON(!name);

	*p++ = ':';
	/*
	 * First flags affecting slabcache operations. We will only
	 * get here for aliasable slabs so we do not need to support
	 * too many flags. The flags here must cover all flags that
	 * are matched during merging to guarantee that the id is
	 * unique.
	 */
	if (s->flags & SLAB_CACHE_DMA)
		*p++ = 'd';
	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		*p++ = 'a';
	if (s->flags & SLAB_DEBUG_FREE)
		*p++ = 'F';
	if (!(s->flags & SLAB_NOTRACK))
		*p++ = 't';
	if (p != name + 1)
		*p++ = '-';
	p += sprintf(p, "%07d", s->size);

	BUG_ON(p > name + ID_STR_LENGTH - 1);
	return name;
}

static int sysfs_slab_add(struct kmem_cache *s)
{
	int err;
	const char *name;
	int unmergeable = slab_unmergeable(s);

	if (unmergeable) {
		/*
		 * Slabcache can never be merged so we can use the name proper.
		 * This is typically the case for debug situations. In that
		 * case we can catch duplicate names easily.
		 */
		sysfs_remove_link(&slab_kset->kobj, s->name);
		name = s->name;
	} else {
		/*
		 * Create a unique name for the slab as a target
		 * for the symlinks.
		 */
		name = create_unique_id(s);
	}

	s->kobj.kset = cache_kset(s);
	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
	if (err)
		goto out;

	err = sysfs_create_group(&s->kobj, &slab_attr_group);
	if (err)
		goto out_del_kobj;

#ifdef CONFIG_MEMCG_KMEM
	if (is_root_cache(s)) {
		s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
		if (!s->memcg_kset) {
			err = -ENOMEM;
			goto out_del_kobj;
		}
	}
#endif

	kobject_uevent(&s->kobj, KOBJ_ADD);
	if (!unmergeable) {
		/* Setup first alias */
		sysfs_slab_alias(s, s->name);
	}
out:
	if (!unmergeable)
		kfree(name);
	return err;
out_del_kobj:
	kobject_del(&s->kobj);
	goto out;
}
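/*
 * Illustrative example (editorial, not part of the original source): for a
 * mergeable 192-byte cache, create_unique_id() above produces an id string
 * such as ":dat-0000192" - the exact flag letters depend on which cache
 * flags are set - and that string names the target of the aliases under
 * /sys/kernel/slab.
 */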

void sysfs_slab_remove(struct kmem_cache *s)
{
	if (slab_state < FULL)
		/*
		 * Sysfs has not been setup yet so no need to remove the
		 * cache from sysfs.
		 */
		return;

#ifdef CONFIG_MEMCG_KMEM
	kset_unregister(s->memcg_kset);
#endif
	kobject_uevent(&s->kobj, KOBJ_REMOVE);
	kobject_del(&s->kobj);
	kobject_put(&s->kobj);
}

/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
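/*
 * Rough flow, as implemented below: sysfs_slab_alias() queues a
 * saved_alias entry while slab_state is not yet FULL; slab_sysfs_init()
 * later drains alias_list and creates the real sysfs links.
 */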
struct saved_alias {
	struct kmem_cache *s;
	const char *name;
	struct saved_alias *next;
};

static struct saved_alias *alias_list;

static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
	struct saved_alias *al;

	if (slab_state == FULL) {
		/*
		 * If we have a leftover link then remove it.
		 */
		sysfs_remove_link(&slab_kset->kobj, name);
		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
	}

	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
	if (!al)
		return -ENOMEM;

	al->s = s;
	al->name = name;
	al->next = alias_list;
	alias_list = al;
	return 0;
}

static int __init slab_sysfs_init(void)
{
	struct kmem_cache *s;
	int err;

	mutex_lock(&slab_mutex);

	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
	if (!slab_kset) {
		mutex_unlock(&slab_mutex);
		pr_err("Cannot register slab subsystem.\n");
		return -ENOSYS;
	}

	slab_state = FULL;

	list_for_each_entry(s, &slab_caches, list) {
		err = sysfs_slab_add(s);
		if (err)
			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
			       s->name);
	}

	while (alias_list) {
		struct saved_alias *al = alias_list;

		alias_list = alias_list->next;
		err = sysfs_slab_alias(al->s, al->name);
		if (err)
			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
			       al->name);
		kfree(al);
	}

	mutex_unlock(&slab_mutex);
	resiliency_test();
	return 0;
}

__initcall(slab_sysfs_init);
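/*
 * slab_sysfs_init() runs as a plain initcall, i.e. only once sysfs is
 * up, which is why boot-time caches and their aliases are replayed from
 * slab_caches and alias_list above instead of being registered as they
 * are created.
 */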
#endif /* CONFIG_SYSFS */

/*
 * The /proc/slabinfo ABI
 */
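/*
 * A /proc/slabinfo line assembled from the fields filled in below looks
 * roughly like this (values are made up for illustration):
 *
 * kmalloc-192  2040 2079 192 21 1 : tunables 0 0 0 : slabdata 99 99 0
 */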
#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
	unsigned long nr_slabs = 0;
	unsigned long nr_objs = 0;
	unsigned long nr_free = 0;
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n) {
		nr_slabs += node_nr_slabs(n);
		nr_objs += node_nr_objs(n);
		nr_free += count_partial(n, count_free);
	}

	sinfo->active_objs = nr_objs - nr_free;
	sinfo->num_objs = nr_objs;
	sinfo->active_slabs = nr_slabs;
	sinfo->num_slabs = nr_slabs;
	sinfo->objects_per_slab = oo_objects(s->oo);
	sinfo->cache_order = oo_order(s->oo);
}

/*
 * SLUB exposes its per-cache statistics via sysfs, so nothing extra is
 * appended to the /proc/slabinfo line here.
 */
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
{
}

/*
 * Writes to /proc/slabinfo tune SLAB-style limits that SLUB does not
 * have, so reject them.
 */
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
		       size_t count, loff_t *ppos)
{
	return -EIO;
}
#endif /* CONFIG_SLABINFO */