// SPDX-License-Identifier: GPL-2.0
/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
#include <linux/random.h>

#include <trace/events/kmem.h>

#include "internal.h"

/*
 * Lock order:
 *   1. slab_mutex (Global Mutex)
 *   2. node->list_lock
 *   3. slab_lock(page) (Only on some arches and for debugging)
 *
 *   slab_mutex
 *
 *   The role of the slab_mutex is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *
 *   The slab_lock is only used for debugging and on arches that do not
 *   have the ability to do a cmpxchg_double. It only protects:
 *	A. page->freelist	-> List of free objects in a page
 *	B. page->inuse		-> Number of objects in use
 *	C. page->objects	-> Number of objects in page
 *	D. page->frozen		-> frozen state
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list except the per cpu partial list. The processor that froze the
 *   slab is the one who can perform list operations on the page. Other
 *   processors may put objects onto the freelist but the processor that
 *   froze the slab is the only one that can retrieve the objects from the
 *   page's freelist.
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no new slabs may be added or
 *   removed from the lists, nor may the number of partial slabs be modified.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. F.e.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *   Interrupts are disabled during allocation and deallocation in order to
 *   make the slab allocator safe to use in the context of an irq. In addition
 *   interrupts are disabled to ensure that the processor does not change
 *   while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocators per cpu caches for
 * fast frees and allocs.
 *
 * page->frozen		The slab is frozen and exempt from list processing.
 * 			This means that the slab is dedicated to a purpose
 * 			such as satisfying allocations for a specific
 * 			processor. Objects may be freed in the slab while
 * 			it is frozen but slab_free will then skip the usual
 * 			list operations. It is up to the processor holding
 * 			the slab to integrate the slab into the slab lists
 * 			when the slab is no longer needed.
 *
 * 			One use of this flag is to mark slabs that are
 * 			used for allocations. Then such a slab becomes a cpu
 * 			slab. The cpu slab may be equipped with an additional
 * 			freelist that allows lockless access to
 * 			free objects in addition to the regular freelist
 * 			that requires the slab lock.
 *
 * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
 * 			options set. This moves slab handling out of
 * 			the fast path and disables lockless freelists.
 */

#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif

static inline bool kmem_cache_debug(struct kmem_cache *s)
{
	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}

void *fixup_red_left(struct kmem_cache *s, void *p)
{
	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
		p += s->red_left_pad;

	return p;
}

static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	return !kmem_cache_debug(s);
#else
	return false;
#endif
}

/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * These debug flags cannot use CMPXCHG because there might be consistency
 * issues when checking or reading debug information
 */
#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
				SLAB_TRACE)


/*
 * Debugging flags that require metadata to be stored in the slab.  These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since page.objects is u15 */

/* Internal SLUB flags */
/* Poison object */
#define __OBJECT_POISON		((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */
#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)

/*
 * Tracking user of a slab.
 */
#define TRACK_ADDRS_COUNT 16
struct track {
	unsigned long addr;	/* Called from address */
#ifdef CONFIG_STACKTRACE
	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
#endif
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

enum track_item { TRACK_ALLOC, TRACK_FREE };

#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
#endif

static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	/*
	 * The rmw is racy on a preemptible kernel but this is acceptable, so
	 * avoid this_cpu_add()'s irq-disable overhead.
	 */
	raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}

/********************************************************************
 * 			Core slab cache functions
 *******************************************************************/

/*
 * Returns freelist pointer (ptr). With hardening, this is obfuscated
 * with an XOR of the address where the pointer is held and a per-cache
 * random number.
 */
static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
				 unsigned long ptr_addr)
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	/*
	 * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
	 * Normally, this doesn't cause any issues, as both set_freepointer()
	 * and get_freepointer() are called with a pointer with the same tag.
	 * However, there are some issues with CONFIG_SLUB_DEBUG code. For
	 * example, when __free_slab() iterates over objects in a cache, it
	 * passes untagged pointers to check_object(). check_object() in turn
	 * calls get_freepointer() with an untagged pointer, which causes the
	 * freepointer to be restored incorrectly.
	 */
	return (void *)((unsigned long)ptr ^ s->random ^
			swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
#else
	return ptr;
#endif
}

/* Returns the freelist pointer recorded at location ptr_addr. */
static inline void *freelist_dereference(const struct kmem_cache *s,
					 void *ptr_addr)
{
	return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
			    (unsigned long)ptr_addr);
}

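/*
 * Illustrative sketch (not part of upstream slub.c): with
 * CONFIG_SLAB_FREELIST_HARDENED, storing and loading a free pointer are
 * symmetric XOR operations, so encoding twice with the same slot address
 * round-trips back to the original pointer:
 *
 *	void *slot = object + s->offset;	/* where the pointer lives */
 *	void *stored = freelist_ptr(s, next, (unsigned long)slot);
 *	void *decoded = freelist_ptr(s, stored, (unsigned long)slot);
 *	// decoded == next, while "stored" alone leaks neither the heap
 *	// address nor s->random.
 */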
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	object = kasan_reset_tag(object);
	return freelist_dereference(s, object + s->offset);
}

static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
	prefetch(object + s->offset);
}

static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
	unsigned long freepointer_addr;
	void *p;

	if (!debug_pagealloc_enabled_static())
		return get_freepointer(s, object);

	freepointer_addr = (unsigned long)object + s->offset;
	copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
	return freelist_ptr(s, p, freepointer_addr);
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	unsigned long freeptr_addr = (unsigned long)object + s->offset;

#ifdef CONFIG_SLAB_FREELIST_HARDENED
	BUG_ON(object == fp); /* naive detection of double free or corruption */
#endif

	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
	*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}

/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = fixup_red_left(__s, __addr); \
		__p < (__addr) + (__objects) * (__s)->size; \
		__p += (__s)->size)

static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
	return ((unsigned int)PAGE_SIZE << order) / size;
}

static inline struct kmem_cache_order_objects oo_make(unsigned int order,
		unsigned int size)
{
	struct kmem_cache_order_objects x = {
		(order << OO_SHIFT) + order_objects(order, size)
	};

	return x;
}

static inline unsigned int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}

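/*
 * Worked example (illustrative, not part of upstream slub.c): the page order
 * and the object count are packed into one word, with the order above
 * OO_SHIFT and the count in the low OO_MASK bits. For a hypothetical cache
 * with 256-byte objects on 4KiB pages at order 1:
 *
 *	struct kmem_cache_order_objects oo = oo_make(1, 256);
 *	// order_objects(1, 256) == (4096 << 1) / 256 == 32
 *	// oo.x == (1 << 16) + 32
 *	// oo_order(oo) == 1, oo_objects(oo) == 32
 */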
/*
 * Per slab locking using the pagelock
 */
static __always_inline void slab_lock(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);
	bit_spin_lock(PG_locked, &page->flags);
}

static __always_inline void slab_unlock(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);
	__bit_spin_unlock(PG_locked, &page->flags);
}

/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
	VM_BUG_ON(!irqs_disabled());
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&page->freelist, &page->counters,
				   freelist_old, counters_old,
				   freelist_new, counters_new))
			return true;
	} else
#endif
	{
		slab_lock(page);
		if (page->freelist == freelist_old &&
					page->counters == counters_old) {
			page->freelist = freelist_new;
			page->counters = counters_new;
			slab_unlock(page);
			return true;
		}
		slab_unlock(page);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}

static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&page->freelist, &page->counters,
				   freelist_old, counters_old,
				   freelist_new, counters_new))
			return true;
	} else
#endif
	{
		unsigned long flags;

		local_irq_save(flags);
		slab_lock(page);
		if (page->freelist == freelist_old &&
					page->counters == counters_old) {
			page->freelist = freelist_new;
			page->counters = counters_new;
			slab_unlock(page);
			local_irq_restore(flags);
			return true;
		}
		slab_unlock(page);
		local_irq_restore(flags);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}

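/*
 * Illustrative usage sketch (not part of upstream slub.c): callers read
 * page->freelist and page->counters, build new values, and retry until the
 * pair is swapped atomically:
 *
 *	do {
 *		old_freelist = page->freelist;
 *		old_counters = page->counters;
 *		// ... compute new_freelist / new_counters ...
 *	} while (!cmpxchg_double_slab(s, page, old_freelist, old_counters,
 *				      new_freelist, new_counters, "example"));
 *
 * A failure means another CPU changed the pair in the meantime, so the
 * caller must re-read and recompute before retrying.
 */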
#ifdef CONFIG_SLUB_DEBUG
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);

/*
 * Determine a map of objects in use on a page.
 *
 * Node listlock must be held to guarantee that the page does
 * not vanish from under us.
 */
static unsigned long *get_map(struct kmem_cache *s, struct page *page)
	__acquires(&object_map_lock)
{
	void *p;
	void *addr = page_address(page);

	VM_BUG_ON(!irqs_disabled());

	spin_lock(&object_map_lock);

	bitmap_zero(object_map, page->objects);

	for (p = page->freelist; p; p = get_freepointer(s, p))
		set_bit(__obj_to_index(s, addr, p), object_map);

	return object_map;
}

static void put_map(unsigned long *map) __releases(&object_map_lock)
{
	VM_BUG_ON(map != object_map);
	spin_unlock(&object_map_lock);
}

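/*
 * Illustrative usage sketch (not part of upstream slub.c): debug code walks
 * all objects in a slab and skips the free ones by combining get_map() with
 * for_each_object(). The map has a bit set for every object that sits on
 * the page's freelist:
 *
 *	unsigned long *map = get_map(s, page);
 *	void *addr = page_address(page);
 *	void *p;
 *
 *	for_each_object(p, s, addr, page->objects)
 *		if (!test_bit(__obj_to_index(s, addr, p), map))
 *			; // "p" is currently allocated
 *	put_map(map);
 */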
static inline unsigned int size_from_object(struct kmem_cache *s)
{
	if (s->flags & SLAB_RED_ZONE)
		return s->size - s->red_left_pad;

	return s->size;
}

static inline void *restore_red_left(struct kmem_cache *s, void *p)
{
	if (s->flags & SLAB_RED_ZONE)
		p -= s->red_left_pad;

	return p;
}

/*
 * Debug settings:
 */
#if defined(CONFIG_SLUB_DEBUG_ON)
static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static slab_flags_t slub_debug;
#endif

static char *slub_debug_string;
static int disable_higher_order_debug;

/*
 * slub is about to manipulate internal object metadata.  This memory lies
 * outside the range of the allocated object, so accessing it would normally
 * be reported by kasan as a bounds error.  metadata_access_enable() is used
 * to tell kasan that these accesses are OK.
 */
static inline void metadata_access_enable(void)
{
	kasan_disable_current();
}

static inline void metadata_access_disable(void)
{
	kasan_enable_current();
}

/*
 * Object debugging
 */

/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
				struct page *page, void *object)
{
	void *base;

	if (!object)
		return 1;

	base = page_address(page);
	object = kasan_reset_tag(object);
	object = restore_red_left(s, object);
	if (object < base || object >= base + page->objects * s->size ||
		(object - base) % s->size) {
		return 0;
	}

	return 1;
}

static void print_section(char *level, char *text, u8 *addr,
			  unsigned int length)
{
	metadata_access_enable();
	print_hex_dump(level, kasan_reset_tag(text), DUMP_PREFIX_ADDRESS,
			16, 1, addr, length, 1);
	metadata_access_disable();
}

/*
 * See comment in calculate_sizes().
 */
static inline bool freeptr_outside_object(struct kmem_cache *s)
{
	return s->offset >= s->inuse;
}

/*
 * Return offset of the end of info block which is inuse + free pointer if
 * not overlapping with object.
 */
static inline unsigned int get_info_end(struct kmem_cache *s)
{
	if (freeptr_outside_object(s))
		return s->inuse + sizeof(void *);
	else
		return s->inuse;
}

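/*
 * Layout sketch (illustrative, not part of upstream slub.c): when the free
 * pointer cannot share space with the object (freeptr_outside_object()),
 * the debug metadata starts right after it; otherwise it starts at s->inuse:
 *
 *	unsigned int off = get_info_end(s);
 *	// off == s->inuse                  if the free pointer overlays the object
 *	// off == s->inuse + sizeof(void *) if it is stored outside the object
 *	// struct track records for SLAB_STORE_USER begin at "object + off"
 */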
static struct track *get_track(struct kmem_cache *s, void *object,
	enum track_item alloc)
{
	struct track *p;

	p = object + get_info_end(s);

	return kasan_reset_tag(p + alloc);
}

static void set_track(struct kmem_cache *s, void *object,
			enum track_item alloc, unsigned long addr)
{
	struct track *p = get_track(s, object, alloc);

	if (addr) {
#ifdef CONFIG_STACKTRACE
		unsigned int nr_entries;

		metadata_access_enable();
		nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
					      TRACK_ADDRS_COUNT, 3);
		metadata_access_disable();

		if (nr_entries < TRACK_ADDRS_COUNT)
			p->addrs[nr_entries] = 0;
#endif
		p->addr = addr;
		p->cpu = smp_processor_id();
		p->pid = current->pid;
		p->when = jiffies;
	} else {
		memset(p, 0, sizeof(struct track));
	}
}

static void init_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	set_track(s, object, TRACK_FREE, 0UL);
	set_track(s, object, TRACK_ALLOC, 0UL);
}

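/*
 * Illustrative sketch (not part of upstream slub.c): the two struct track
 * records live back to back after get_info_end(s), indexed by the
 * track_item enum, so reading both for one object looks like:
 *
 *	struct track *alloc_trk = get_track(s, object, TRACK_ALLOC);
 *	struct track *free_trk  = get_track(s, object, TRACK_FREE);
 *	// free_trk == alloc_trk + 1; each records caller, cpu, pid and jiffies
 */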
static void print_track(const char *s, struct track *t, unsigned long pr_time)
{
	if (!t->addr)
		return;

	pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
	{
		int i;
		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
			if (t->addrs[i])
				pr_err("\t%pS\n", (void *)t->addrs[i]);
			else
				break;
	}
#endif
}

void print_tracking(struct kmem_cache *s, void *object)
{
	unsigned long pr_time = jiffies;
	if (!(s->flags & SLAB_STORE_USER))
		return;

	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
}

static void print_page_info(struct page *page)
{
	pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
	       page, page->objects, page->inuse, page->freelist, page->flags);

}

static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("=============================================================================\n");
	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
	pr_err("-----------------------------------------------------------------------------\n\n");

	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
	va_end(args);
}

static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("FIX %s: %pV\n", s->name, &vaf);
	va_end(args);
}

static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
			       void **freelist, void *nextfree)
{
	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
	    !check_valid_pointer(s, page, nextfree) && freelist) {
		object_err(s, page, *freelist, "Freechain corrupt");
		*freelist = NULL;
		slab_fix(s, "Isolate corrupted freechain");
		return true;
	}

	return false;
}

static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned int off;	/* Offset of last byte */
	u8 *addr = page_address(page);

	print_tracking(s, p);

	print_page_info(page);

	pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
	       p, p - addr, get_freepointer(s, p));

	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
			      s->red_left_pad);
	else if (p > addr + 16)
		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);

	print_section(KERN_ERR, "Object ", p,
		      min_t(unsigned int, s->object_size, PAGE_SIZE));
	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone ", p + s->object_size,
			s->inuse - s->object_size);

	off = get_info_end(s);

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	off += kasan_metadata_size(s);

	if (off != size_from_object(s))
		/* Beginning of the filler is the free pointer */
		print_section(KERN_ERR, "Padding ", p + off,
			      size_from_object(s) - off);

	dump_stack();
}

void object_err(struct kmem_cache *s, struct page *page,
			u8 *object, char *reason)
{
	slab_bug(s, "%s", reason);
	print_trailer(s, page, object);
}

static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
			const char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	slab_bug(s, "%s", buf);
	print_page_info(page);
	dump_stack();
}

static void init_object(struct kmem_cache *s, void *object, u8 val)
{
	u8 *p = kasan_reset_tag(object);

	if (s->flags & SLAB_RED_ZONE)
		memset(p - s->red_left_pad, val, s->red_left_pad);

	if (s->flags & __OBJECT_POISON) {
		memset(p, POISON_FREE, s->object_size - 1);
		p[s->object_size - 1] = POISON_END;
	}

	if (s->flags & SLAB_RED_ZONE)
		memset(p + s->object_size, val, s->inuse - s->object_size);
}

static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
						void *from, void *to)
{
	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
	memset(from, data, to - from);
}

static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
			u8 *object, char *what,
			u8 *start, unsigned int value, unsigned int bytes)
{
	u8 *fault;
	u8 *end;
	u8 *addr = page_address(page);

	metadata_access_enable();
	fault = memchr_inv(kasan_reset_tag(start), value, bytes);
	metadata_access_disable();
	if (!fault)
		return 1;

	end = start + bytes;
	while (end > fault && end[-1] == value)
		end--;

	slab_bug(s, "%s overwritten", what);
	pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
					fault, end - 1, fault - addr,
					fault[0], value);
	print_trailer(s, page, object);

	restore_bytes(s, what, value, fault, end);
	return 0;
}

/*
 * Object layout:
 *
 * object address
 * 	Bytes of the object to be managed.
 * 	If the freepointer may overlay the object then the free
 *	pointer is at the middle of the object.
 *
 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 * 	0xa5 (POISON_END)
 *
 * object + s->object_size
 * 	Padding to reach word boundary. This is also used for Redzoning.
 * 	Padding is extended by another word if Redzoning is enabled and
 * 	object_size == inuse.
 *
 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 * 	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 * 	Meta data starts here.
 *
 * 	A. Free pointer (if we cannot overwrite object on free)
 * 	B. Tracking data for SLAB_STORE_USER
 * 	C. Padding to reach required alignment boundary or at minimum
 * 		one word if debugging is on to be able to detect writes
 * 		before the word boundary.
 *
 *	Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 * 	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the object_size and inuse boundaries are mostly
 * ignored. And therefore no slab options that rely on these boundaries
 * may be used with merged slabcaches.
 */

static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned long off = get_info_end(s);	/* The end of info */

	if (s->flags & SLAB_STORE_USER)
		/* We also have user information there */
		off += 2 * sizeof(struct track);

	off += kasan_metadata_size(s);

	if (size_from_object(s) == off)
		return 1;

	return check_bytes_and_report(s, page, p, "Object padding",
			p + off, POISON_INUSE, size_from_object(s) - off);
}

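/*
 * Worked example (illustrative, not part of upstream slub.c): the start of
 * the trailing padding checked by check_pad_bytes() can be computed by hand
 * for a cache with SLAB_STORE_USER:
 *
 *	unsigned long off = get_info_end(s);	// object + free pointer
 *	off += 2 * sizeof(struct track);	// TRACK_ALLOC + TRACK_FREE
 *	off += kasan_metadata_size(s);		// optional KASAN alloc meta
 *	// bytes [off, size_from_object(s)) must still contain POISON_INUSE
 */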
/* Check the pad bytes at the end of a slab page */
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	u8 *pad;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return 1;

	start = page_address(page);
	length = page_size(page);
	end = start + length;
	remainder = length % s->size;
	if (!remainder)
		return 1;

	pad = end - remainder;
	metadata_access_enable();
	fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
	metadata_access_disable();
	if (!fault)
		return 1;
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
			fault, end - 1, fault - start);
	print_section(KERN_ERR, "Padding ", pad, remainder);

	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
	return 0;
}

static int check_object(struct kmem_cache *s, struct page *page,
					void *object, u8 val)
{
	u8 *p = object;
	u8 *endobject = object + s->object_size;

	if (s->flags & SLAB_RED_ZONE) {
		if (!check_bytes_and_report(s, page, object, "Redzone",
			object - s->red_left_pad, val, s->red_left_pad))
			return 0;

		if (!check_bytes_and_report(s, page, object, "Redzone",
			endobject, val, s->inuse - s->object_size))
			return 0;
	} else {
		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
			check_bytes_and_report(s, page, p, "Alignment padding",
				endobject, POISON_INUSE,
				s->inuse - s->object_size);
		}
	}

	if (s->flags & SLAB_POISON) {
		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
			(!check_bytes_and_report(s, page, p, "Poison", p,
					POISON_FREE, s->object_size - 1) ||
			 !check_bytes_and_report(s, page, p, "Poison",
				p + s->object_size - 1, POISON_END, 1)))
			return 0;
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		check_pad_bytes(s, page, p);
	}

	if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
		/*
		 * Object and freepointer overlap. Cannot check
		 * freepointer while object is allocated.
		 */
		return 1;

	/* Check free pointer validity */
	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
		object_err(s, page, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		return 0;
	}
	return 1;
}

static int check_slab(struct kmem_cache *s, struct page *page)
{
	int maxobj;

	VM_BUG_ON(!irqs_disabled());

	if (!PageSlab(page)) {
		slab_err(s, page, "Not a valid slab page");
		return 0;
	}

	maxobj = order_objects(compound_order(page), s->size);
	if (page->objects > maxobj) {
		slab_err(s, page, "objects %u > max %u",
			page->objects, maxobj);
		return 0;
	}
	if (page->inuse > page->objects) {
		slab_err(s, page, "inuse %u > max %u",
			page->inuse, page->objects);
		return 0;
	}
	/* Slab_pad_check fixes things up after itself */
	slab_pad_check(s, page);
	return 1;
}

/*
 * Determine if a certain object on a page is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 */
static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
{
	int nr = 0;
	void *fp;
	void *object = NULL;
	int max_objects;

	fp = page->freelist;
	while (fp && nr <= page->objects) {
		if (fp == search)
			return 1;
		if (!check_valid_pointer(s, page, fp)) {
			if (object) {
				object_err(s, page, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
			} else {
				slab_err(s, page, "Freepointer corrupt");
				page->freelist = NULL;
				page->inuse = page->objects;
				slab_fix(s, "Freelist cleared");
				return 0;
			}
			break;
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	max_objects = order_objects(compound_order(page), s->size);
	if (max_objects > MAX_OBJS_PER_PAGE)
		max_objects = MAX_OBJS_PER_PAGE;

	if (page->objects != max_objects) {
		slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
			 page->objects, max_objects);
		page->objects = max_objects;
		slab_fix(s, "Number of objects adjusted.");
	}
	if (page->inuse != page->objects - nr) {
		slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
			 page->inuse, page->objects - nr);
		page->inuse = page->objects - nr;
		slab_fix(s, "Object count adjusted.");
	}
	return search == NULL;
}

static void trace(struct kmem_cache *s, struct page *page, void *object,
								int alloc)
{
	if (s->flags & SLAB_TRACE) {
		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
			s->name,
			alloc ? "alloc" : "free",
			object, page->inuse,
			page->freelist);

		if (!alloc)
			print_section(KERN_INFO, "Object ", (void *)object,
					s->object_size);

		dump_stack();
	}
}

/*
 * Tracking of fully allocated slabs for debugging purposes.
 */
static void add_full(struct kmem_cache *s,
	struct kmem_cache_node *n, struct page *page)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	lockdep_assert_held(&n->list_lock);
	list_add(&page->slab_list, &n->full);
}

static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	lockdep_assert_held(&n->list_lock);
	list_del(&page->slab_list);
}

/* Tracking of the number of slabs for debugging purposes */
static inline unsigned long slabs_node(struct kmem_cache *s, int node)
{
	struct kmem_cache_node *n = get_node(s, node);

	return atomic_long_read(&n->nr_slabs);
}

static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->nr_slabs);
}

static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	/*
	 * May be called early in order to allocate a slab for the
	 * kmem_cache_node structure. Solve the chicken-egg
	 * dilemma by deferring the increment of the count during
	 * bootstrap (see early_kmem_cache_node_alloc).
	 */
	if (likely(n)) {
		atomic_long_inc(&n->nr_slabs);
		atomic_long_add(objects, &n->total_objects);
	}
}
static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	atomic_long_dec(&n->nr_slabs);
	atomic_long_sub(objects, &n->total_objects);
}

/* Object debug checks for alloc/free paths */
static void setup_object_debug(struct kmem_cache *s, struct page *page,
								void *object)
{
	if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
		return;

	init_object(s, object, SLUB_RED_INACTIVE);
	init_tracking(s, object);
}

static
void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
{
	if (!kmem_cache_debug_flags(s, SLAB_POISON))
		return;

	metadata_access_enable();
	memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page));
	metadata_access_disable();
}

static inline int alloc_consistency_checks(struct kmem_cache *s,
					struct page *page, void *object)
{
	if (!check_slab(s, page))
		return 0;

	if (!check_valid_pointer(s, page, object)) {
		object_err(s, page, object, "Freelist Pointer check fails");
		return 0;
	}

	if (!check_object(s, page, object, SLUB_RED_INACTIVE))
		return 0;

	return 1;
}

static noinline int alloc_debug_processing(struct kmem_cache *s,
					struct page *page,
					void *object, unsigned long addr)
{
	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!alloc_consistency_checks(s, page, object))
			goto bad;
	}

	/* Success perform special debug activities for allocs */
	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_ALLOC, addr);
	trace(s, page, object, 1);
	init_object(s, object, SLUB_RED_ACTIVE);
	return 1;

bad:
	if (PageSlab(page)) {
		/*
		 * If this is a slab page then lets do the best we can
		 * to avoid issues in the future. Marking all objects
		 * as used avoids touching the remaining objects.
		 */
		slab_fix(s, "Marking all objects used");
		page->inuse = page->objects;
		page->freelist = NULL;
	}
	return 0;
}

static inline int free_consistency_checks(struct kmem_cache *s,
		struct page *page, void *object, unsigned long addr)
{
	if (!check_valid_pointer(s, page, object)) {
		slab_err(s, page, "Invalid object pointer 0x%p", object);
		return 0;
	}

	if (on_freelist(s, page, object)) {
		object_err(s, page, object, "Object already free");
		return 0;
	}

	if (!check_object(s, page, object, SLUB_RED_ACTIVE))
		return 0;

	if (unlikely(s != page->slab_cache)) {
		if (!PageSlab(page)) {
			slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
				 object);
		} else if (!page->slab_cache) {
			pr_err("SLUB <none>: no slab for object 0x%p.\n",
			       object);
			dump_stack();
		} else
			object_err(s, page, object,
					"page slab pointer corrupt.");
		return 0;
	}
	return 1;
}

/* Supports checking bulk free of a constructed freelist */
static noinline int free_debug_processing(
	struct kmem_cache *s, struct page *page,
	void *head, void *tail, int bulk_cnt,
	unsigned long addr)
{
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
	void *object = head;
	int cnt = 0;
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&n->list_lock, flags);
	slab_lock(page);

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!check_slab(s, page))
			goto out;
	}

next_object:
	cnt++;

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!free_consistency_checks(s, page, object, addr))
			goto out;
	}

	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_FREE, addr);
	trace(s, page, object, 0);
	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
	init_object(s, object, SLUB_RED_INACTIVE);

	/* Reached end of constructed freelist yet? */
	if (object != tail) {
		object = get_freepointer(s, object);
		goto next_object;
	}
	ret = 1;

out:
	if (cnt != bulk_cnt)
		slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
			 bulk_cnt, cnt);

	slab_unlock(page);
	spin_unlock_irqrestore(&n->list_lock, flags);
	if (!ret)
		slab_fix(s, "Object at 0x%p not freed", object);
	return ret;
}

/*
 * Parse a block of slub_debug options. Blocks are delimited by ';'
 *
 * @str:    start of block
 * @flags:  returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
 * @slabs:  return start of list of slabs, or NULL when there's no list
 * @init:   assume this is initial parsing and not per-kmem-create parsing
 *
 * returns the start of next block if there's any, or NULL
 */
static char *
parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
{
	bool higher_order_disable = false;

	/* Skip any completely empty blocks */
	while (*str && *str == ';')
		str++;

	if (*str == ',') {
		/*
		 * No options but restriction on slabs. This means full
		 * debugging for slabs matching a pattern.
		 */
		*flags = DEBUG_DEFAULT_FLAGS;
		goto check_slabs;
	}
	*flags = 0;

	/* Determine which debug features should be switched on */
	for (; *str && *str != ',' && *str != ';'; str++) {
		switch (tolower(*str)) {
		case '-':
			*flags = 0;
			break;
		case 'f':
			*flags |= SLAB_CONSISTENCY_CHECKS;
			break;
		case 'z':
			*flags |= SLAB_RED_ZONE;
			break;
		case 'p':
			*flags |= SLAB_POISON;
			break;
		case 'u':
			*flags |= SLAB_STORE_USER;
			break;
		case 't':
			*flags |= SLAB_TRACE;
			break;
		case 'a':
			*flags |= SLAB_FAILSLAB;
			break;
		case 'o':
			/*
			 * Avoid enabling debugging on caches if its minimum
			 * order would increase as a result.
			 */
			higher_order_disable = true;
			break;
		default:
			if (init)
				pr_err("slub_debug option '%c' unknown. skipped\n", *str);
		}
	}
check_slabs:
	if (*str == ',')
		*slabs = ++str;
	else
		*slabs = NULL;

	/* Skip over the slab list */
	while (*str && *str != ';')
		str++;

	/* Skip any completely empty blocks */
	while (*str && *str == ';')
		str++;

	if (init && higher_order_disable)
		disable_higher_order_debug = 1;

	if (*str)
		return str;
	else
		return NULL;
}

static int __init setup_slub_debug(char *str)
{
	slab_flags_t flags;
	char *saved_str;
	char *slab_list;
	bool global_slub_debug_changed = false;
	bool slab_list_specified = false;

	slub_debug = DEBUG_DEFAULT_FLAGS;
	if (*str++ != '=' || !*str)
		/*
		 * No options specified. Switch on full debugging.
		 */
		goto out;

	saved_str = str;
	while (str) {
		str = parse_slub_debug_flags(str, &flags, &slab_list, true);

		if (!slab_list) {
			slub_debug = flags;
			global_slub_debug_changed = true;
		} else {
			slab_list_specified = true;
		}
	}

	/*
	 * For backwards compatibility, a single list of flags with list of
	 * slabs means debugging is only enabled for those slabs, so the global
	 * slub_debug should be 0. We can extend that to multiple lists as
	 * long as there is no option specifying flags without a slab list.
	 */
	if (slab_list_specified) {
		if (!global_slub_debug_changed)
			slub_debug = 0;
		slub_debug_string = saved_str;
	}
out:
	if (slub_debug != 0 || slub_debug_string)
		static_branch_enable(&slub_debug_enabled);
	if ((static_branch_unlikely(&init_on_alloc) ||
	     static_branch_unlikely(&init_on_free)) &&
	    (slub_debug & SLAB_POISON))
		pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
	return 1;
}

__setup("slub_debug", setup_slub_debug);

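/*
 * Usage examples (illustrative, not part of upstream slub.c) for the boot
 * parameter parsed above:
 *
 *	slub_debug                - enable DEBUG_DEFAULT_FLAGS for all caches
 *	slub_debug=P              - poisoning only, for all caches
 *	slub_debug=F,dentry       - consistency checks just for "dentry"
 *	slub_debug=FZ;-,kmalloc-* - checks + redzoning globally, but switch
 *	                            debugging off for caches matching kmalloc-*
 */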
/*
 * kmem_cache_flags - apply debugging options to the cache
 * @object_size:	the size of an object without meta data
 * @flags:		flags to set
 * @name:		name of the cache
 * @ctor:		constructor function
 *
 * Debug option(s) are applied to @flags. In addition to the debug
 * option(s), if a slab name (or multiple) is specified i.e.
 * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
 * then only the select slabs will receive the debug option(s).
 */
slab_flags_t kmem_cache_flags(unsigned int object_size,
	slab_flags_t flags, const char *name,
	void (*ctor)(void *))
{
	char *iter;
	size_t len;
	char *next_block;
	slab_flags_t block_flags;

	len = strlen(name);
	next_block = slub_debug_string;
	/* Go through all blocks of debug options, see if any matches our slab's name */
	while (next_block) {
		next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
		if (!iter)
			continue;
		/* Found a block that has a slab list, search it */
		while (*iter) {
			char *end, *glob;
			size_t cmplen;

			end = strchrnul(iter, ',');
			if (next_block && next_block < end)
				end = next_block - 1;

			glob = strnchr(iter, end - iter, '*');
			if (glob)
				cmplen = glob - iter;
			else
				cmplen = max_t(size_t, len, (end - iter));

			if (!strncmp(name, iter, cmplen)) {
				flags |= block_flags;
				return flags;
			}

			if (!*end || *end == ';')
				break;
			iter = end + 1;
		}
	}

	return flags | slub_debug;
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
			struct page *page, void *object) {}
static inline
void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}

static inline int alloc_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, unsigned long addr) { return 0; }

static inline int free_debug_processing(
	struct kmem_cache *s, struct page *page,
	void *head, void *tail, int bulk_cnt,
	unsigned long addr) { return 0; }

static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
			{ return 1; }
static inline int check_object(struct kmem_cache *s, struct page *page,
			void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct page *page) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct page *page) {}
slab_flags_t kmem_cache_flags(unsigned int object_size,
	slab_flags_t flags, const char *name,
	void (*ctor)(void *))
{
	return flags;
}
#define slub_debug 0

#define disable_higher_order_debug 0

static inline unsigned long slabs_node(struct kmem_cache *s, int node)
							{ return 0; }
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
							{ return 0; }
static inline void inc_slabs_node(struct kmem_cache *s, int node,
							int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
							int objects) {}

static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
			       void **freelist, void *nextfree)
{
	return false;
}
#endif /* CONFIG_SLUB_DEBUG */

/*
 * Hooks for other subsystems that check memory allocations. In a typical
 * production configuration these hooks all should produce no code at all.
 */
static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1507
{
1508
	ptr = kasan_kmalloc_large(ptr, size, flags);
1509
	/* As ptr might get tagged, call kmemleak hook after KASAN. */
1510
	kmemleak_alloc(ptr, size, 1, flags);
1511
	return ptr;
1512 1513
}

1514
static __always_inline void kfree_hook(void *x)
1515 1516
{
	kmemleak_free(x);
1517
	kasan_kfree_large(x, _RET_IP_);
1518 1519
}

1520
static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
1521 1522
{
	kmemleak_free_recursive(x, s->flags);
1523

1524 1525 1526 1527 1528
	/*
	 * Trouble is that we may no longer disable interrupts in the fast path
	 * So in order to make the debug calls that expect irqs to be
	 * disabled we need to disable interrupts temporarily.
	 */
1529
#ifdef CONFIG_LOCKDEP
1530 1531 1532 1533 1534 1535 1536 1537 1538 1539
	{
		unsigned long flags;

		local_irq_save(flags);
		debug_check_no_locks_freed(x, s->object_size);
		local_irq_restore(flags);
	}
#endif
	if (!(s->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(x, s->object_size);
1540

1541 1542 1543 1544 1545
	/* Use KCSAN to help debug racy use-after-free. */
	if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
		__kcsan_check_access(x, s->object_size,
				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);

1546 1547
	/* KASAN might put x into memory quarantine, delaying its reuse */
	return kasan_slab_free(s, x, _RET_IP_);
1548
}
1549

1550 1551
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
					   void **head, void **tail)
1552
{
1553 1554 1555 1556 1557 1558

	void *object;
	void *next = *head;
	void *old_tail = *tail ? *tail : *head;
	int rsize;

1559 1560 1561
	/* Head and tail of the reconstructed freelist */
	*head = NULL;
	*tail = NULL;
1562

1563 1564 1565 1566 1567
	do {
		object = next;
		next = get_freepointer(s, object);

		if (slab_want_init_on_free(s)) {
1568 1569 1570 1571
			/*
			 * Clear the object and the metadata, but don't touch
			 * the redzone.
			 */
1572
			memset(kasan_reset_tag(object), 0, s->object_size);
1573 1574
			rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
							   : 0;
1575
			memset((char *)kasan_reset_tag(object) + s->inuse, 0,
1576
			       s->size - s->inuse - rsize);
1577

1578
		}
1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592
		/* If object's reuse doesn't have to be delayed */
		if (!slab_free_hook(s, object)) {
			/* Move object to the new freelist */
			set_freepointer(s, object, *head);
			*head = object;
			if (!*tail)
				*tail = object;
		}
	} while (object != old_tail);

	if (*head == *tail)
		*tail = NULL;

	return *head != NULL;
1593 1594
}

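/*
 * Illustrative sketch (not part of upstream slub.c): a bulk free passes a
 * detached freelist and lets the hook drop objects whose reuse must be
 * delayed (e.g. KASAN quarantine) before the chain is handed back:
 *
 *	void *head = first_obj, *tail = last_obj;
 *
 *	if (slab_free_freelist_hook(s, &head, &tail))
 *		; // free the surviving chain head..tail to the slab
 *	else
 *		; // every object was absorbed by the hooks, nothing to free
 */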
static void *setup_object(struct kmem_cache *s, struct page *page,
				void *object)
{
	setup_object_debug(s, page, object);
	object = kasan_init_slab_obj(s, object);
	if (unlikely(s->ctor)) {
		kasan_unpoison_object_data(s, object);
		s->ctor(object);
		kasan_poison_object_data(s, object);
	}
	return object;
}

/*
 * Slab allocation and freeing
 */
static inline struct page *alloc_slab_page(struct kmem_cache *s,
		gfp_t flags, int node, struct kmem_cache_order_objects oo)
{
	struct page *page;
	unsigned int order = oo_order(oo);

	if (node == NUMA_NO_NODE)
		page = alloc_pages(flags, order);
	else
		page = __alloc_pages_node(node, flags, order);

	return page;
}

#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Pre-initialize the random sequence cache */
static int init_cache_random_seq(struct kmem_cache *s)
{
	unsigned int count = oo_objects(s->oo);
	int err;

	/* Bailout if already initialised */
	if (s->random_seq)
		return 0;

	err = cache_random_seq_create(s, count, GFP_KERNEL);
	if (err) {
		pr_err("SLUB: Unable to initialize free list for %s\n",
			s->name);
		return err;
	}

	/* Transform to an offset on the set of pages */
	if (s->random_seq) {
		unsigned int i;

		for (i = 0; i < count; i++)
			s->random_seq[i] *= s->size;
	}
	return 0;
}

/* Initialize each random sequence freelist per cache */
static void __init init_freelist_randomization(void)
{
	struct kmem_cache *s;

	mutex_lock(&slab_mutex);

	list_for_each_entry(s, &slab_caches, list)
		init_cache_random_seq(s);

	mutex_unlock(&slab_mutex);
}

/* Get the next entry on the pre-computed freelist randomized */
static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
				unsigned long *pos, void *start,
				unsigned long page_limit,
				unsigned long freelist_count)
{
	unsigned int idx;

	/*
	 * If the target page allocation failed, the number of objects on the
	 * page might be smaller than the usual size defined by the cache.
	 */
	do {
		idx = s->random_seq[*pos];
		*pos += 1;
		if (*pos >= freelist_count)
			*pos = 0;
	} while (unlikely(idx >= page_limit));

	return (char *)start + idx;
}

/* Shuffle the single linked freelist based on a random pre-computed sequence */
static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
{
	void *start;
	void *cur;
	void *next;
	unsigned long idx, pos, page_limit, freelist_count;

	if (page->objects < 2 || !s->random_seq)
		return false;

	freelist_count = oo_objects(s->oo);
	pos = get_random_int() % freelist_count;

	page_limit = page->objects * s->size;
	start = fixup_red_left(s, page_address(page));

	/* First entry is used as the base of the freelist */
	cur = next_freelist_entry(s, page, &pos, start, page_limit,
				freelist_count);
	cur = setup_object(s, page, cur);
	page->freelist = cur;

	for (idx = 1; idx < page->objects; idx++) {
		next = next_freelist_entry(s, page, &pos, start, page_limit,
			freelist_count);
		next = setup_object(s, page, next);
		set_freepointer(s, cur, next);
		cur = next;
	}
	set_freepointer(s, cur, NULL);

	return true;
}
#else
static inline int init_cache_random_seq(struct kmem_cache *s)
{
	return 0;
}
static inline void init_freelist_randomization(void) { }
static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page)
{
	return false;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */

C
Christoph Lameter 已提交
1734 1735
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
P
Pekka Enberg 已提交
1736
	struct page *page;
1737
	struct kmem_cache_order_objects oo = s->oo;
1738
	gfp_t alloc_gfp;
1739
	void *start, *p, *next;
1740
	int idx;
T
Thomas Garnier 已提交
1741
	bool shuffle;
C
Christoph Lameter 已提交
1742

1743 1744
	flags &= gfp_allowed_mask;

1745
	if (gfpflags_allow_blocking(flags))
1746 1747
		local_irq_enable();

1748
	flags |= s->allocflags;
1749

1750 1751 1752 1753 1754
	/*
	 * Let the initial higher-order allocation fail under memory pressure
	 * so we fall-back to the minimum order allocation.
	 */
	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1755
	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
1756
		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
1757

1758
	page = alloc_slab_page(s, alloc_gfp, node, oo);
1759 1760
	if (unlikely(!page)) {
		oo = s->min;
1761
		alloc_gfp = flags;
1762 1763 1764 1765
		/*
		 * Allocation may have failed due to fragmentation.
		 * Try a lower order alloc if possible
		 */
1766
		page = alloc_slab_page(s, alloc_gfp, node, oo);
1767 1768 1769
		if (unlikely(!page))
			goto out;
		stat(s, ORDER_FALLBACK);
1770
	}
V
Vegard Nossum 已提交
1771

1772
	page->objects = oo_objects(oo);
C
Christoph Lameter 已提交
1773

1774 1775
	account_slab_page(page, oo_order(oo), s);

1776
	page->slab_cache = s;
1777
	__SetPageSlab(page);
1778
	if (page_is_pfmemalloc(page))
1779
		SetPageSlabPfmemalloc(page);
C
Christoph Lameter 已提交
1780

1781
	kasan_poison_slab(page);
C
Christoph Lameter 已提交
1782

1783
	start = page_address(page);
C
Christoph Lameter 已提交
1784

1785
	setup_page_debug(s, page, start);
1786

T
Thomas Garnier 已提交
1787 1788 1789
	shuffle = shuffle_freelist(s, page);

	if (!shuffle) {
1790 1791 1792
		start = fixup_red_left(s, start);
		start = setup_object(s, page, start);
		page->freelist = start;
1793 1794 1795 1796 1797 1798 1799
		for (idx = 0, p = start; idx < page->objects - 1; idx++) {
			next = p + s->size;
			next = setup_object(s, page, next);
			set_freepointer(s, p, next);
			p = next;
		}
		set_freepointer(s, p, NULL);
C
Christoph Lameter 已提交
1800 1801
	}

1802
	page->inuse = page->objects;
1803
	page->frozen = 1;
1804

C
Christoph Lameter 已提交
1805
out:
1806
	if (gfpflags_allow_blocking(flags))
1807 1808 1809 1810 1811 1812
		local_irq_disable();
	if (!page)
		return NULL;

	inc_slabs_node(s, page_to_nid(page), page->objects);

C
Christoph Lameter 已提交
1813 1814 1815
	return page;
}

1816 1817
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
1818 1819
	if (unlikely(flags & GFP_SLAB_BUG_MASK))
		flags = kmalloc_fix_flags(flags);
1820 1821 1822 1823 1824

	return allocate_slab(s,
		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}

C
Christoph Lameter 已提交
1825 1826
static void __free_slab(struct kmem_cache *s, struct page *page)
{
1827 1828
	int order = compound_order(page);
	int pages = 1 << order;
C
Christoph Lameter 已提交
1829

1830
	if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
C
Christoph Lameter 已提交
1831 1832 1833
		void *p;

		slab_pad_check(s, page);
1834 1835
		for_each_object(p, s, page_address(page),
						page->objects)
1836
			check_object(s, page, p, SLUB_RED_INACTIVE);
C
Christoph Lameter 已提交
1837 1838
	}

1839
	__ClearPageSlabPfmemalloc(page);
1840
	__ClearPageSlab(page);
1841 1842
	/* In union with page->mapping where page allocator expects NULL */
	page->slab_cache = NULL;
N
Nick Piggin 已提交
1843 1844
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += pages;
1845
	unaccount_slab_page(page, order, s);
1846
	__free_pages(page, order);
C
Christoph Lameter 已提交
1847 1848 1849 1850
}

static void rcu_free_slab(struct rcu_head *h)
{
1851
	struct page *page = container_of(h, struct page, rcu_head);
1852

1853
	__free_slab(page->slab_cache, page);
C
Christoph Lameter 已提交
1854 1855 1856 1857
}

static void free_slab(struct kmem_cache *s, struct page *page)
{
1858
	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
1859
		call_rcu(&page->rcu_head, rcu_free_slab);
C
Christoph Lameter 已提交
1860 1861 1862 1863 1864 1865
	} else
		__free_slab(s, page);
}

static void discard_slab(struct kmem_cache *s, struct page *page)
{
1866
	dec_slabs_node(s, page_to_nid(page), page->objects);
C
Christoph Lameter 已提交
1867 1868 1869 1870
	free_slab(s, page);
}

/*
1871
 * Management of partially allocated slabs.
C
Christoph Lameter 已提交
1872
 */
1873 1874
static inline void
__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
C
Christoph Lameter 已提交
1875
{
C
Christoph Lameter 已提交
1876
	n->nr_partial++;
1877
	if (tail == DEACTIVATE_TO_TAIL)
1878
		list_add_tail(&page->slab_list, &n->partial);
1879
	else
1880
		list_add(&page->slab_list, &n->partial);
C
Christoph Lameter 已提交
1881 1882
}

1883 1884
static inline void add_partial(struct kmem_cache_node *n,
				struct page *page, int tail)
1885
{
P
Peter Zijlstra 已提交
1886
	lockdep_assert_held(&n->list_lock);
1887 1888
	__add_partial(n, page, tail);
}
P
Peter Zijlstra 已提交
1889

1890 1891 1892 1893
static inline void remove_partial(struct kmem_cache_node *n,
					struct page *page)
{
	lockdep_assert_held(&n->list_lock);
1894
	list_del(&page->slab_list);
1895
	n->nr_partial--;
1896 1897
}

C
Christoph Lameter 已提交
1898
/*
1899 1900
 * Remove slab from the partial list, freeze it and
 * return the pointer to the freelist.
C
Christoph Lameter 已提交
1901
 *
1902
 * Returns a list of objects or NULL if it fails.
C
Christoph Lameter 已提交
1903
 */
1904
static inline void *acquire_slab(struct kmem_cache *s,
1905
		struct kmem_cache_node *n, struct page *page,
1906
		int mode, int *objects)
C
Christoph Lameter 已提交
1907
{
1908 1909 1910 1911
	void *freelist;
	unsigned long counters;
	struct page new;

P
Peter Zijlstra 已提交
1912 1913
	lockdep_assert_held(&n->list_lock);

1914 1915 1916 1917 1918
	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
1919 1920 1921
	freelist = page->freelist;
	counters = page->counters;
	new.counters = counters;
1922
	*objects = new.objects - new.inuse;
1923
	if (mode) {
1924
		new.inuse = page->objects;
1925 1926 1927 1928
		new.freelist = NULL;
	} else {
		new.freelist = freelist;
	}
1929

1930
	VM_BUG_ON(new.frozen);
1931
	new.frozen = 1;
1932

1933
	if (!__cmpxchg_double_slab(s, page,
1934
			freelist, counters,
1935
			new.freelist, new.counters,
1936 1937
			"acquire_slab"))
		return NULL;
1938 1939

	remove_partial(n, page);
1940
	WARN_ON(!freelist);
1941
	return freelist;
C
Christoph Lameter 已提交
1942 1943
}

1944
static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
1945
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
1946

C
Christoph Lameter 已提交
1947
/*
C
Christoph Lameter 已提交
1948
 * Try to allocate a partial slab from a specific node.
C
Christoph Lameter 已提交
1949
 */
1950 1951
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
				struct kmem_cache_cpu *c, gfp_t flags)
C
Christoph Lameter 已提交
1952
{
1953 1954
	struct page *page, *page2;
	void *object = NULL;
1955
	unsigned int available = 0;
1956
	int objects;
C
Christoph Lameter 已提交
1957 1958 1959 1960

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
C
Chen Tao 已提交
1961
	 * partial slab and there is none available then get_partial()
C
Christoph Lameter 已提交
1962
	 * will return NULL.
C
Christoph Lameter 已提交
1963 1964 1965 1966 1967
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock(&n->list_lock);
1968
	list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
1969
		void *t;
1970

1971 1972 1973
		if (!pfmemalloc_match(page, flags))
			continue;

1974
		t = acquire_slab(s, n, page, object == NULL, &objects);
1975
		if (!t)
1976
			continue; /* cmpxchg raced */
1977

1978
		available += objects;
1979
		if (!object) {
1980 1981 1982 1983
			c->page = page;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
1984
			put_cpu_partial(s, page, 0);
1985
			stat(s, CPU_PARTIAL_NODE);
1986
		}
1987
		if (!kmem_cache_has_cpu_partial(s)
1988
			|| available > slub_cpu_partial(s) / 2)
1989 1990
			break;

1991
	}
C
Christoph Lameter 已提交
1992
	spin_unlock(&n->list_lock);
1993
	return object;
C
Christoph Lameter 已提交
1994 1995 1996
}

/*
C
Christoph Lameter 已提交
1997
 * Get a page from somewhere. Search in increasing NUMA distances.
C
Christoph Lameter 已提交
1998
 */
1999
static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
2000
		struct kmem_cache_cpu *c)
C
Christoph Lameter 已提交
2001 2002 2003
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
2004
	struct zoneref *z;
2005
	struct zone *zone;
2006
	enum zone_type highest_zoneidx = gfp_zone(flags);
2007
	void *object;
2008
	unsigned int cpuset_mems_cookie;
C
Christoph Lameter 已提交
2009 2010

	/*
C
Christoph Lameter 已提交
2011 2012 2013 2014
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
C
Christoph Lameter 已提交
2015
	 *
C
Christoph Lameter 已提交
2016 2017 2018 2019
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
C
Christoph Lameter 已提交
2020
	 *
2021 2022 2023 2024 2025
	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
	 * (which makes defrag_ratio = 1000) then every (well almost)
	 * allocation will first attempt to defrag slab caches on other nodes.
	 * This means scanning over all nodes to look for partial slabs which
	 * may be expensive if we do it every time we are trying to find a slab
C
Christoph Lameter 已提交
2026
	 * with available objects.
C
Christoph Lameter 已提交
2027
	 */
2028 2029
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
C
Christoph Lameter 已提交
2030 2031
		return NULL;

2032
	do {
2033
		cpuset_mems_cookie = read_mems_allowed_begin();
2034
		zonelist = node_zonelist(mempolicy_slab_node(), flags);
2035
		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
2036 2037 2038 2039
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

2040
			if (n && cpuset_zone_allowed(zone, flags) &&
2041
					n->nr_partial > s->min_partial) {
2042
				object = get_partial_node(s, n, c, flags);
2043 2044
				if (object) {
					/*
2045 2046 2047 2048 2049
					 * Don't check read_mems_allowed_retry()
					 * here - if mems_allowed was updated in
					 * parallel, that was a harmless race
					 * between allocation and the cpuset
					 * update
2050 2051 2052
					 */
					return object;
				}
2053
			}
C
Christoph Lameter 已提交
2054
		}
2055
	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2056
#endif	/* CONFIG_NUMA */
C
Christoph Lameter 已提交
2057 2058 2059 2060 2061 2062
	return NULL;
}

/*
 * Get a partial page, lock it and return it.
 */
2063
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
2064
		struct kmem_cache_cpu *c)
C
Christoph Lameter 已提交
2065
{
2066
	void *object;
2067 2068 2069 2070
	int searchnode = node;

	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();
C
Christoph Lameter 已提交
2071

2072
	object = get_partial_node(s, get_node(s, searchnode), c, flags);
2073 2074
	if (object || node != NUMA_NO_NODE)
		return object;
C
Christoph Lameter 已提交
2075

2076
	return get_any_partial(s, flags, c);
C
Christoph Lameter 已提交
2077 2078
}

2079
#ifdef CONFIG_PREEMPTION
2080
/*
2081
 * Calculate the next globally unique transaction for disambiguation
2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098
 * during cmpxchg. The transactions start with the cpu number and are then
 * incremented by CONFIG_NR_CPUS.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * No preemption supported therefore also no need to check for
 * different cpus.
 */
#define TID_STEP 1
#endif

static inline unsigned long next_tid(unsigned long tid)
{
	return tid + TID_STEP;
}

2099
#ifdef SLUB_DEBUG_CMPXCHG
2100 2101 2102 2103 2104 2105 2106 2107 2108
static inline unsigned int tid_to_cpu(unsigned long tid)
{
	return tid % TID_STEP;
}

static inline unsigned long tid_to_event(unsigned long tid)
{
	return tid / TID_STEP;
}
2109
#endif
2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121

static inline unsigned int init_tid(int cpu)
{
	return cpu;
}

static inline void note_cmpxchg_failure(const char *n,
		const struct kmem_cache *s, unsigned long tid)
{
#ifdef SLUB_DEBUG_CMPXCHG
	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);

2122
	pr_info("%s %s: cmpxchg redo ", n, s->name);
2123

2124
#ifdef CONFIG_PREEMPTION
2125
	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
2126
		pr_warn("due to cpu change %d -> %d\n",
2127 2128 2129 2130
			tid_to_cpu(tid), tid_to_cpu(actual_tid));
	else
#endif
	if (tid_to_event(tid) != tid_to_event(actual_tid))
2131
		pr_warn("due to cpu running other code. Event %ld->%ld\n",
2132 2133
			tid_to_event(tid), tid_to_event(actual_tid));
	else
2134
		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
2135 2136
			actual_tid, tid, next_tid(tid));
#endif
2137
	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
2138 2139
}

2140
static void init_kmem_cache_cpus(struct kmem_cache *s)
2141 2142 2143 2144 2145 2146
{
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
}
2147

C
Christoph Lameter 已提交
2148 2149 2150
/*
 * Remove the cpu slab
 */
2151
static void deactivate_slab(struct kmem_cache *s, struct page *page,
2152
				void *freelist, struct kmem_cache_cpu *c)
C
Christoph Lameter 已提交
2153
{
2154 2155 2156 2157 2158
	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
	int lock = 0;
	enum slab_modes l = M_NONE, m = M_NONE;
	void *nextfree;
2159
	int tail = DEACTIVATE_TO_HEAD;
2160 2161 2162 2163
	struct page new;
	struct page old;

	if (page->freelist) {
2164
		stat(s, DEACTIVATE_REMOTE_FREES);
2165
		tail = DEACTIVATE_TO_TAIL;
2166 2167
	}

2168
	/*
2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179
	 * Stage one: Free all available per cpu objects back
	 * to the page freelist while it is still frozen. Leave the
	 * last one.
	 *
	 * There is no need to take the list->lock because the page
	 * is still frozen.
	 */
	while (freelist && (nextfree = get_freepointer(s, freelist))) {
		void *prior;
		unsigned long counters;

2180 2181 2182 2183 2184
		/*
		 * If 'nextfree' is invalid, it is possible that the object at
		 * 'freelist' is already corrupted.  So isolate all objects
		 * starting at 'freelist'.
		 */
2185
		if (freelist_corrupted(s, page, &freelist, nextfree))
2186 2187
			break;

2188 2189 2190 2191 2192 2193
		do {
			prior = page->freelist;
			counters = page->counters;
			set_freepointer(s, freelist, prior);
			new.counters = counters;
			new.inuse--;
2194
			VM_BUG_ON(!new.frozen);
2195

2196
		} while (!__cmpxchg_double_slab(s, page,
2197 2198 2199 2200 2201 2202 2203
			prior, counters,
			freelist, new.counters,
			"drain percpu freelist"));

		freelist = nextfree;
	}

2204
	/*
2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216
	 * Stage two: Ensure that the page is unfrozen while the
	 * list presence reflects the actual number of objects
	 * during unfreeze.
	 *
	 * We setup the list membership and then perform a cmpxchg
	 * with the count. If there is a mismatch then the page
	 * is not unfrozen but the page is on the wrong list.
	 *
	 * Then we restart the process which may have to remove
	 * the page from the list that we just put it on again
	 * because the number of objects in the slab may have
	 * changed.
2217
	 */
2218
redo:
2219

2220 2221
	old.freelist = page->freelist;
	old.counters = page->counters;
2222
	VM_BUG_ON(!old.frozen);
2223

2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234
	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist) {
		new.inuse--;
		set_freepointer(s, freelist, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

2235
	if (!new.inuse && n->nr_partial >= s->min_partial)
2236 2237 2238 2239 2240 2241
		m = M_FREE;
	else if (new.freelist) {
		m = M_PARTIAL;
		if (!lock) {
			lock = 1;
			/*
W
Wei Yang 已提交
2242
			 * Taking the spinlock removes the possibility
2243 2244 2245 2246 2247 2248 2249
			 * that acquire_slab() will see a slab page that
			 * is frozen
			 */
			spin_lock(&n->list_lock);
		}
	} else {
		m = M_FULL;
2250
		if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) {
2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264
			lock = 1;
			/*
			 * This also ensures that the scanning of full
			 * slabs from diagnostic functions will not see
			 * any frozen slabs.
			 */
			spin_lock(&n->list_lock);
		}
	}

	if (l != m) {
		if (l == M_PARTIAL)
			remove_partial(n, page);
		else if (l == M_FULL)
P
Peter Zijlstra 已提交
2265
			remove_full(s, n, page);
2266

2267
		if (m == M_PARTIAL)
2268
			add_partial(n, page, tail);
2269
		else if (m == M_FULL)
2270 2271 2272 2273
			add_full(s, n, page);
	}

	l = m;
2274
	if (!__cmpxchg_double_slab(s, page,
2275 2276 2277 2278 2279 2280 2281 2282
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"))
		goto redo;

	if (lock)
		spin_unlock(&n->list_lock);

2283 2284 2285 2286 2287
	if (m == M_PARTIAL)
		stat(s, tail);
	else if (m == M_FULL)
		stat(s, DEACTIVATE_FULL);
	else if (m == M_FREE) {
2288 2289 2290
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, page);
		stat(s, FREE_SLAB);
2291
	}
2292 2293 2294

	c->page = NULL;
	c->freelist = NULL;
C
Christoph Lameter 已提交
2295 2296
}

2297 2298 2299
/*
 * Unfreeze all the cpu partial slabs.
 *
2300 2301 2302
 * This function must be called with interrupts disabled
 * for the cpu using c (or some other guarantee must be there
 * to guarantee no concurrent accesses).
2303
 */
2304 2305
static void unfreeze_partials(struct kmem_cache *s,
		struct kmem_cache_cpu *c)
2306
{
2307
#ifdef CONFIG_SLUB_CPU_PARTIAL
2308
	struct kmem_cache_node *n = NULL, *n2 = NULL;
2309
	struct page *page, *discard_page = NULL;
2310

2311
	while ((page = slub_percpu_partial(c))) {
2312 2313 2314
		struct page new;
		struct page old;

2315
		slub_set_percpu_partial(c, page);
2316 2317 2318 2319 2320 2321 2322 2323 2324

		n2 = get_node(s, page_to_nid(page));
		if (n != n2) {
			if (n)
				spin_unlock(&n->list_lock);

			n = n2;
			spin_lock(&n->list_lock);
		}
2325 2326 2327 2328 2329

		do {

			old.freelist = page->freelist;
			old.counters = page->counters;
2330
			VM_BUG_ON(!old.frozen);
2331 2332 2333 2334 2335 2336

			new.counters = old.counters;
			new.freelist = old.freelist;

			new.frozen = 0;

2337
		} while (!__cmpxchg_double_slab(s, page,
2338 2339 2340 2341
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"));

2342
		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
2343 2344
			page->next = discard_page;
			discard_page = page;
2345 2346 2347
		} else {
			add_partial(n, page, DEACTIVATE_TO_TAIL);
			stat(s, FREE_ADD_PARTIAL);
2348 2349 2350 2351 2352
		}
	}

	if (n)
		spin_unlock(&n->list_lock);
2353 2354 2355 2356 2357 2358 2359 2360 2361

	while (discard_page) {
		page = discard_page;
		discard_page = discard_page->next;

		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, page);
		stat(s, FREE_SLAB);
	}
2362
#endif	/* CONFIG_SLUB_CPU_PARTIAL */
2363 2364 2365
}

/*
2366 2367
 * Put a page that was just frozen (in __slab_free|get_partial_node) into a
 * partial page slot if available.
2368 2369 2370 2371
 *
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
 */
2372
static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2373
{
2374
#ifdef CONFIG_SLUB_CPU_PARTIAL
2375 2376 2377 2378
	struct page *oldpage;
	int pages;
	int pobjects;

2379
	preempt_disable();
2380 2381 2382 2383 2384 2385 2386 2387
	do {
		pages = 0;
		pobjects = 0;
		oldpage = this_cpu_read(s->cpu_slab->partial);

		if (oldpage) {
			pobjects = oldpage->pobjects;
			pages = oldpage->pages;
2388
			if (drain && pobjects > slub_cpu_partial(s)) {
2389 2390 2391 2392 2393 2394
				unsigned long flags;
				/*
				 * partial array is full. Move the existing
				 * set to the per node partial list.
				 */
				local_irq_save(flags);
2395
				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2396
				local_irq_restore(flags);
2397
				oldpage = NULL;
2398 2399
				pobjects = 0;
				pages = 0;
2400
				stat(s, CPU_PARTIAL_DRAIN);
2401 2402 2403 2404 2405 2406 2407 2408 2409 2410
			}
		}

		pages++;
		pobjects += page->objects - page->inuse;

		page->pages = pages;
		page->pobjects = pobjects;
		page->next = oldpage;

2411 2412
	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
								!= oldpage);
2413
	if (unlikely(!slub_cpu_partial(s))) {
2414 2415 2416 2417 2418 2419 2420
		unsigned long flags;

		local_irq_save(flags);
		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
		local_irq_restore(flags);
	}
	preempt_enable();
2421
#endif	/* CONFIG_SLUB_CPU_PARTIAL */
2422 2423
}

2424
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
C
Christoph Lameter 已提交
2425
{
2426
	stat(s, CPUSLAB_FLUSH);
2427
	deactivate_slab(s, c->page, c->freelist, c);
2428 2429

	c->tid = next_tid(c->tid);
C
Christoph Lameter 已提交
2430 2431 2432 2433
}

/*
 * Flush cpu slab.
C
Christoph Lameter 已提交
2434
 *
C
Christoph Lameter 已提交
2435 2436
 * Called from IPI handler with interrupts disabled.
 */
2437
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
C
Christoph Lameter 已提交
2438
{
2439
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
C
Christoph Lameter 已提交
2440

2441 2442
	if (c->page)
		flush_slab(s, c);
2443

2444
	unfreeze_partials(s, c);
C
Christoph Lameter 已提交
2445 2446 2447 2448 2449 2450
}

static void flush_cpu_slab(void *d)
{
	struct kmem_cache *s = d;

2451
	__flush_cpu_slab(s, smp_processor_id());
C
Christoph Lameter 已提交
2452 2453
}

2454 2455 2456 2457 2458
static bool has_cpu_slab(int cpu, void *info)
{
	struct kmem_cache *s = info;
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);

2459
	return c->page || slub_percpu_partial(c);
2460 2461
}

C
Christoph Lameter 已提交
2462 2463
static void flush_all(struct kmem_cache *s)
{
2464
	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
C
Christoph Lameter 已提交
2465 2466
}

2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485
/*
 * Use the cpu notifier to insure that the cpu slabs are flushed when
 * necessary.
 */
static int slub_cpu_dead(unsigned int cpu)
{
	struct kmem_cache *s;
	unsigned long flags;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list) {
		local_irq_save(flags);
		__flush_cpu_slab(s, cpu);
		local_irq_restore(flags);
	}
	mutex_unlock(&slab_mutex);
	return 0;
}

2486 2487 2488 2489
/*
 * Check if the objects in a per cpu structure fit numa
 * locality expectations.
 */
2490
static inline int node_match(struct page *page, int node)
2491 2492
{
#ifdef CONFIG_NUMA
2493
	if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2494 2495 2496 2497 2498
		return 0;
#endif
	return 1;
}

2499
#ifdef CONFIG_SLUB_DEBUG
P
Pekka Enberg 已提交
2500 2501 2502 2503 2504
static int count_free(struct page *page)
{
	return page->objects - page->inuse;
}

2505 2506 2507 2508 2509 2510 2511
static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->total_objects);
}
#endif /* CONFIG_SLUB_DEBUG */

#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
P
Pekka Enberg 已提交
2512 2513 2514 2515 2516 2517 2518 2519
static unsigned long count_partial(struct kmem_cache_node *n,
					int (*get_count)(struct page *))
{
	unsigned long flags;
	unsigned long x = 0;
	struct page *page;

	spin_lock_irqsave(&n->list_lock, flags);
2520
	list_for_each_entry(page, &n->partial, slab_list)
P
Pekka Enberg 已提交
2521 2522 2523 2524
		x += get_count(page);
	spin_unlock_irqrestore(&n->list_lock, flags);
	return x;
}
2525
#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
2526

P
Pekka Enberg 已提交
2527 2528 2529
static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
2530 2531 2532
#ifdef CONFIG_SLUB_DEBUG
	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
P
Pekka Enberg 已提交
2533
	int node;
C
Christoph Lameter 已提交
2534
	struct kmem_cache_node *n;
P
Pekka Enberg 已提交
2535

2536 2537 2538
	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
		return;

2539 2540
	pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
		nid, gfpflags, &gfpflags);
2541
	pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
2542 2543
		s->name, s->object_size, s->size, oo_order(s->oo),
		oo_order(s->min));
P
Pekka Enberg 已提交
2544

2545
	if (oo_order(s->min) > get_order(s->object_size))
2546 2547
		pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
			s->name);
2548

C
Christoph Lameter 已提交
2549
	for_each_kmem_cache_node(s, node, n) {
P
Pekka Enberg 已提交
2550 2551 2552 2553
		unsigned long nr_slabs;
		unsigned long nr_objs;
		unsigned long nr_free;

2554 2555 2556
		nr_free  = count_partial(n, count_free);
		nr_slabs = node_nr_slabs(n);
		nr_objs  = node_nr_objs(n);
P
Pekka Enberg 已提交
2557

2558
		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
P
Pekka Enberg 已提交
2559 2560
			node, nr_slabs, nr_objs, nr_free);
	}
2561
#endif
P
Pekka Enberg 已提交
2562 2563
}

2564 2565 2566
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
			int node, struct kmem_cache_cpu **pc)
{
2567
	void *freelist;
2568 2569
	struct kmem_cache_cpu *c = *pc;
	struct page *page;
2570

2571 2572
	WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));

2573
	freelist = get_partial(s, flags, node, c);
2574

2575 2576 2577 2578
	if (freelist)
		return freelist;

	page = new_slab(s, flags, node);
2579
	if (page) {
2580
		c = raw_cpu_ptr(s->cpu_slab);
2581 2582 2583 2584 2585 2586 2587
		if (c->page)
			flush_slab(s, c);

		/*
		 * No other reference to the page yet so we can
		 * muck around with it freely without cmpxchg
		 */
2588
		freelist = page->freelist;
2589 2590 2591 2592 2593
		page->freelist = NULL;

		stat(s, ALLOC_SLAB);
		c->page = page;
		*pc = c;
2594
	}
2595

2596
	return freelist;
2597 2598
}

2599 2600 2601 2602 2603 2604 2605 2606
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
{
	if (unlikely(PageSlabPfmemalloc(page)))
		return gfp_pfmemalloc_allowed(gfpflags);

	return true;
}

2607
/*
2608 2609
 * Check the page->freelist of a page and either transfer the freelist to the
 * per cpu freelist or deactivate the page.
2610 2611 2612 2613
 *
 * The page is still frozen if the return value is not NULL.
 *
 * If this function returns NULL then the page has been unfrozen.
2614 2615
 *
 * This function must be called with interrupt disabled.
2616 2617 2618 2619 2620 2621 2622 2623 2624 2625
 */
static inline void *get_freelist(struct kmem_cache *s, struct page *page)
{
	struct page new;
	unsigned long counters;
	void *freelist;

	do {
		freelist = page->freelist;
		counters = page->counters;
2626

2627
		new.counters = counters;
2628
		VM_BUG_ON(!new.frozen);
2629 2630 2631 2632

		new.inuse = page->objects;
		new.frozen = freelist != NULL;

2633
	} while (!__cmpxchg_double_slab(s, page,
2634 2635 2636 2637 2638 2639 2640
		freelist, counters,
		NULL, new.counters,
		"get_freelist"));

	return freelist;
}

C
Christoph Lameter 已提交
2641
/*
2642 2643 2644 2645 2646 2647
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
C
Christoph Lameter 已提交
2648
 *
2649 2650 2651
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
C
Christoph Lameter 已提交
2652
 *
2653
 * And if we were unable to get a new slab from the partial slab lists then
C
Christoph Lameter 已提交
2654 2655
 * we need to allocate a new slab. This is the slowest path since it involves
 * a call to the page allocator and the setup of a new slab.
2656 2657 2658
 *
 * Version of __slab_alloc to use when we know that interrupts are
 * already disabled (which is the case for bulk allocation).
C
Christoph Lameter 已提交
2659
 */
2660
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2661
			  unsigned long addr, struct kmem_cache_cpu *c)
C
Christoph Lameter 已提交
2662
{
2663
	void *freelist;
2664
	struct page *page;
C
Christoph Lameter 已提交
2665

2666 2667
	stat(s, ALLOC_SLOWPATH);

2668
	page = c->page;
2669 2670 2671 2672 2673 2674 2675 2676
	if (!page) {
		/*
		 * if the node is not online or has no normal memory, just
		 * ignore the node constraint
		 */
		if (unlikely(node != NUMA_NO_NODE &&
			     !node_state(node, N_NORMAL_MEMORY)))
			node = NUMA_NO_NODE;
C
Christoph Lameter 已提交
2677
		goto new_slab;
2678
	}
2679
redo:
2680

2681
	if (unlikely(!node_match(page, node))) {
2682 2683 2684 2685 2686 2687 2688 2689
		/*
		 * same as above but node_match() being false already
		 * implies node != NUMA_NO_NODE
		 */
		if (!node_state(node, N_NORMAL_MEMORY)) {
			node = NUMA_NO_NODE;
			goto redo;
		} else {
2690
			stat(s, ALLOC_NODE_MISMATCH);
2691
			deactivate_slab(s, page, c->freelist, c);
2692 2693
			goto new_slab;
		}
2694
	}
C
Christoph Lameter 已提交
2695

2696 2697 2698 2699 2700 2701
	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
	if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2702
		deactivate_slab(s, page, c->freelist, c);
2703 2704 2705
		goto new_slab;
	}

2706
	/* must check again c->freelist in case of cpu migration or IRQ */
2707 2708
	freelist = c->freelist;
	if (freelist)
2709
		goto load_freelist;
2710

2711
	freelist = get_freelist(s, page);
C
Christoph Lameter 已提交
2712

2713
	if (!freelist) {
2714 2715
		c->page = NULL;
		stat(s, DEACTIVATE_BYPASS);
2716
		goto new_slab;
2717
	}
C
Christoph Lameter 已提交
2718

2719
	stat(s, ALLOC_REFILL);
C
Christoph Lameter 已提交
2720

2721
load_freelist:
2722 2723 2724 2725 2726
	/*
	 * freelist is pointing to the list of objects to be used.
	 * page is pointing to the page from which the objects are obtained.
	 * That page must be frozen for per cpu allocations to work.
	 */
2727
	VM_BUG_ON(!c->page->frozen);
2728
	c->freelist = get_freepointer(s, freelist);
2729
	c->tid = next_tid(c->tid);
2730
	return freelist;
C
Christoph Lameter 已提交
2731 2732

new_slab:
2733

2734 2735 2736
	if (slub_percpu_partial(c)) {
		page = c->page = slub_percpu_partial(c);
		slub_set_percpu_partial(c, page);
2737 2738
		stat(s, CPU_PARTIAL_ALLOC);
		goto redo;
C
Christoph Lameter 已提交
2739 2740
	}

2741
	freelist = new_slab_objects(s, gfpflags, node, &c);
2742

2743
	if (unlikely(!freelist)) {
2744
		slab_out_of_memory(s, gfpflags, node);
2745
		return NULL;
C
Christoph Lameter 已提交
2746
	}
2747

2748
	page = c->page;
2749
	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2750
		goto load_freelist;
2751

2752
	/* Only entered in the debug case */
2753 2754
	if (kmem_cache_debug(s) &&
			!alloc_debug_processing(s, page, freelist, addr))
2755
		goto new_slab;	/* Slab failed checks. Next slab needed */
2756

2757
	deactivate_slab(s, page, get_freepointer(s, freelist), c);
2758
	return freelist;
2759 2760
}

2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771
/*
 * Another one that disabled interrupt and compensates for possible
 * cpu changes by refetching the per cpu area pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *p;
	unsigned long flags;

	local_irq_save(flags);
2772
#ifdef CONFIG_PREEMPTION
2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling interrupts. Need to reload cpu area
	 * pointer.
	 */
	c = this_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c);
	local_irq_restore(flags);
	return p;
}

2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796
/*
 * If the object has been wiped upon free, make sure it's fully initialized by
 * zeroing out freelist pointer.
 */
static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
						   void *obj)
{
	if (unlikely(slab_want_init_on_free(s)) && obj)
		memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
}

2797 2798 2799 2800 2801 2802 2803 2804 2805 2806
/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
2807
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2808
		gfp_t gfpflags, int node, unsigned long addr)
2809
{
2810
	void *object;
2811
	struct kmem_cache_cpu *c;
2812
	struct page *page;
2813
	unsigned long tid;
2814
	struct obj_cgroup *objcg = NULL;
2815

2816
	s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
2817
	if (!s)
A
Akinobu Mita 已提交
2818
		return NULL;
2819 2820 2821 2822 2823 2824
redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
2825
	 *
2826
	 * We should guarantee that tid and kmem_cache are retrieved on
2827
	 * the same cpu. It could be different if CONFIG_PREEMPTION so we need
2828
	 * to check if it is matched or not.
2829
	 */
2830 2831 2832
	do {
		tid = this_cpu_read(s->cpu_slab->tid);
		c = raw_cpu_ptr(s->cpu_slab);
2833
	} while (IS_ENABLED(CONFIG_PREEMPTION) &&
2834
		 unlikely(tid != READ_ONCE(c->tid)));
2835 2836 2837 2838 2839 2840 2841 2842 2843 2844

	/*
	 * Irqless object alloc/free algorithm used here depends on sequence
	 * of fetching cpu_slab's data. tid should be fetched before anything
	 * on c to guarantee that object and page associated with previous tid
	 * won't be used with current tid. If we fetch tid first, object and
	 * page could be one associated with next tid and our alloc/free
	 * request will be failed. In this case, we will retry. So, no problem.
	 */
	barrier();
2845 2846 2847 2848 2849 2850 2851 2852

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */

2853
	object = c->freelist;
2854
	page = c->page;
2855
	if (unlikely(!object || !page || !node_match(page, node))) {
2856
		object = __slab_alloc(s, gfpflags, node, addr, c);
D
Dave Hansen 已提交
2857
	} else {
2858 2859
		void *next_object = get_freepointer_safe(s, object);

2860
		/*
L
Lucas De Marchi 已提交
2861
		 * The cmpxchg will only match if there was no additional
2862 2863
		 * operation and if we are on the right processor.
		 *
2864 2865
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
2866 2867 2868 2869
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
2870 2871 2872
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
2873
		 */
2874
		if (unlikely(!this_cpu_cmpxchg_double(
2875 2876
				s->cpu_slab->freelist, s->cpu_slab->tid,
				object, tid,
2877
				next_object, next_tid(tid)))) {
2878 2879 2880 2881

			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
2882
		prefetch_freepointer(s, next_object);
2883
		stat(s, ALLOC_FASTPATH);
2884
	}
2885

2886
	maybe_wipe_obj_freeptr(s, kasan_reset_tag(object));
2887

2888
	if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
2889
		memset(kasan_reset_tag(object), 0, s->object_size);
2890

2891
	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
V
Vegard Nossum 已提交
2892

2893
	return object;
C
Christoph Lameter 已提交
2894 2895
}

2896 2897 2898 2899 2900 2901
static __always_inline void *slab_alloc(struct kmem_cache *s,
		gfp_t gfpflags, unsigned long addr)
{
	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
}

C
Christoph Lameter 已提交
2902 2903
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
2904
	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
E
Eduard - Gabriel Munteanu 已提交
2905

2906 2907
	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
				s->size, gfpflags);
E
Eduard - Gabriel Munteanu 已提交
2908 2909

	return ret;
C
Christoph Lameter 已提交
2910 2911 2912
}
EXPORT_SYMBOL(kmem_cache_alloc);

2913
#ifdef CONFIG_TRACING
2914 2915
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
2916
	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2917
	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2918
	ret = kasan_kmalloc(s, ret, size, gfpflags);
2919 2920 2921
	return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
E
Eduard - Gabriel Munteanu 已提交
2922 2923
#endif

C
Christoph Lameter 已提交
2924 2925 2926
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
2927
	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
E
Eduard - Gabriel Munteanu 已提交
2928

2929
	trace_kmem_cache_alloc_node(_RET_IP_, ret,
2930
				    s->object_size, s->size, gfpflags, node);
E
Eduard - Gabriel Munteanu 已提交
2931 2932

	return ret;
C
Christoph Lameter 已提交
2933 2934 2935
}
EXPORT_SYMBOL(kmem_cache_alloc_node);

2936
#ifdef CONFIG_TRACING
2937
void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
E
Eduard - Gabriel Munteanu 已提交
2938
				    gfp_t gfpflags,
2939
				    int node, size_t size)
E
Eduard - Gabriel Munteanu 已提交
2940
{
2941
	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2942 2943 2944

	trace_kmalloc_node(_RET_IP_, ret,
			   size, s->size, gfpflags, node);
2945

2946
	ret = kasan_kmalloc(s, ret, size, gfpflags);
2947
	return ret;
E
Eduard - Gabriel Munteanu 已提交
2948
}
2949
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
E
Eduard - Gabriel Munteanu 已提交
2950
#endif
2951
#endif	/* CONFIG_NUMA */
E
Eduard - Gabriel Munteanu 已提交
2952

C
Christoph Lameter 已提交
2953
/*
K
Kim Phillips 已提交
2954
 * Slow path handling. This may still be called frequently since objects
2955
 * have a longer lifetime than the cpu slabs in most processing loads.
C
Christoph Lameter 已提交
2956
 *
2957 2958 2959
 * So we still attempt to reduce cache line usage. Just take the slab
 * lock and free the item. If there is no additional partial page
 * handling required then we can return immediately.
C
Christoph Lameter 已提交
2960
 */
2961
static void __slab_free(struct kmem_cache *s, struct page *page,
2962 2963 2964
			void *head, void *tail, int cnt,
			unsigned long addr)

C
Christoph Lameter 已提交
2965 2966
{
	void *prior;
2967 2968 2969 2970
	int was_frozen;
	struct page new;
	unsigned long counters;
	struct kmem_cache_node *n = NULL;
2971
	unsigned long flags;
C
Christoph Lameter 已提交
2972

2973
	stat(s, FREE_SLOWPATH);
C
Christoph Lameter 已提交
2974

2975
	if (kmem_cache_debug(s) &&
2976
	    !free_debug_processing(s, page, head, tail, cnt, addr))
2977
		return;
C
Christoph Lameter 已提交
2978

2979
	do {
2980 2981 2982 2983
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}
2984 2985
		prior = page->freelist;
		counters = page->counters;
2986
		set_freepointer(s, tail, prior);
2987 2988
		new.counters = counters;
		was_frozen = new.frozen;
2989
		new.inuse -= cnt;
2990
		if ((!new.inuse || !prior) && !was_frozen) {
2991

P
Peter Zijlstra 已提交
2992
			if (kmem_cache_has_cpu_partial(s) && !prior) {
2993 2994

				/*
2995 2996 2997 2998
				 * Slab was on no list before and will be
				 * partially empty
				 * We can defer the list move and instead
				 * freeze it.
2999 3000 3001
				 */
				new.frozen = 1;

P
Peter Zijlstra 已提交
3002
			} else { /* Needs to be taken off a list */
3003

3004
				n = get_node(s, page_to_nid(page));
3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015
				/*
				 * Speculatively acquire the list_lock.
				 * If the cmpxchg does not succeed then we may
				 * drop the list_lock without any processing.
				 *
				 * Otherwise the list_lock will synchronize with
				 * other processors updating the list of slabs.
				 */
				spin_lock_irqsave(&n->list_lock, flags);

			}
3016
		}
C
Christoph Lameter 已提交
3017

3018 3019
	} while (!cmpxchg_double_slab(s, page,
		prior, counters,
3020
		head, new.counters,
3021
		"__slab_free"));
C
Christoph Lameter 已提交
3022

3023
	if (likely(!n)) {
3024

3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035
		if (likely(was_frozen)) {
			/*
			 * The list lock was not taken therefore no list
			 * activity can be necessary.
			 */
			stat(s, FREE_FROZEN);
		} else if (new.frozen) {
			/*
			 * If we just froze the page then put it onto the
			 * per cpu partial list.
			 */
3036
			put_cpu_partial(s, page, 1);
3037 3038
			stat(s, CPU_PARTIAL_FREE);
		}
3039

3040 3041
		return;
	}
C
Christoph Lameter 已提交
3042

3043
	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
3044 3045
		goto slab_empty;

C
Christoph Lameter 已提交
3046
	/*
3047 3048
	 * Objects left in the slab. If it was not on the partial list before
	 * then add it.
C
Christoph Lameter 已提交
3049
	 */
3050
	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
3051
		remove_full(s, n, page);
3052 3053
		add_partial(n, page, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
3054
	}
3055
	spin_unlock_irqrestore(&n->list_lock, flags);
C
Christoph Lameter 已提交
3056 3057 3058
	return;

slab_empty:
3059
	if (prior) {
C
Christoph Lameter 已提交
3060
		/*
3061
		 * Slab on the partial list.
C
Christoph Lameter 已提交
3062
		 */
3063
		remove_partial(n, page);
3064
		stat(s, FREE_REMOVE_PARTIAL);
P
Peter Zijlstra 已提交
3065
	} else {
3066
		/* Slab must be on the full list */
P
Peter Zijlstra 已提交
3067 3068
		remove_full(s, n, page);
	}
3069

3070
	spin_unlock_irqrestore(&n->list_lock, flags);
3071
	stat(s, FREE_SLAB);
C
Christoph Lameter 已提交
3072 3073 3074
	discard_slab(s, page);
}

3075 3076 3077 3078 3079 3080 3081 3082 3083 3084
/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
3085 3086 3087 3088
 *
 * Bulk free of a freelist with several objects (all pointing to the
 * same page) possible by specifying head and tail ptr, plus objects
 * count (cnt). Bulk free indicated by tail pointer being set.
3089
 */
3090 3091 3092
static __always_inline void do_slab_free(struct kmem_cache *s,
				struct page *page, void *head, void *tail,
				int cnt, unsigned long addr)
3093
{
3094
	void *tail_obj = tail ? : head;
3095
	struct kmem_cache_cpu *c;
3096
	unsigned long tid;
3097

3098
	memcg_slab_free_hook(s, &head, 1);
3099 3100 3101 3102 3103
redo:
	/*
	 * Determine the currently cpus per cpu slab.
	 * The cpu may change afterward. However that does not matter since
	 * data is retrieved via this pointer. If we are on the same cpu
3104
	 * during the cmpxchg then the free will succeed.
3105
	 */
3106 3107 3108
	do {
		tid = this_cpu_read(s->cpu_slab->tid);
		c = raw_cpu_ptr(s->cpu_slab);
3109
	} while (IS_ENABLED(CONFIG_PREEMPTION) &&
3110
		 unlikely(tid != READ_ONCE(c->tid)));
3111

3112 3113
	/* Same with comment on barrier() in slab_alloc_node() */
	barrier();
3114

3115
	if (likely(page == c->page)) {
3116 3117 3118
		void **freelist = READ_ONCE(c->freelist);

		set_freepointer(s, tail_obj, freelist);
3119

3120
		if (unlikely(!this_cpu_cmpxchg_double(
3121
				s->cpu_slab->freelist, s->cpu_slab->tid,
3122
				freelist, tid,
3123
				head, next_tid(tid)))) {
3124 3125 3126 3127

			note_cmpxchg_failure("slab_free", s, tid);
			goto redo;
		}
3128
		stat(s, FREE_FASTPATH);
3129
	} else
3130
		__slab_free(s, page, head, tail_obj, cnt, addr);
3131 3132 3133

}

3134 3135 3136 3137 3138
static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
				      void *head, void *tail, int cnt,
				      unsigned long addr)
{
	/*
3139 3140
	 * With KASAN enabled slab_free_freelist_hook modifies the freelist
	 * to remove objects, whose reuse must be delayed.
3141
	 */
3142 3143
	if (slab_free_freelist_hook(s, &head, &tail))
		do_slab_free(s, page, head, tail, cnt, addr);
3144 3145
}

3146
#ifdef CONFIG_KASAN_GENERIC
3147 3148 3149 3150 3151 3152
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
	do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
}
#endif

C
Christoph Lameter 已提交
3153 3154
void kmem_cache_free(struct kmem_cache *s, void *x)
{
3155 3156
	s = cache_from_obj(s, x);
	if (!s)
3157
		return;
3158
	slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
3159
	trace_kmem_cache_free(_RET_IP_, x);
C
Christoph Lameter 已提交
3160 3161 3162
}
EXPORT_SYMBOL(kmem_cache_free);

3163
struct detached_freelist {
3164
	struct page *page;
3165 3166 3167
	void *tail;
	void *freelist;
	int cnt;
3168
	struct kmem_cache *s;
3169
};
3170

3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182
/*
 * This function progressively scans the array with free objects (with
 * a limited look ahead) and extract objects belonging to the same
 * page.  It builds a detached freelist directly within the given
 * page/objects.  This can happen without any need for
 * synchronization, because the objects are owned by running process.
 * The freelist is build up as a single linked list in the objects.
 * The idea is, that this detached freelist can then be bulk
 * transferred to the real freelist(s), but only requiring a single
 * synchronization primitive.  Look ahead in the array is limited due
 * to performance reasons.
 */
3183 3184 3185
static inline
int build_detached_freelist(struct kmem_cache *s, size_t size,
			    void **p, struct detached_freelist *df)
3186 3187 3188 3189
{
	size_t first_skipped_index = 0;
	int lookahead = 3;
	void *object;
3190
	struct page *page;
3191

3192 3193
	/* Always re-init detached_freelist */
	df->page = NULL;
3194

3195 3196
	do {
		object = p[--size];
3197
		/* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
3198
	} while (!object && size);
3199

3200 3201
	if (!object)
		return 0;
3202

3203 3204 3205 3206 3207 3208
	page = virt_to_head_page(object);
	if (!s) {
		/* Handle kalloc'ed objects */
		if (unlikely(!PageSlab(page))) {
			BUG_ON(!PageCompound(page));
			kfree_hook(object);
3209
			__free_pages(page, compound_order(page));
3210 3211 3212 3213 3214 3215 3216 3217
			p[size] = NULL; /* mark object processed */
			return size;
		}
		/* Derive kmem_cache from object */
		df->s = page->slab_cache;
	} else {
		df->s = cache_from_obj(s, object); /* Support for memcg */
	}
3218

3219
	/* Start new detached freelist */
3220
	df->page = page;
3221
	set_freepointer(df->s, object, NULL);
3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234
	df->tail = object;
	df->freelist = object;
	p[size] = NULL; /* mark object processed */
	df->cnt = 1;

	while (size) {
		object = p[--size];
		if (!object)
			continue; /* Skip processed objects */

		/* df->page is always set at this point */
		if (df->page == virt_to_head_page(object)) {
			/* Opportunity build freelist */
3235
			set_freepointer(df->s, object, df->freelist);
3236 3237 3238 3239 3240
			df->freelist = object;
			df->cnt++;
			p[size] = NULL; /* mark object processed */

			continue;
3241
		}
3242 3243 3244 3245 3246 3247 3248

		/* Limit look ahead search */
		if (!--lookahead)
			break;

		if (!first_skipped_index)
			first_skipped_index = size + 1;
3249
	}
3250 3251 3252 3253 3254

	return first_skipped_index;
}

/* Note that interrupts must be enabled when calling this function. */
3255
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
3256 3257 3258 3259
{
	if (WARN_ON(!size))
		return;

3260
	memcg_slab_free_hook(s, p, size);
3261 3262 3263 3264
	do {
		struct detached_freelist df;

		size = build_detached_freelist(s, size, p, &df);
A
Arnd Bergmann 已提交
3265
		if (!df.page)
3266 3267
			continue;

3268
		slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
3269
	} while (likely(size));
3270 3271 3272
}
EXPORT_SYMBOL(kmem_cache_free_bulk);

3273
/* Note that interrupts must be enabled when calling this function. */
3274 3275
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
			  void **p)
3276
{
3277 3278
	struct kmem_cache_cpu *c;
	int i;
3279
	struct obj_cgroup *objcg = NULL;
3280

3281
	/* memcg and kmem_cache debug support */
3282
	s = slab_pre_alloc_hook(s, &objcg, size, flags);
3283 3284
	if (unlikely(!s))
		return false;
3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295
	/*
	 * Drain objects in the per cpu slab, while disabling local
	 * IRQs, which protects against PREEMPT and interrupts
	 * handlers invoking normal fastpath.
	 */
	local_irq_disable();
	c = this_cpu_ptr(s->cpu_slab);

	for (i = 0; i < size; i++) {
		void *object = c->freelist;

3296
		if (unlikely(!object)) {
3297 3298 3299 3300 3301 3302 3303 3304 3305
			/*
			 * We may have removed an object from c->freelist using
			 * the fastpath in the previous iteration; in that case,
			 * c->tid has not been bumped yet.
			 * Since ___slab_alloc() may reenable interrupts while
			 * allocating memory, we should bump c->tid now.
			 */
			c->tid = next_tid(c->tid);

3306 3307 3308 3309
			/*
			 * Invoking slow path likely have side-effect
			 * of re-populating per CPU c->freelist
			 */
3310
			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
3311
					    _RET_IP_, c);
3312 3313 3314
			if (unlikely(!p[i]))
				goto error;

3315
			c = this_cpu_ptr(s->cpu_slab);
3316 3317
			maybe_wipe_obj_freeptr(s, p[i]);

3318 3319
			continue; /* goto for-loop */
		}
3320 3321
		c->freelist = get_freepointer(s, object);
		p[i] = object;
3322
		maybe_wipe_obj_freeptr(s, p[i]);
3323 3324 3325 3326 3327
	}
	c->tid = next_tid(c->tid);
	local_irq_enable();

	/* Clear memory outside IRQ disabled fastpath loop */
3328
	if (unlikely(slab_want_init_on_alloc(flags, s))) {
3329 3330 3331 3332 3333 3334
		int j;

		for (j = 0; j < i; j++)
			memset(p[j], 0, s->object_size);
	}

3335
	/* memcg and kmem_cache debug support */
3336
	slab_post_alloc_hook(s, objcg, flags, size, p);
3337
	return i;
3338 3339
error:
	local_irq_enable();
3340
	slab_post_alloc_hook(s, objcg, flags, i, p);
3341
	__kmem_cache_free_bulk(s, i, p);
3342
	return 0;
3343 3344 3345 3346
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);


C
Christoph Lameter 已提交
3347
/*
C
Christoph Lameter 已提交
3348 3349 3350 3351
 * Object placement in a slab is made very easy because we always start at
 * offset 0. If we tune the size of the object to the alignment then we can
 * get the required alignment by putting one properly sized object after
 * another.
C
Christoph Lameter 已提交
3352 3353 3354 3355
 *
 * Notice that the allocation order determines the sizes of the per cpu
 * caches. Each processor has always one slab available for allocations.
 * Increasing the allocation order reduces the number of times that slabs
C
Christoph Lameter 已提交
3356
 * must be moved on and off the partial lists and is therefore a factor in
C
Christoph Lameter 已提交
3357 3358 3359 3360 3361 3362 3363 3364 3365
 * locking overhead.
 */

/*
 * Mininum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
3366 3367 3368
static unsigned int slub_min_order;
static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
static unsigned int slub_min_objects;
C
Christoph Lameter 已提交
3369 3370 3371 3372

/*
 * Calculate the order of allocation given an slab object size.
 *
C
Christoph Lameter 已提交
3373 3374 3375 3376
 * The order of allocation has significant impact on performance and other
 * system components. Generally order 0 allocations should be preferred since
 * order 0 does not cause fragmentation in the page allocator. Larger objects
 * be problematic to put into order 0 slabs because there may be too much
C
Christoph Lameter 已提交
3377
 * unused space left. We go to a higher order if more than 1/16th of the slab
C
Christoph Lameter 已提交
3378 3379 3380 3381 3382 3383
 * would be wasted.
 *
 * In order to reach satisfactory performance we must ensure that a minimum
 * number of objects is in one slab. Otherwise we may generate too much
 * activity on the partial lists which requires taking the list_lock. This is
 * less a concern for large slabs though which are rarely used.
C
Christoph Lameter 已提交
3384
 *
C
Christoph Lameter 已提交
3385 3386 3387 3388
 * slub_max_order specifies the order where we begin to stop considering the
 * number of objects in a slab as critical. If we reach slub_max_order then
 * we try to keep the page order as low as possible. So we accept more waste
 * of space in favor of a small page order.
C
Christoph Lameter 已提交
3389
 *
C
Christoph Lameter 已提交
3390 3391 3392 3393
 * Higher order allocations also allow the placement of more objects in a
 * slab and thereby reduce object handling overhead. If the user has
 * requested a higher mininum order then we start with that one instead of
 * the smallest order which will fit the object.
C
Christoph Lameter 已提交
3394
 */
3395 3396
static inline unsigned int slab_order(unsigned int size,
		unsigned int min_objects, unsigned int max_order,
3397
		unsigned int fract_leftover)
C
Christoph Lameter 已提交
3398
{
3399 3400
	unsigned int min_order = slub_min_order;
	unsigned int order;
C
Christoph Lameter 已提交
3401

3402
	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
3403
		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
3404

3405
	for (order = max(min_order, (unsigned int)get_order(min_objects * size));
3406
			order <= max_order; order++) {
C
Christoph Lameter 已提交
3407

3408 3409
		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
		unsigned int rem;
C
Christoph Lameter 已提交
3410

3411
		rem = slab_size % size;
C
Christoph Lameter 已提交
3412

3413
		if (rem <= slab_size / fract_leftover)
C
Christoph Lameter 已提交
3414 3415
			break;
	}
C
Christoph Lameter 已提交
3416

C
Christoph Lameter 已提交
3417 3418 3419
	return order;
}

static inline int calculate_order(unsigned int size)
{
	unsigned int order;
	unsigned int min_objects;
	unsigned int max_objects;

	/*
	 * Attempt to find best configuration for a slab. This
	 * works by first attempting to generate a layout with
	 * the best configuration and backing off gradually.
	 *
	 * First we increase the acceptable waste in a slab. Then
	 * we reduce the minimum objects required in a slab.
	 */
	min_objects = slub_min_objects;
	if (!min_objects)
		min_objects = 4 * (fls(num_online_cpus()) + 1);
	max_objects = order_objects(slub_max_order, size);
	min_objects = min(min_objects, max_objects);

	while (min_objects > 1) {
		unsigned int fraction;

		fraction = 16;
		while (fraction >= 4) {
			order = slab_order(size, min_objects,
					slub_max_order, fraction);
			if (order <= slub_max_order)
				return order;
			fraction /= 2;
		}
		min_objects--;
	}

	/*
	 * We were unable to place multiple objects in a slab. Now
	 * let's see if we can place a single object there.
	 */
	order = slab_order(size, 1, slub_max_order, 1);
	if (order <= slub_max_order)
		return order;

	/*
	 * This slab cannot be placed within slub_max_order.
	 */
	order = slab_order(size, 1, MAX_ORDER, 1);
	if (order < MAX_ORDER)
		return order;
	return -ENOSYS;
}
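
/*
 * Example of the back-off above (a sketch; the starting min_objects depends
 * on the number of online CPUs): with min_objects = 8, the loop first tries
 * fractions 16, 8 and 4 at eight objects per slab, and only then decrements
 * min_objects. The first (min_objects, fraction) pair whose slab_order()
 * result stays within slub_max_order is used.
 */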

static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
	n->nr_partial = 0;
	spin_lock_init(&n->list_lock);
	INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_set(&n->nr_slabs, 0);
	atomic_long_set(&n->total_objects, 0);
	INIT_LIST_HEAD(&n->full);
#endif
}

static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
			KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));

	/*
	 * Must align to double word boundary for the double cmpxchg
	 * instructions to work; see __pcpu_double_call_return_bool().
	 */
	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
				     2 * sizeof(void *));

	if (!s->cpu_slab)
		return 0;

	init_kmem_cache_cpus(s);

	return 1;
}

static struct kmem_cache *kmem_cache_node;

/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
 * Note that this function only works on the kmem_cache_node
 * when allocating for the kmem_cache_node. This is used for bootstrapping
 * memory on a fresh node that has no slab structures yet.
 */
static void early_kmem_cache_node_alloc(int node)
{
	struct page *page;
	struct kmem_cache_node *n;

	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));

	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);

	BUG_ON(!page);
	if (page_to_nid(page) != node) {
		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
	}

	n = page->freelist;
	BUG_ON(!n);
#ifdef CONFIG_SLUB_DEBUG
	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
	init_tracking(kmem_cache_node, n);
#endif
	n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
		      GFP_KERNEL);
	page->freelist = get_freepointer(kmem_cache_node, n);
	page->inuse = 1;
	page->frozen = 0;
	kmem_cache_node->node[node] = n;
	init_kmem_cache_node(n);
	inc_slabs_node(kmem_cache_node, node, page->objects);

	/*
	 * No locks need to be taken here as it has just been
	 * initialized and there is no concurrent access.
	 */
	__add_partial(n, page, DEACTIVATE_TO_HEAD);
}

static void free_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n) {
		s->node[node] = NULL;
		kmem_cache_free(kmem_cache_node, n);
	}
}

void __kmem_cache_release(struct kmem_cache *s)
{
	cache_random_seq_destroy(s);
	free_percpu(s->cpu_slab);
	free_kmem_cache_nodes(s);
}

static int init_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;

	for_each_node_state(node, N_NORMAL_MEMORY) {
		struct kmem_cache_node *n;

		if (slab_state == DOWN) {
			early_kmem_cache_node_alloc(node);
			continue;
		}
		n = kmem_cache_alloc_node(kmem_cache_node,
						GFP_KERNEL, node);

		if (!n) {
			free_kmem_cache_nodes(s);
			return 0;
		}

		init_kmem_cache_node(n);
		s->node[node] = n;
	}
	return 1;
}

static void set_min_partial(struct kmem_cache *s, unsigned long min)
{
	if (min < MIN_PARTIAL)
		min = MIN_PARTIAL;
	else if (min > MAX_PARTIAL)
		min = MAX_PARTIAL;
	s->min_partial = min;
}
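
/*
 * For instance (a sketch; the exact bounds come from MIN_PARTIAL and
 * MAX_PARTIAL above): kmem_cache_open() passes ilog2(s->size) / 2, so a
 * 4096-byte cache requests 6 partial slabs per node, while very small
 * caches are raised to MIN_PARTIAL and very large ones are capped at
 * MAX_PARTIAL.
 */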

static void set_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	/*
	 * cpu_partial determines the maximum number of objects kept in the
	 * per cpu partial lists of a processor.
	 *
	 * Per cpu partial lists mainly contain slabs that just have one
	 * object freed. If they are used for allocation then they can be
	 * filled up again with minimal effort. The slab will never hit the
	 * per node partial lists and therefore no locking will be required.
	 *
	 * This setting also determines
	 *
	 * A) The number of objects from per cpu partial slabs dumped to the
	 *    per node list when we reach the limit.
	 * B) The number of objects in cpu partial slabs to extract from the
	 *    per node list when we run out of per cpu objects. We only fetch
	 *    50% to keep some capacity around for frees.
	 */
	if (!kmem_cache_has_cpu_partial(s))
		slub_set_cpu_partial(s, 0);
	else if (s->size >= PAGE_SIZE)
		slub_set_cpu_partial(s, 2);
	else if (s->size >= 1024)
		slub_set_cpu_partial(s, 6);
	else if (s->size >= 256)
		slub_set_cpu_partial(s, 13);
	else
		slub_set_cpu_partial(s, 30);
#endif
}
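
/*
 * For example (a sketch of the thresholds above, assuming 4 KiB pages):
 * a cache with s->size == 512 keeps up to 13 objects on per cpu partial
 * slabs, a 2048-byte cache keeps 6, and any cache of a page or more keeps
 * only 2, since caching many large objects per cpu would pin too much
 * memory.
 */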

/*
 * calculate_sizes() determines the order and the distribution of data within
 * a slab object.
 */
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
	slab_flags_t flags = s->flags;
	unsigned int size = s->object_size;
	unsigned int freepointer_area;
	unsigned int order;

	/*
	 * Round up object size to the next word boundary. We can only
	 * place the free pointer at word boundaries and this determines
	 * the possible location of the free pointer.
	 */
	size = ALIGN(size, sizeof(void *));
	/*
	 * This is the area of the object where a freepointer can be
	 * safely written. If redzoning adds more to the inuse size, we
	 * can't use that portion for writing the freepointer, so
	 * s->offset must be limited within this for the general case.
	 */
	freepointer_area = size;

#ifdef CONFIG_SLUB_DEBUG
	/*
	 * Determine if we can poison the object itself. If the user of
	 * the slab may touch the object after free or before allocation
	 * then we should never poison the object itself.
	 */
	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
			!s->ctor)
		s->flags |= __OBJECT_POISON;
	else
		s->flags &= ~__OBJECT_POISON;


	/*
	 * If we are Redzoning then check if there is some space between the
	 * end of the object and the free pointer. If not then add an
	 * additional word to have some bytes to store Redzone information.
	 */
	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
		size += sizeof(void *);
#endif

	/*
	 * With that we have determined the number of bytes in actual use
	 * by the object. This is the potential offset to the free pointer.
	 */
	s->inuse = size;

	if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
		s->ctor)) {
		/*
		 * Relocate free pointer after the object if it is not
		 * permitted to overwrite the first word of the object on
		 * kmem_cache_free.
		 *
		 * This is the case if we do RCU, have a constructor or
		 * destructor or are poisoning the objects.
		 *
		 * The assumption that s->offset >= s->inuse means free
		 * pointer is outside of the object is used in the
		 * freeptr_outside_object() function. If that is no
		 * longer true, the function needs to be modified.
		 */
		s->offset = size;
		size += sizeof(void *);
	} else if (freepointer_area > sizeof(void *)) {
		/*
		 * Store freelist pointer near middle of object to keep
		 * it away from the edges of the object to avoid small
		 * sized over/underflows from neighboring allocations.
		 */
		s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
	}

#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_STORE_USER)
		/*
		 * Need to store information about allocs and frees after
		 * the object.
		 */
		size += 2 * sizeof(struct track);
#endif

	kasan_cache_create(s, &size, &s->flags);
#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_RED_ZONE) {
		/*
		 * Add some empty padding so that we can catch
		 * overwrites from earlier objects rather than let
		 * tracking information or the free pointer be
		 * corrupted if a user writes before the start
		 * of the object.
		 */
		size += sizeof(void *);

		s->red_left_pad = sizeof(void *);
		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
		size += s->red_left_pad;
	}
#endif

	/*
	 * SLUB stores one object immediately after another beginning from
	 * offset 0. In order to align the objects we have to simply size
	 * each object to conform to the alignment.
	 */
	size = ALIGN(size, s->align);
	s->size = size;
	s->reciprocal_size = reciprocal_value(size);
	if (forced_order >= 0)
		order = forced_order;
	else
		order = calculate_order(size);

	if ((int)order < 0)
		return 0;

	s->allocflags = 0;
	if (order)
		s->allocflags |= __GFP_COMP;

	if (s->flags & SLAB_CACHE_DMA)
		s->allocflags |= GFP_DMA;

	if (s->flags & SLAB_CACHE_DMA32)
		s->allocflags |= GFP_DMA32;

	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		s->allocflags |= __GFP_RECLAIMABLE;

	/*
	 * Determine the number of objects per slab
	 */
	s->oo = oo_make(order, size);
	s->min = oo_make(get_order(size), size);
	if (oo_objects(s->oo) > oo_objects(s->max))
		s->max = s->oo;

	return !!oo_objects(s->oo);
}
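
/*
 * A layout sketch for illustration (not authoritative; the exact numbers
 * depend on the debug flags and architecture): on a 64-bit kernel an
 * object_size of 24 is already word aligned. With SLAB_POISON or a
 * constructor the free pointer cannot share the object, so it is placed at
 * offset s->inuse and adds another word; SLAB_STORE_USER then appends two
 * struct track records and SLAB_RED_ZONE adds padding on both sides before
 * the final ALIGN(size, s->align).
 */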

3783
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
C
Christoph Lameter 已提交
3784
{
3785
	s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3786 3787 3788
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	s->random = get_random_long();
#endif
C
Christoph Lameter 已提交
3789

3790
	if (!calculate_sizes(s, -1))
C
Christoph Lameter 已提交
3791
		goto error;
3792 3793 3794 3795 3796
	if (disable_higher_order_debug) {
		/*
		 * Disable debugging flags that store metadata if the min slab
		 * order increased.
		 */
3797
		if (get_order(s->size) > get_order(s->object_size)) {
3798 3799 3800 3801 3802 3803
			s->flags &= ~DEBUG_METADATA_FLAGS;
			s->offset = 0;
			if (!calculate_sizes(s, -1))
				goto error;
		}
	}
C
Christoph Lameter 已提交
3804

3805 3806
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3807
	if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
3808 3809 3810 3811
		/* Enable fast mode */
		s->flags |= __CMPXCHG_DOUBLE;
#endif

3812 3813 3814 3815
	/*
	 * The larger the object size is, the more pages we want on the partial
	 * list to avoid pounding the page allocator excessively.
	 */
3816 3817
	set_min_partial(s, ilog2(s->size) / 2);

3818
	set_cpu_partial(s);
3819

C
Christoph Lameter 已提交
3820
#ifdef CONFIG_NUMA
3821
	s->remote_node_defrag_ratio = 1000;
C
Christoph Lameter 已提交
3822
#endif
T
Thomas Garnier 已提交
3823 3824 3825 3826 3827 3828 3829

	/* Initialize the pre-computed randomized freelist if slab is up */
	if (slab_state >= UP) {
		if (init_cache_random_seq(s))
			goto error;
	}

3830
	if (!init_kmem_cache_nodes(s))
3831
		goto error;
C
Christoph Lameter 已提交
3832

3833
	if (alloc_kmem_cache_cpus(s))
3834
		return 0;
3835

3836
	free_kmem_cache_nodes(s);
C
Christoph Lameter 已提交
3837
error:
3838
	return -EINVAL;
C
Christoph Lameter 已提交
3839 3840
}

3841
static void list_slab_objects(struct kmem_cache *s, struct page *page,
3842
			      const char *text)
3843 3844 3845
{
#ifdef CONFIG_SLUB_DEBUG
	void *addr = page_address(page);
3846
	unsigned long *map;
3847
	void *p;
3848

3849
	slab_err(s, page, text, s->name);
3850 3851
	slab_lock(page);

3852
	map = get_map(s, page);
3853 3854
	for_each_object(p, s, addr, page->objects) {

3855
		if (!test_bit(__obj_to_index(s, addr, p), map)) {
3856
			pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
3857 3858 3859
			print_tracking(s, p);
		}
	}
3860
	put_map(map);
3861 3862 3863 3864
	slab_unlock(page);
#endif
}

C
Christoph Lameter 已提交
3865
/*
 * Attempt to free all partial slabs on a node.
 * This is called from __kmem_cache_shutdown(). We must take list_lock
 * because a sysfs file might still access the partial list after the
 * shutdown has begun.
 */
C
Christoph Lameter 已提交
3870
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
C
Christoph Lameter 已提交
3871
{
3872
	LIST_HEAD(discard);
C
Christoph Lameter 已提交
3873 3874
	struct page *page, *h;

3875 3876
	BUG_ON(irqs_disabled());
	spin_lock_irq(&n->list_lock);
3877
	list_for_each_entry_safe(page, h, &n->partial, slab_list) {
C
Christoph Lameter 已提交
3878
		if (!page->inuse) {
3879
			remove_partial(n, page);
3880
			list_add(&page->slab_list, &discard);
3881 3882
		} else {
			list_slab_objects(s, page,
3883
			  "Objects remaining in %s on __kmem_cache_shutdown()");
C
Christoph Lameter 已提交
3884
		}
3885
	}
3886
	spin_unlock_irq(&n->list_lock);
3887

3888
	list_for_each_entry_safe(page, h, &discard, slab_list)
3889
		discard_slab(s, page);
C
Christoph Lameter 已提交
3890 3891
}

3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902
bool __kmem_cache_empty(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n)
		if (n->nr_partial || slabs_node(s, node))
			return false;
	return true;
}

C
Christoph Lameter 已提交
3903
/*
C
Christoph Lameter 已提交
3904
 * Release all resources used by a slab cache.
C
Christoph Lameter 已提交
3905
 */
3906
int __kmem_cache_shutdown(struct kmem_cache *s)
C
Christoph Lameter 已提交
3907 3908
{
	int node;
C
Christoph Lameter 已提交
3909
	struct kmem_cache_node *n;
C
Christoph Lameter 已提交
3910 3911 3912

	flush_all(s);
	/* Attempt to free all objects */
C
Christoph Lameter 已提交
3913
	for_each_kmem_cache_node(s, node, n) {
C
Christoph Lameter 已提交
3914 3915
		free_partial(s, n);
		if (n->nr_partial || slabs_node(s, node))
C
Christoph Lameter 已提交
3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926
			return 1;
	}
	return 0;
}

/********************************************************************
 *		Kmalloc subsystem
 *******************************************************************/

static int __init setup_slub_min_order(char *str)
{
	get_option(&str, (int *)&slub_min_order);

	return 1;
}

__setup("slub_min_order=", setup_slub_min_order);

static int __init setup_slub_max_order(char *str)
{
	get_option(&str, (int *)&slub_max_order);
	slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);

	return 1;
}

__setup("slub_max_order=", setup_slub_max_order);

static int __init setup_slub_min_objects(char *str)
{
	get_option(&str, (int *)&slub_min_objects);

	return 1;
}

__setup("slub_min_objects=", setup_slub_min_objects);

void *__kmalloc(size_t size, gfp_t flags)
{
3955
	struct kmem_cache *s;
E
Eduard - Gabriel Munteanu 已提交
3956
	void *ret;
C
Christoph Lameter 已提交
3957

3958
	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3959
		return kmalloc_large(size, flags);
3960

3961
	s = kmalloc_slab(size, flags);
3962 3963

	if (unlikely(ZERO_OR_NULL_PTR(s)))
3964 3965
		return s;

3966
	ret = slab_alloc(s, flags, _RET_IP_);
E
Eduard - Gabriel Munteanu 已提交
3967

3968
	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
E
Eduard - Gabriel Munteanu 已提交
3969

3970
	ret = kasan_kmalloc(s, ret, size, flags);
3971

E
Eduard - Gabriel Munteanu 已提交
3972
	return ret;
C
Christoph Lameter 已提交
3973 3974 3975
}
EXPORT_SYMBOL(__kmalloc);
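
/*
 * For illustration (a sketch; the exact cutoff depends on the
 * configuration): a kmalloc(64, GFP_KERNEL) request is served from the
 * matching kmalloc slab cache through slab_alloc() above, while a request
 * larger than KMALLOC_MAX_CACHE_SIZE, e.g. several pages, skips the slab
 * caches entirely and is handed to kmalloc_large(), which allocates
 * compound pages straight from the page allocator.
 */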

3976
#ifdef CONFIG_NUMA
3977 3978
static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
3979
	struct page *page;
3980
	void *ptr = NULL;
3981
	unsigned int order = get_order(size);
3982

3983
	flags |= __GFP_COMP;
3984 3985
	page = alloc_pages_node(node, flags, order);
	if (page) {
3986
		ptr = page_address(page);
3987 3988
		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
				    PAGE_SIZE << order);
3989
	}
3990

3991
	return kmalloc_large_node_hook(ptr, size, flags);
3992 3993
}

C
Christoph Lameter 已提交
3994 3995
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
3996
	struct kmem_cache *s;
E
Eduard - Gabriel Munteanu 已提交
3997
	void *ret;
C
Christoph Lameter 已提交
3998

3999
	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
E
Eduard - Gabriel Munteanu 已提交
4000 4001
		ret = kmalloc_large_node(size, flags, node);

4002 4003 4004
		trace_kmalloc_node(_RET_IP_, ret,
				   size, PAGE_SIZE << get_order(size),
				   flags, node);
E
Eduard - Gabriel Munteanu 已提交
4005 4006 4007

		return ret;
	}
4008

4009
	s = kmalloc_slab(size, flags);
4010 4011

	if (unlikely(ZERO_OR_NULL_PTR(s)))
4012 4013
		return s;

4014
	ret = slab_alloc_node(s, flags, node, _RET_IP_);
E
Eduard - Gabriel Munteanu 已提交
4015

4016
	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
E
Eduard - Gabriel Munteanu 已提交
4017

4018
	ret = kasan_kmalloc(s, ret, size, flags);
4019

E
Eduard - Gabriel Munteanu 已提交
4020
	return ret;
C
Christoph Lameter 已提交
4021 4022
}
EXPORT_SYMBOL(__kmalloc_node);
4023
#endif	/* CONFIG_NUMA */
C
Christoph Lameter 已提交
4024

K
Kees Cook 已提交
4025 4026
#ifdef CONFIG_HARDENED_USERCOPY
/*
4027 4028 4029
 * Rejects incorrectly sized objects and objects that are to be copied
 * to/from userspace but do not fall entirely within the containing slab
 * cache's usercopy region.
K
Kees Cook 已提交
4030 4031 4032 4033
 *
 * Returns NULL if check passes, otherwise const char * to name of cache
 * to indicate an error.
 */
4034 4035
void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
			 bool to_user)
K
Kees Cook 已提交
4036 4037
{
	struct kmem_cache *s;
A
Alexey Dobriyan 已提交
4038
	unsigned int offset;
K
Kees Cook 已提交
4039 4040
	size_t object_size;

4041 4042
	ptr = kasan_reset_tag(ptr);

K
Kees Cook 已提交
4043 4044 4045 4046 4047
	/* Find object and usable object size. */
	s = page->slab_cache;

	/* Reject impossible pointers. */
	if (ptr < page_address(page))
4048 4049
		usercopy_abort("SLUB object not in SLUB page?!", NULL,
			       to_user, 0, n);
K
Kees Cook 已提交
4050 4051 4052 4053 4054

	/* Find offset within object. */
	offset = (ptr - page_address(page)) % s->size;

	/* Adjust for redzone and reject if within the redzone. */
4055
	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
K
Kees Cook 已提交
4056
		if (offset < s->red_left_pad)
4057 4058
			usercopy_abort("SLUB object in left red zone",
				       s->name, to_user, offset, n);
K
Kees Cook 已提交
4059 4060 4061
		offset -= s->red_left_pad;
	}

4062 4063 4064 4065
	/* Allow address range falling entirely within usercopy region. */
	if (offset >= s->useroffset &&
	    offset - s->useroffset <= s->usersize &&
	    n <= s->useroffset - offset + s->usersize)
4066
		return;
K
Kees Cook 已提交
4067

4068 4069 4070 4071 4072 4073 4074
	/*
	 * If the copy is still within the allocated object, produce
	 * a warning instead of rejecting the copy. This is intended
	 * to be a temporary method to find any missing usercopy
	 * whitelists.
	 */
	object_size = slab_ksize(s);
4075 4076
	if (usercopy_fallback &&
	    offset <= object_size && n <= object_size - offset) {
4077 4078 4079
		usercopy_warn("SLUB object", s->name, to_user, offset, n);
		return;
	}
K
Kees Cook 已提交
4080

4081
	usercopy_abort("SLUB object", s->name, to_user, offset, n);
K
Kees Cook 已提交
4082 4083 4084
}
#endif /* CONFIG_HARDENED_USERCOPY */
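
/*
 * A worked example of the check above (a sketch, assuming a cache with
 * s->size == 256, s->useroffset == 0 and s->usersize == 200): a copy
 * starting 64 bytes into an object may span at most 200 - 64 = 136 bytes;
 * anything longer leaves the usercopy region and triggers usercopy_abort(),
 * unless usercopy_fallback merely warns because the copy still fits inside
 * the allocated object.
 */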

4085
size_t __ksize(const void *object)
C
Christoph Lameter 已提交
4086
{
4087
	struct page *page;
C
Christoph Lameter 已提交
4088

4089
	if (unlikely(object == ZERO_SIZE_PTR))
4090 4091
		return 0;

4092 4093
	page = virt_to_head_page(object);

P
Pekka Enberg 已提交
4094 4095
	if (unlikely(!PageSlab(page))) {
		WARN_ON(!PageCompound(page));
4096
		return page_size(page);
P
Pekka Enberg 已提交
4097
	}
C
Christoph Lameter 已提交
4098

4099
	return slab_ksize(page->slab_cache);
C
Christoph Lameter 已提交
4100
}
4101
EXPORT_SYMBOL(__ksize);
C
Christoph Lameter 已提交
4102 4103 4104 4105

void kfree(const void *x)
{
	struct page *page;
4106
	void *object = (void *)x;
C
Christoph Lameter 已提交
4107

4108 4109
	trace_kfree(_RET_IP_, x);

4110
	if (unlikely(ZERO_OR_NULL_PTR(x)))
C
Christoph Lameter 已提交
4111 4112
		return;

4113
	page = virt_to_head_page(x);
4114
	if (unlikely(!PageSlab(page))) {
4115 4116
		unsigned int order = compound_order(page);

4117
		BUG_ON(!PageCompound(page));
4118
		kfree_hook(object);
4119 4120
		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
				    -(PAGE_SIZE << order));
4121
		__free_pages(page, order);
4122 4123
		return;
	}
4124
	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
C
Christoph Lameter 已提交
4125 4126 4127
}
EXPORT_SYMBOL(kfree);

4128 4129
#define SHRINK_PROMOTE_MAX 32

4130
/*
4131 4132 4133
 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
 * up most to the head of the partial lists. New allocations will then
 * fill those up and thus they can be removed from the partial lists.
C
Christoph Lameter 已提交
4134 4135 4136 4137
 *
 * The slabs with the least items are placed last. This results in them
 * being allocated from last increasing the chance that the last objects
 * are freed in them.
4138
 */
4139
int __kmem_cache_shrink(struct kmem_cache *s)
4140 4141 4142 4143 4144 4145
{
	int node;
	int i;
	struct kmem_cache_node *n;
	struct page *page;
	struct page *t;
4146 4147
	struct list_head discard;
	struct list_head promote[SHRINK_PROMOTE_MAX];
4148
	unsigned long flags;
4149
	int ret = 0;
4150 4151

	flush_all(s);
C
Christoph Lameter 已提交
4152
	for_each_kmem_cache_node(s, node, n) {
4153 4154 4155
		INIT_LIST_HEAD(&discard);
		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
			INIT_LIST_HEAD(promote + i);
4156 4157 4158 4159

		spin_lock_irqsave(&n->list_lock, flags);

		/*
4160
		 * Build lists of slabs to discard or promote.
4161
		 *
C
Christoph Lameter 已提交
4162 4163
		 * Note that concurrent frees may occur while we hold the
		 * list_lock. page->inuse here is the upper limit.
4164
		 */
4165
		list_for_each_entry_safe(page, t, &n->partial, slab_list) {
4166 4167 4168 4169 4170 4171 4172 4173 4174
			int free = page->objects - page->inuse;

			/* Do not reread page->inuse */
			barrier();

			/* We do not keep full slabs on the list */
			BUG_ON(free <= 0);

			if (free == page->objects) {
4175
				list_move(&page->slab_list, &discard);
4176
				n->nr_partial--;
4177
			} else if (free <= SHRINK_PROMOTE_MAX)
4178
				list_move(&page->slab_list, promote + free - 1);
4179 4180 4181
		}

		/*
4182 4183
		 * Promote the slabs filled up most to the head of the
		 * partial list.
4184
		 */
4185 4186
		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
			list_splice(promote + i, &n->partial);
4187 4188

		spin_unlock_irqrestore(&n->list_lock, flags);
4189 4190

		/* Release empty slabs */
4191
		list_for_each_entry_safe(page, t, &discard, slab_list)
4192
			discard_slab(s, page);
4193 4194 4195

		if (slabs_node(s, node))
			ret = 1;
4196 4197
	}

4198
	return ret;
4199 4200
}
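
/*
 * Example of the promotion scheme above (a sketch): on a cache whose slabs
 * hold 32 objects, a partial slab with only 2 free objects lands in
 * promote[1] and is spliced near the head of the partial list, a slab with
 * 29 free objects ends up much further back, and a completely free slab is
 * moved to the discard list and handed back to the page allocator.
 */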

4201 4202 4203 4204
static int slab_mem_going_offline_callback(void *arg)
{
	struct kmem_cache *s;

4205
	mutex_lock(&slab_mutex);
4206
	list_for_each_entry(s, &slab_caches, list)
4207
		__kmem_cache_shrink(s);
4208
	mutex_unlock(&slab_mutex);
4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219

	return 0;
}

static void slab_mem_offline_callback(void *arg)
{
	struct kmem_cache_node *n;
	struct kmem_cache *s;
	struct memory_notify *marg = arg;
	int offline_node;

4220
	offline_node = marg->status_change_nid_normal;
4221 4222 4223 4224 4225 4226 4227 4228

	/*
	 * If the node still has available memory we keep the
	 * kmem_cache_node for it and there is nothing to tear down here.
	 */
	if (offline_node < 0)
		return;

4229
	mutex_lock(&slab_mutex);
4230 4231 4232 4233 4234 4235
	list_for_each_entry(s, &slab_caches, list) {
		n = get_node(s, offline_node);
		if (n) {
			/*
			 * if n->nr_slabs > 0, slabs still exist on the node
			 * that is going down. We were unable to free them,
4236
			 * and offline_pages() function shouldn't call this
4237 4238
			 * callback. So, we must fail.
			 */
4239
			BUG_ON(slabs_node(s, offline_node));
4240 4241

			s->node[offline_node] = NULL;
4242
			kmem_cache_free(kmem_cache_node, n);
4243 4244
		}
	}
4245
	mutex_unlock(&slab_mutex);
4246 4247 4248 4249 4250 4251 4252
}

static int slab_mem_going_online_callback(void *arg)
{
	struct kmem_cache_node *n;
	struct kmem_cache *s;
	struct memory_notify *marg = arg;
4253
	int nid = marg->status_change_nid_normal;
4254 4255 4256 4257 4258 4259 4260 4261 4262 4263
	int ret = 0;

	/*
	 * If the node's memory is already available, then kmem_cache_node is
	 * already created. Nothing to do.
	 */
	if (nid < 0)
		return 0;

	/*
4264
	 * We are bringing a node online. No memory is available yet. We must
4265 4266 4267
	 * allocate a kmem_cache_node structure in order to bring the node
	 * online.
	 */
4268
	mutex_lock(&slab_mutex);
4269 4270 4271 4272 4273 4274
	list_for_each_entry(s, &slab_caches, list) {
		/*
		 * XXX: kmem_cache_alloc_node will fallback to other nodes
		 *      since memory is not yet available from the node that
		 *      is brought up.
		 */
4275
		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
4276 4277 4278 4279
		if (!n) {
			ret = -ENOMEM;
			goto out;
		}
4280
		init_kmem_cache_node(n);
4281 4282 4283
		s->node[nid] = n;
	}
out:
4284
	mutex_unlock(&slab_mutex);
4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307
	return ret;
}

static int slab_memory_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = slab_mem_going_online_callback(arg);
		break;
	case MEM_GOING_OFFLINE:
		ret = slab_mem_going_offline_callback(arg);
		break;
	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		slab_mem_offline_callback(arg);
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}
4308 4309 4310 4311
	if (ret)
		ret = notifier_from_errno(ret);
	else
		ret = NOTIFY_OK;
4312 4313 4314
	return ret;
}

4315 4316 4317 4318
static struct notifier_block slab_memory_callback_nb = {
	.notifier_call = slab_memory_callback,
	.priority = SLAB_CALLBACK_PRI,
};
4319

C
Christoph Lameter 已提交
4320 4321 4322 4323
/********************************************************************
 *			Basic setup of slabs
 *******************************************************************/

4324 4325
/*
 * Used for early kmem_cache structures that were allocated using
4326 4327
 * the page allocator. Allocate them properly then fix up the pointers
 * that may be pointing to the wrong kmem_cache structure.
4328 4329
 */

4330
static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
4331 4332
{
	int node;
4333
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
C
Christoph Lameter 已提交
4334
	struct kmem_cache_node *n;
4335

4336
	memcpy(s, static_cache, kmem_cache->object_size);
4337

4338 4339 4340 4341 4342 4343
	/*
	 * This runs very early, and only the boot processor is supposed to be
	 * up.  Even if it weren't true, IRQs are not up so we couldn't fire
	 * IPIs around.
	 */
	__flush_cpu_slab(s, smp_processor_id());
C
Christoph Lameter 已提交
4344
	for_each_kmem_cache_node(s, node, n) {
4345 4346
		struct page *p;

4347
		list_for_each_entry(p, &n->partial, slab_list)
C
Christoph Lameter 已提交
4348
			p->slab_cache = s;
4349

L
Li Zefan 已提交
4350
#ifdef CONFIG_SLUB_DEBUG
4351
		list_for_each_entry(p, &n->full, slab_list)
C
Christoph Lameter 已提交
4352
			p->slab_cache = s;
4353 4354
#endif
	}
4355 4356
	list_add(&s->list, &slab_caches);
	return s;
4357 4358
}

C
Christoph Lameter 已提交
4359 4360
void __init kmem_cache_init(void)
{
4361 4362
	static __initdata struct kmem_cache boot_kmem_cache,
		boot_kmem_cache_node;
4363

4364 4365 4366
	if (debug_guardpage_minorder())
		slub_max_order = 0;

4367 4368
	kmem_cache_node = &boot_kmem_cache_node;
	kmem_cache = &boot_kmem_cache;
4369

4370
	create_boot_cache(kmem_cache_node, "kmem_cache_node",
4371
		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
4372

4373
	register_hotmemory_notifier(&slab_memory_callback_nb);
C
Christoph Lameter 已提交
4374 4375 4376 4377

	/* Able to allocate the per node structures */
	slab_state = PARTIAL;

4378 4379 4380
	create_boot_cache(kmem_cache, "kmem_cache",
			offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
4381
		       SLAB_HWCACHE_ALIGN, 0, 0);
4382

4383 4384
	kmem_cache = bootstrap(&boot_kmem_cache);
	kmem_cache_node = bootstrap(&boot_kmem_cache_node);
4385 4386

	/* Now we can use the kmem_cache to allocate kmalloc slabs */
4387
	setup_kmalloc_cache_index_table();
4388
	create_kmalloc_caches(0);
C
Christoph Lameter 已提交
4389

T
Thomas Garnier 已提交
4390 4391 4392
	/* Setup random freelists for each cache */
	init_freelist_randomization();

4393 4394
	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
				  slub_cpu_dead);
C
Christoph Lameter 已提交
4395

4396
	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
4397
		cache_line_size(),
C
Christoph Lameter 已提交
4398 4399 4400 4401
		slub_min_order, slub_max_order, slub_min_objects,
		nr_cpu_ids, nr_node_ids);
}

4402 4403 4404 4405
void __init kmem_cache_init_late(void)
{
}

4406
struct kmem_cache *
4407
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
4408
		   slab_flags_t flags, void (*ctor)(void *))
C
Christoph Lameter 已提交
4409
{
4410
	struct kmem_cache *s;
C
Christoph Lameter 已提交
4411

4412
	s = find_mergeable(size, align, flags, name, ctor);
C
Christoph Lameter 已提交
4413 4414
	if (s) {
		s->refcount++;
4415

C
Christoph Lameter 已提交
4416 4417 4418 4419
		/*
		 * Adjust the object sizes so that we clear
		 * the complete object on kzalloc.
		 */
4420
		s->object_size = max(s->object_size, size);
4421
		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
C
Christoph Lameter 已提交
4422

4423 4424
		if (sysfs_slab_alias(s, name)) {
			s->refcount--;
4425
			s = NULL;
4426
		}
4427
	}
C
Christoph Lameter 已提交
4428

4429 4430
	return s;
}
P
Pekka Enberg 已提交
4431

4432
int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
4433
{
4434 4435 4436 4437 4438
	int err;

	err = kmem_cache_open(s, flags);
	if (err)
		return err;
4439

4440 4441 4442 4443
	/* Mutex is not taken during early boot */
	if (slab_state <= UP)
		return 0;

4444 4445
	err = sysfs_slab_add(s);
	if (err)
4446
		__kmem_cache_release(s);
4447

4448
	return err;
C
Christoph Lameter 已提交
4449 4450
}

4451
void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
C
Christoph Lameter 已提交
4452
{
4453
	struct kmem_cache *s;
4454
	void *ret;
4455

4456
	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
4457 4458
		return kmalloc_large(size, gfpflags);

4459
	s = kmalloc_slab(size, gfpflags);
C
Christoph Lameter 已提交
4460

4461
	if (unlikely(ZERO_OR_NULL_PTR(s)))
4462
		return s;
C
Christoph Lameter 已提交
4463

4464
	ret = slab_alloc(s, gfpflags, caller);
4465

L
Lucas De Marchi 已提交
4466
	/* Honor the call site pointer we received. */
4467
	trace_kmalloc(caller, ret, size, s->size, gfpflags);
4468 4469

	return ret;
C
Christoph Lameter 已提交
4470
}
4471
EXPORT_SYMBOL(__kmalloc_track_caller);
C
Christoph Lameter 已提交
4472

4473
#ifdef CONFIG_NUMA
C
Christoph Lameter 已提交
4474
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
4475
					int node, unsigned long caller)
C
Christoph Lameter 已提交
4476
{
4477
	struct kmem_cache *s;
4478
	void *ret;
4479

4480
	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
4481 4482 4483 4484 4485 4486 4487 4488
		ret = kmalloc_large_node(size, gfpflags, node);

		trace_kmalloc_node(caller, ret,
				   size, PAGE_SIZE << get_order(size),
				   gfpflags, node);

		return ret;
	}
4489

4490
	s = kmalloc_slab(size, gfpflags);
C
Christoph Lameter 已提交
4491

4492
	if (unlikely(ZERO_OR_NULL_PTR(s)))
4493
		return s;
C
Christoph Lameter 已提交
4494

4495
	ret = slab_alloc_node(s, gfpflags, node, caller);
4496

L
Lucas De Marchi 已提交
4497
	/* Honor the call site pointer we received. */
4498
	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
4499 4500

	return ret;
C
Christoph Lameter 已提交
4501
}
4502
EXPORT_SYMBOL(__kmalloc_node_track_caller);
4503
#endif
C
Christoph Lameter 已提交
4504

4505
#ifdef CONFIG_SYSFS
4506 4507 4508 4509 4510 4511 4512 4513 4514
static int count_inuse(struct page *page)
{
	return page->inuse;
}

static int count_total(struct page *page)
{
	return page->objects;
}
4515
#endif
4516

4517
#ifdef CONFIG_SLUB_DEBUG
4518
static void validate_slab(struct kmem_cache *s, struct page *page)
4519 4520
{
	void *p;
4521
	void *addr = page_address(page);
4522 4523 4524
	unsigned long *map;

	slab_lock(page);
4525

Y
Yu Zhao 已提交
4526
	if (!check_slab(s, page) || !on_freelist(s, page, NULL))
4527
		goto unlock;
4528 4529

	/* Now we know that a valid freelist exists */
4530
	map = get_map(s, page);
4531
	for_each_object(p, s, addr, page->objects) {
4532
		u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
Y
Yu Zhao 已提交
4533
			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
4534

Y
Yu Zhao 已提交
4535 4536 4537
		if (!check_object(s, page, p, val))
			break;
	}
4538 4539
	put_map(map);
unlock:
4540
	slab_unlock(page);
4541 4542
}

4543
static int validate_slab_node(struct kmem_cache *s,
4544
		struct kmem_cache_node *n)
4545 4546 4547 4548 4549 4550 4551
{
	unsigned long count = 0;
	struct page *page;
	unsigned long flags;

	spin_lock_irqsave(&n->list_lock, flags);

4552
	list_for_each_entry(page, &n->partial, slab_list) {
4553
		validate_slab(s, page);
4554 4555 4556
		count++;
	}
	if (count != n->nr_partial)
4557 4558
		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
		       s->name, count, n->nr_partial);
4559 4560 4561 4562

	if (!(s->flags & SLAB_STORE_USER))
		goto out;

4563
	list_for_each_entry(page, &n->full, slab_list) {
4564
		validate_slab(s, page);
4565 4566 4567
		count++;
	}
	if (count != atomic_long_read(&n->nr_slabs))
4568 4569
		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
		       s->name, count, atomic_long_read(&n->nr_slabs));
4570 4571 4572 4573 4574 4575

out:
	spin_unlock_irqrestore(&n->list_lock, flags);
	return count;
}

4576
static long validate_slab_cache(struct kmem_cache *s)
4577 4578 4579
{
	int node;
	unsigned long count = 0;
C
Christoph Lameter 已提交
4580
	struct kmem_cache_node *n;
4581 4582

	flush_all(s);
C
Christoph Lameter 已提交
4583
	for_each_kmem_cache_node(s, node, n)
4584 4585
		count += validate_slab_node(s, n);

4586 4587
	return count;
}
4588
/*
C
Christoph Lameter 已提交
4589
 * Generate lists of code addresses where slabcache objects are allocated
4590 4591 4592 4593 4594
 * and freed.
 */

struct location {
	unsigned long count;
4595
	unsigned long addr;
4596 4597 4598 4599 4600
	long long sum_time;
	long min_time;
	long max_time;
	long min_pid;
	long max_pid;
R
Rusty Russell 已提交
4601
	DECLARE_BITMAP(cpus, NR_CPUS);
4602
	nodemask_t nodes;
4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617
};

struct loc_track {
	unsigned long max;
	unsigned long count;
	struct location *loc;
};

static void free_loc_track(struct loc_track *t)
{
	if (t->max)
		free_pages((unsigned long)t->loc,
			get_order(sizeof(struct location) * t->max));
}

4618
static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
4619 4620 4621 4622 4623 4624
{
	struct location *l;
	int order;

	order = get_order(sizeof(struct location) * max);

4625
	l = (void *)__get_free_pages(flags, order);
4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638
	if (!l)
		return 0;

	if (t->count) {
		memcpy(l, t->loc, sizeof(struct location) * t->count);
		free_loc_track(t);
	}
	t->max = max;
	t->loc = l;
	return 1;
}

static int add_location(struct loc_track *t, struct kmem_cache *s,
4639
				const struct track *track)
4640 4641 4642
{
	long start, end, pos;
	struct location *l;
4643
	unsigned long caddr;
4644
	unsigned long age = jiffies - track->when;
4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659

	start = -1;
	end = t->count;

	for ( ; ; ) {
		pos = start + (end - start + 1) / 2;

		/*
		 * There is nothing at "end". If we end up there
		 * we need to add something to before end.
		 */
		if (pos == end)
			break;

		caddr = t->loc[pos].addr;
4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675
		if (track->addr == caddr) {

			l = &t->loc[pos];
			l->count++;
			if (track->when) {
				l->sum_time += age;
				if (age < l->min_time)
					l->min_time = age;
				if (age > l->max_time)
					l->max_time = age;

				if (track->pid < l->min_pid)
					l->min_pid = track->pid;
				if (track->pid > l->max_pid)
					l->max_pid = track->pid;

R
Rusty Russell 已提交
4676 4677
				cpumask_set_cpu(track->cpu,
						to_cpumask(l->cpus));
4678 4679
			}
			node_set(page_to_nid(virt_to_page(track)), l->nodes);
4680 4681 4682
			return 1;
		}

4683
		if (track->addr < caddr)
4684 4685 4686 4687 4688 4689
			end = pos;
		else
			start = pos;
	}

	/*
C
Christoph Lameter 已提交
4690
	 * Not found. Insert new tracking element.
4691
	 */
4692
	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
4693 4694 4695 4696 4697 4698 4699 4700
		return 0;

	l = t->loc + pos;
	if (pos < t->count)
		memmove(l + 1, l,
			(t->count - pos) * sizeof(struct location));
	t->count++;
	l->count = 1;
4701 4702 4703 4704 4705 4706
	l->addr = track->addr;
	l->sum_time = age;
	l->min_time = age;
	l->max_time = age;
	l->min_pid = track->pid;
	l->max_pid = track->pid;
R
Rusty Russell 已提交
4707 4708
	cpumask_clear(to_cpumask(l->cpus));
	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
4709 4710
	nodes_clear(l->nodes);
	node_set(page_to_nid(virt_to_page(track)), l->nodes);
4711 4712 4713 4714
	return 1;
}
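
/*
 * A sketch of what the search above maintains: t->loc[] stays sorted by the
 * caller address stored in each track. When a track whose address is
 * already present is added, only that slot's count and its age/pid/cpu
 * statistics are updated; an unseen address is inserted at its sorted
 * position via the memmove() above, growing the array when needed.
 */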

static void process_slab(struct loc_track *t, struct kmem_cache *s,
4715
		struct page *page, enum track_item alloc)
4716
{
4717
	void *addr = page_address(page);
4718
	void *p;
4719
	unsigned long *map;
4720

4721
	map = get_map(s, page);
4722
	for_each_object(p, s, addr, page->objects)
4723
		if (!test_bit(__obj_to_index(s, addr, p), map))
4724
			add_location(t, s, get_track(s, p, alloc));
4725
	put_map(map);
4726 4727 4728
}

static int list_locations(struct kmem_cache *s, char *buf,
4729
			  enum track_item alloc)
4730
{
4731
	int len = 0;
4732
	unsigned long i;
4733
	struct loc_track t = { 0, 0, NULL };
4734
	int node;
C
Christoph Lameter 已提交
4735
	struct kmem_cache_node *n;
4736

4737 4738
	if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
			     GFP_KERNEL)) {
4739
		return sysfs_emit(buf, "Out of memory\n");
E
Eric Dumazet 已提交
4740
	}
4741 4742 4743
	/* Push back cpu slabs */
	flush_all(s);

C
Christoph Lameter 已提交
4744
	for_each_kmem_cache_node(s, node, n) {
4745 4746 4747
		unsigned long flags;
		struct page *page;

4748
		if (!atomic_long_read(&n->nr_slabs))
4749 4750 4751
			continue;

		spin_lock_irqsave(&n->list_lock, flags);
4752
		list_for_each_entry(page, &n->partial, slab_list)
4753
			process_slab(&t, s, page, alloc);
4754
		list_for_each_entry(page, &n->full, slab_list)
4755
			process_slab(&t, s, page, alloc);
4756 4757 4758 4759
		spin_unlock_irqrestore(&n->list_lock, flags);
	}

	for (i = 0; i < t.count; i++) {
4760
		struct location *l = &t.loc[i];
4761

4762
		len += sysfs_emit_at(buf, len, "%7ld ", l->count);
4763 4764

		if (l->addr)
4765
			len += sysfs_emit_at(buf, len, "%pS", (void *)l->addr);
4766
		else
4767 4768 4769 4770 4771 4772 4773 4774 4775 4776
			len += sysfs_emit_at(buf, len, "<not-available>");

		if (l->sum_time != l->min_time)
			len += sysfs_emit_at(buf, len, " age=%ld/%ld/%ld",
					     l->min_time,
					     (long)div_u64(l->sum_time,
							   l->count),
					     l->max_time);
		else
			len += sysfs_emit_at(buf, len, " age=%ld", l->min_time);
4777 4778

		if (l->min_pid != l->max_pid)
4779 4780
			len += sysfs_emit_at(buf, len, " pid=%ld-%ld",
					     l->min_pid, l->max_pid);
4781
		else
4782 4783
			len += sysfs_emit_at(buf, len, " pid=%ld",
					     l->min_pid);
4784

R
Rusty Russell 已提交
4785
		if (num_online_cpus() > 1 &&
4786 4787 4788 4789 4790 4791 4792 4793 4794
		    !cpumask_empty(to_cpumask(l->cpus)))
			len += sysfs_emit_at(buf, len, " cpus=%*pbl",
					     cpumask_pr_args(to_cpumask(l->cpus)));

		if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
			len += sysfs_emit_at(buf, len, " nodes=%*pbl",
					     nodemask_pr_args(&l->nodes));

		len += sysfs_emit_at(buf, len, "\n");
4795 4796 4797 4798
	}

	free_loc_track(&t);
	if (!t.count)
4799 4800
		len += sysfs_emit_at(buf, len, "No data\n");

4801
	return len;
4802
}
4803
#endif	/* CONFIG_SLUB_DEBUG */
4804

4805
#ifdef SLUB_RESILIENCY_TEST
4806
static void __init resiliency_test(void)
4807 4808
{
	u8 *p;
4809
	int type = KMALLOC_NORMAL;
4810

4811
	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
4812

4813 4814 4815
	pr_err("SLUB resiliency testing\n");
	pr_err("-----------------------\n");
	pr_err("A. Corruption after allocation\n");
4816 4817 4818

	p = kzalloc(16, GFP_KERNEL);
	p[16] = 0x12;
4819 4820
	pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
	       p + 16);
4821

4822
	validate_slab_cache(kmalloc_caches[type][4]);
4823 4824 4825 4826

	/* Hmmm... The next two are dangerous */
	p = kzalloc(32, GFP_KERNEL);
	p[32 + sizeof(void *)] = 0x34;
4827 4828 4829
	pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
	       p);
	pr_err("If allocated object is overwritten then not detectable\n\n");
4830

4831
	validate_slab_cache(kmalloc_caches[type][5]);
4832 4833 4834
	p = kzalloc(64, GFP_KERNEL);
	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
	*p = 0x56;
4835 4836 4837
	pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
	       p);
	pr_err("If allocated object is overwritten then not detectable\n\n");
4838
	validate_slab_cache(kmalloc_caches[type][6]);
4839

4840
	pr_err("\nB. Corruption after free\n");
4841 4842 4843
	p = kzalloc(128, GFP_KERNEL);
	kfree(p);
	*p = 0x78;
4844
	pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
4845
	validate_slab_cache(kmalloc_caches[type][7]);
4846 4847 4848 4849

	p = kzalloc(256, GFP_KERNEL);
	kfree(p);
	p[50] = 0x9a;
4850
	pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
4851
	validate_slab_cache(kmalloc_caches[type][8]);
4852 4853 4854 4855

	p = kzalloc(512, GFP_KERNEL);
	kfree(p);
	p[512] = 0xab;
4856
	pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
4857
	validate_slab_cache(kmalloc_caches[type][9]);
4858 4859 4860 4861 4862
}
#else
#ifdef CONFIG_SYSFS
static void resiliency_test(void) {};
#endif
4863
#endif	/* SLUB_RESILIENCY_TEST */
4864

4865
#ifdef CONFIG_SYSFS
C
Christoph Lameter 已提交
4866
enum slab_stat_type {
4867 4868 4869 4870 4871
	SL_ALL,			/* All slabs */
	SL_PARTIAL,		/* Only partially allocated slabs */
	SL_CPU,			/* Only slabs used for cpu caches */
	SL_OBJECTS,		/* Determine allocated objects not slabs */
	SL_TOTAL		/* Determine object capacity not slabs */
C
Christoph Lameter 已提交
4872 4873
};

4874
#define SO_ALL		(1 << SL_ALL)
C
Christoph Lameter 已提交
4875 4876 4877
#define SO_PARTIAL	(1 << SL_PARTIAL)
#define SO_CPU		(1 << SL_CPU)
#define SO_OBJECTS	(1 << SL_OBJECTS)
4878
#define SO_TOTAL	(1 << SL_TOTAL)
C
Christoph Lameter 已提交
4879

4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895
#ifdef CONFIG_MEMCG
static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);

static int __init setup_slub_memcg_sysfs(char *str)
{
	int v;

	if (get_option(&str, &v) > 0)
		memcg_sysfs_enabled = v;

	return 1;
}

__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
#endif

4896
static ssize_t show_slab_objects(struct kmem_cache *s,
4897
				 char *buf, unsigned long flags)
C
Christoph Lameter 已提交
4898 4899 4900 4901 4902
{
	unsigned long total = 0;
	int node;
	int x;
	unsigned long *nodes;
4903
	int len = 0;
C
Christoph Lameter 已提交
4904

K
Kees Cook 已提交
4905
	nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
4906 4907
	if (!nodes)
		return -ENOMEM;
C
Christoph Lameter 已提交
4908

4909 4910
	if (flags & SO_CPU) {
		int cpu;
C
Christoph Lameter 已提交
4911

4912
		for_each_possible_cpu(cpu) {
4913 4914
			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
							       cpu);
4915
			int node;
4916
			struct page *page;
4917

4918
			page = READ_ONCE(c->page);
4919 4920
			if (!page)
				continue;
4921

4922 4923 4924 4925 4926 4927 4928
			node = page_to_nid(page);
			if (flags & SO_TOTAL)
				x = page->objects;
			else if (flags & SO_OBJECTS)
				x = page->inuse;
			else
				x = 1;
4929

4930 4931 4932
			total += x;
			nodes[node] += x;

4933
			page = slub_percpu_partial_read_once(c);
4934
			if (page) {
L
Li Zefan 已提交
4935 4936 4937 4938 4939 4940 4941
				node = page_to_nid(page);
				if (flags & SO_TOTAL)
					WARN_ON_ONCE(1);
				else if (flags & SO_OBJECTS)
					WARN_ON_ONCE(1);
				else
					x = page->pages;
4942 4943
				total += x;
				nodes[node] += x;
4944
			}
C
Christoph Lameter 已提交
4945 4946 4947
		}
	}

4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958
	/*
	 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
	 * already held which will conflict with an existing lock order:
	 *
	 * mem_hotplug_lock->slab_mutex->kernfs_mutex
	 *
	 * We don't really need mem_hotplug_lock (to hold off
	 * slab_mem_going_offline_callback) here because slab's memory hot
	 * unplug code doesn't destroy the kmem_cache->node[] data.
	 */

4959
#ifdef CONFIG_SLUB_DEBUG
4960
	if (flags & SO_ALL) {
C
Christoph Lameter 已提交
4961 4962 4963
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n) {
4964

4965 4966 4967 4968 4969
			if (flags & SO_TOTAL)
				x = atomic_long_read(&n->total_objects);
			else if (flags & SO_OBJECTS)
				x = atomic_long_read(&n->total_objects) -
					count_partial(n, count_free);
C
Christoph Lameter 已提交
4970
			else
4971
				x = atomic_long_read(&n->nr_slabs);
C
Christoph Lameter 已提交
4972 4973 4974 4975
			total += x;
			nodes[node] += x;
		}

4976 4977 4978
	} else
#endif
	if (flags & SO_PARTIAL) {
C
Christoph Lameter 已提交
4979
		struct kmem_cache_node *n;
C
Christoph Lameter 已提交
4980

C
Christoph Lameter 已提交
4981
		for_each_kmem_cache_node(s, node, n) {
4982 4983 4984 4985
			if (flags & SO_TOTAL)
				x = count_partial(n, count_total);
			else if (flags & SO_OBJECTS)
				x = count_partial(n, count_inuse);
C
Christoph Lameter 已提交
4986
			else
4987
				x = n->nr_partial;
C
Christoph Lameter 已提交
4988 4989 4990 4991
			total += x;
			nodes[node] += x;
		}
	}
4992 4993

	len += sysfs_emit_at(buf, len, "%lu", total);
C
Christoph Lameter 已提交
4994
#ifdef CONFIG_NUMA
4995
	for (node = 0; node < nr_node_ids; node++) {
C
Christoph Lameter 已提交
4996
		if (nodes[node])
4997 4998 4999
			len += sysfs_emit_at(buf, len, " N%d=%lu",
					     node, nodes[node]);
	}
C
Christoph Lameter 已提交
5000
#endif
5001
	len += sysfs_emit_at(buf, len, "\n");
C
Christoph Lameter 已提交
5002
	kfree(nodes);
5003 5004

	return len;
C
Christoph Lameter 已提交
5005 5006 5007
}

#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
5008
#define to_slab(n) container_of(n, struct kmem_cache, kobj)
C
Christoph Lameter 已提交
5009 5010 5011 5012 5013 5014 5015 5016

struct slab_attribute {
	struct attribute attr;
	ssize_t (*show)(struct kmem_cache *s, char *buf);
	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};

#define SLAB_ATTR_RO(_name) \
5017 5018
	static struct slab_attribute _name##_attr = \
	__ATTR(_name, 0400, _name##_show, NULL)
C
Christoph Lameter 已提交
5019 5020 5021

#define SLAB_ATTR(_name) \
	static struct slab_attribute _name##_attr =  \
5022
	__ATTR(_name, 0600, _name##_show, _name##_store)
C
Christoph Lameter 已提交
5023 5024 5025

static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
5026
	return sysfs_emit(buf, "%u\n", s->size);
C
Christoph Lameter 已提交
5027 5028 5029 5030 5031
}
SLAB_ATTR_RO(slab_size);

static ssize_t align_show(struct kmem_cache *s, char *buf)
{
5032
	return sysfs_emit(buf, "%u\n", s->align);
C
Christoph Lameter 已提交
5033 5034 5035 5036 5037
}
SLAB_ATTR_RO(align);

static ssize_t object_size_show(struct kmem_cache *s, char *buf)
{
5038
	return sysfs_emit(buf, "%u\n", s->object_size);
C
Christoph Lameter 已提交
5039 5040 5041 5042 5043
}
SLAB_ATTR_RO(object_size);

static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
5044
	return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
C
Christoph Lameter 已提交
5045 5046 5047 5048 5049
}
SLAB_ATTR_RO(objs_per_slab);

static ssize_t order_show(struct kmem_cache *s, char *buf)
{
5050
	return sysfs_emit(buf, "%u\n", oo_order(s->oo));
C
Christoph Lameter 已提交
5051
}
5052
SLAB_ATTR_RO(order);
C
Christoph Lameter 已提交
5053

5054 5055
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
5056
	return sysfs_emit(buf, "%lu\n", s->min_partial);
5057 5058 5059 5060 5061 5062 5063 5064
}

static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
				 size_t length)
{
	unsigned long min;
	int err;

5065
	err = kstrtoul(buf, 10, &min);
5066 5067 5068
	if (err)
		return err;

5069
	set_min_partial(s, min);
5070 5071 5072 5073
	return length;
}
SLAB_ATTR(min_partial);

5074 5075
static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
5076
	return sysfs_emit(buf, "%u\n", slub_cpu_partial(s));
5077 5078 5079 5080 5081
}

static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
				 size_t length)
{
5082
	unsigned int objects;
5083 5084
	int err;

5085
	err = kstrtouint(buf, 10, &objects);
5086 5087
	if (err)
		return err;
5088
	if (objects && !kmem_cache_has_cpu_partial(s))
5089
		return -EINVAL;
5090

5091
	slub_set_cpu_partial(s, objects);
5092 5093 5094 5095 5096
	flush_all(s);
	return length;
}
SLAB_ATTR(cpu_partial);
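
/*
 * These attributes are exported through sysfs, typically as
 * /sys/kernel/slab/<cache>/<attribute> (path given for illustration).
 * Writing "0" to the cpu_partial file of a cache, for example, reaches
 * cpu_partial_store() above and disables per cpu partial slabs for that
 * cache, while reading the file reports the current limit.
 */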

C
Christoph Lameter 已提交
5097 5098
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
J
Joe Perches 已提交
5099 5100
	if (!s->ctor)
		return 0;
5101
	return sysfs_emit(buf, "%pS\n", s->ctor);
C
Christoph Lameter 已提交
5102 5103 5104 5105 5106
}
SLAB_ATTR_RO(ctor);

static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
5107
	return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
C
Christoph Lameter 已提交
5108 5109 5110 5111 5112
}
SLAB_ATTR_RO(aliases);

static ssize_t partial_show(struct kmem_cache *s, char *buf)
{
5113
	return show_slab_objects(s, buf, SO_PARTIAL);
C
Christoph Lameter 已提交
5114 5115 5116 5117 5118
}
SLAB_ATTR_RO(partial);

static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
{
5119
	return show_slab_objects(s, buf, SO_CPU);
C
Christoph Lameter 已提交
5120 5121 5122 5123 5124
}
SLAB_ATTR_RO(cpu_slabs);

static ssize_t objects_show(struct kmem_cache *s, char *buf)
{
5125
	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
C
Christoph Lameter 已提交
5126 5127 5128
}
SLAB_ATTR_RO(objects);

5129 5130 5131 5132 5133 5134
static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects_partial);

5135 5136 5137 5138 5139
static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
{
	int objects = 0;
	int pages = 0;
	int cpu;
5140
	int len = 0;
5141 5142

	for_each_online_cpu(cpu) {
5143 5144 5145
		struct page *page;

		page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
5146 5147 5148 5149 5150 5151 5152

		if (page) {
			pages += page->pages;
			objects += page->pobjects;
		}
	}

5153
	len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages);
5154 5155 5156

#ifdef CONFIG_SMP
	for_each_online_cpu(cpu) {
5157 5158 5159
		struct page *page;

		page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
5160 5161 5162
		if (page)
			len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
					     cpu, page->pobjects, page->pages);
5163 5164
	}
#endif
5165 5166 5167
	len += sysfs_emit_at(buf, len, "\n");

	return len;
5168 5169 5170
}
SLAB_ATTR_RO(slabs_cpu_partial);

5171 5172
static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
5173
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
5174
}
5175
SLAB_ATTR_RO(reclaim_account);
5176 5177 5178

static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
5179
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
5180 5181 5182 5183 5184 5185
}
SLAB_ATTR_RO(hwcache_align);

#ifdef CONFIG_ZONE_DMA
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
5186
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
5187 5188 5189 5190
}
SLAB_ATTR_RO(cache_dma);
#endif

5191 5192
static ssize_t usersize_show(struct kmem_cache *s, char *buf)
{
5193
	return sysfs_emit(buf, "%u\n", s->usersize);
5194 5195 5196
}
SLAB_ATTR_RO(usersize);

5197 5198
static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
5199
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
5200 5201 5202
}
SLAB_ATTR_RO(destroy_by_rcu);

5203
#ifdef CONFIG_SLUB_DEBUG
5204 5205 5206 5207 5208 5209
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL);
}
SLAB_ATTR_RO(slabs);

5210 5211 5212 5213 5214 5215
static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
{
	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
}
SLAB_ATTR_RO(total_objects);

C
Christoph Lameter 已提交
5216 5217
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
5218
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
C
Christoph Lameter 已提交
5219
}
5220
SLAB_ATTR_RO(sanity_checks);
C
Christoph Lameter 已提交
5221 5222 5223

static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
5224
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
C
Christoph Lameter 已提交
5225
}
5226
SLAB_ATTR_RO(trace);
C
Christoph Lameter 已提交
5227 5228 5229

static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
5230
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}

SLAB_ATTR_RO(red_zone);

static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
}

SLAB_ATTR_RO(poison);

static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}

SLAB_ATTR_RO(store_user);

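/*
 * Writing "1" to the validate attribute runs validate_slab_cache(), which
 * walks every slab of the cache and checks its consistency; any other
 * value is rejected with -EINVAL.
 */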
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

static ssize_t validate_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	int ret = -EINVAL;

	if (buf[0] == '1') {
		ret = validate_slab_cache(s);
		if (ret >= 0)
			ret = length;
	}
	return ret;
}
SLAB_ATTR(validate);

static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
{
	if (!(s->flags & SLAB_STORE_USER))
		return -ENOSYS;
	return list_locations(s, buf, TRACK_ALLOC);
}
SLAB_ATTR_RO(alloc_calls);

static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
{
	if (!(s->flags & SLAB_STORE_USER))
		return -ENOSYS;
	return list_locations(s, buf, TRACK_FREE);
}
SLAB_ATTR_RO(free_calls);
#endif /* CONFIG_SLUB_DEBUG */

#ifdef CONFIG_FAILSLAB
static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}
SLAB_ATTR_RO(failslab);
#endif

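/*
 * Writing "1" to the shrink attribute calls kmem_cache_shrink() to release
 * empty slabs back to the page allocator; any other value is rejected.
 */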
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

static ssize_t shrink_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	if (buf[0] == '1')
		kmem_cache_shrink(s);
	else
		return -EINVAL;
	return length;
}
SLAB_ATTR(shrink);

#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
}

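/*
 * The ratio is accepted as a percentage (0-100) but stored scaled by ten,
 * matching the division by ten in the _show path above; e.g. writing 30
 * stores 300 (illustrative value).
 */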
static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	unsigned int ratio;
	int err;

	err = kstrtouint(buf, 10, &ratio);
	if (err)
		return err;
	if (ratio > 100)
		return -ERANGE;

	s->remote_node_defrag_ratio = ratio * 10;

	return length;
}
SLAB_ATTR(remote_node_defrag_ratio);
#endif

#ifdef CONFIG_SLUB_STATS
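/*
 * Sum a statistics counter over all online cpus and emit the total followed
 * by the non-zero per cpu contributions, e.g. (illustrative values only):
 *
 *	4526 C0=2261 C1=2265
 */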
static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
{
	unsigned long sum  = 0;
	int cpu;
	int len = 0;
	int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);

	if (!data)
		return -ENOMEM;

	for_each_online_cpu(cpu) {
		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];

		data[cpu] = x;
		sum += x;
	}

	len += sysfs_emit_at(buf, len, "%lu", sum);

#ifdef CONFIG_SMP
	for_each_online_cpu(cpu) {
		if (data[cpu])
			len += sysfs_emit_at(buf, len, " C%d=%u",
					     cpu, data[cpu]);
	}
#endif
	kfree(data);
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}

static void clear_stat(struct kmem_cache *s, enum stat_item si)
{
	int cpu;

	for_each_online_cpu(cpu)
		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
}

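/*
 * Define a show/store pair per statistics item: reading returns the per cpu
 * sums via show_stat(), writing "0" clears the counter on every cpu and any
 * other value is rejected with -EINVAL.
 */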
#define STAT_ATTR(si, text) 					\
static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
{								\
	return show_stat(s, buf, si);				\
}								\
static ssize_t text##_store(struct kmem_cache *s,		\
				const char *buf, size_t length)	\
{								\
	if (buf[0] != '0')					\
		return -EINVAL;					\
	clear_stat(s, si);					\
	return length;						\
}								\
SLAB_ATTR(text);						\

STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif	/* CONFIG_SLUB_STATS */

static struct attribute *slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&min_partial_attr.attr,
	&cpu_partial_attr.attr,
	&objects_attr.attr,
	&objects_partial_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&shrink_attr.attr,
	&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
	&total_objects_attr.attr,
	&slabs_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
	&alloc_calls_attr.attr,
	&free_calls_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
	&alloc_fastpath_attr.attr,
	&alloc_slowpath_attr.attr,
	&free_fastpath_attr.attr,
	&free_slowpath_attr.attr,
	&free_frozen_attr.attr,
	&free_add_partial_attr.attr,
	&free_remove_partial_attr.attr,
	&alloc_from_partial_attr.attr,
	&alloc_slab_attr.attr,
	&alloc_refill_attr.attr,
	&alloc_node_mismatch_attr.attr,
	&free_slab_attr.attr,
	&cpuslab_flush_attr.attr,
	&deactivate_full_attr.attr,
	&deactivate_empty_attr.attr,
	&deactivate_to_head_attr.attr,
	&deactivate_to_tail_attr.attr,
	&deactivate_remote_frees_attr.attr,
	&deactivate_bypass_attr.attr,
	&order_fallback_attr.attr,
	&cmpxchg_double_fail_attr.attr,
	&cmpxchg_double_cpu_fail_attr.attr,
	&cpu_partial_alloc_attr.attr,
	&cpu_partial_free_attr.attr,
	&cpu_partial_node_attr.attr,
	&cpu_partial_drain_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
	&failslab_attr.attr,
#endif
	&usersize_attr.attr,

	NULL
};

static const struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};

static ssize_t slab_attr_show(struct kobject *kobj,
				struct attribute *attr,
				char *buf)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;
	int err;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->show)
		return -EIO;

	err = attribute->show(s, buf);

	return err;
}

static ssize_t slab_attr_store(struct kobject *kobj,
				struct attribute *attr,
				const char *buf, size_t len)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;
	int err;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->store)
		return -EIO;

	err = attribute->store(s, buf, len);
	return err;
}

static void kmem_cache_release(struct kobject *k)
{
	slab_kmem_cache_release(to_slab(k));
}

static const struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};

static struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
	.release = kmem_cache_release,
};

static struct kset *slab_kset;

static inline struct kset *cache_kset(struct kmem_cache *s)
{
	return slab_kset;
}

#define ID_STR_LENGTH 64

/* Create a unique string id for a slab cache:
 *
 * Format	:[flags-]size
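 *
 * e.g. (illustrative) a DMA cache with reclaim accounting and size 192
 * would get the id ":da-0000192".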
 */
static char *create_unique_id(struct kmem_cache *s)
{
	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
	char *p = name;

	BUG_ON(!name);

	*p++ = ':';
	/*
	 * First flags affecting slabcache operations. We will only
	 * get here for aliasable slabs so we do not need to support
	 * too many flags. The flags here must cover all flags that
	 * are matched during merging to guarantee that the id is
	 * unique.
	 */
	if (s->flags & SLAB_CACHE_DMA)
		*p++ = 'd';
	if (s->flags & SLAB_CACHE_DMA32)
		*p++ = 'D';
	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		*p++ = 'a';
	if (s->flags & SLAB_CONSISTENCY_CHECKS)
		*p++ = 'F';
	if (s->flags & SLAB_ACCOUNT)
		*p++ = 'A';
	if (p != name + 1)
		*p++ = '-';
	p += sprintf(p, "%07u", s->size);

	BUG_ON(p > name + ID_STR_LENGTH - 1);
	return name;
}

static int sysfs_slab_add(struct kmem_cache *s)
{
	int err;
	const char *name;
	struct kset *kset = cache_kset(s);
	int unmergeable = slab_unmergeable(s);

	if (!kset) {
		kobject_init(&s->kobj, &slab_ktype);
		return 0;
	}

	if (!unmergeable && disable_higher_order_debug &&
			(slub_debug & DEBUG_METADATA_FLAGS))
		unmergeable = 1;

	if (unmergeable) {
		/*
		 * Slabcache can never be merged so we can use the name proper.
		 * This is typically the case for debug situations. In that
		 * case we can catch duplicate names easily.
		 */
		sysfs_remove_link(&slab_kset->kobj, s->name);
		name = s->name;
	} else {
		/*
		 * Create a unique name for the slab as a target
		 * for the symlinks.
		 */
		name = create_unique_id(s);
	}

	s->kobj.kset = kset;
	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
	if (err) {
		kobject_put(&s->kobj);
		goto out;
	}

	err = sysfs_create_group(&s->kobj, &slab_attr_group);
	if (err)
		goto out_del_kobj;

	if (!unmergeable) {
		/* Setup first alias */
		sysfs_slab_alias(s, s->name);
	}
out:
	if (!unmergeable)
		kfree(name);
	return err;
out_del_kobj:
	kobject_del(&s->kobj);
	goto out;
}

void sysfs_slab_unlink(struct kmem_cache *s)
{
	if (slab_state >= FULL)
		kobject_del(&s->kobj);
}

void sysfs_slab_release(struct kmem_cache *s)
{
	if (slab_state >= FULL)
		kobject_put(&s->kobj);
}

/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
struct saved_alias {
	struct kmem_cache *s;
	const char *name;
	struct saved_alias *next;
};

static struct saved_alias *alias_list;

static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
	struct saved_alias *al;

	if (slab_state == FULL) {
		/*
		 * If we have a leftover link then remove it.
		 */
		sysfs_remove_link(&slab_kset->kobj, name);
		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
	}

	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
	if (!al)
		return -ENOMEM;

	al->s = s;
	al->name = name;
	al->next = alias_list;
	alias_list = al;
	return 0;
}

static int __init slab_sysfs_init(void)
{
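	/*
	 * Register the /sys/kernel/slab kset, add every cache created during
	 * early boot and then resolve the aliases buffered in alias_list.
	 */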
	struct kmem_cache *s;
	int err;

	mutex_lock(&slab_mutex);

	slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
	if (!slab_kset) {
		mutex_unlock(&slab_mutex);
		pr_err("Cannot register slab subsystem.\n");
		return -ENOSYS;
	}

	slab_state = FULL;

	list_for_each_entry(s, &slab_caches, list) {
		err = sysfs_slab_add(s);
		if (err)
			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
			       s->name);
	}

	while (alias_list) {
		struct saved_alias *al = alias_list;

		alias_list = alias_list->next;
		err = sysfs_slab_alias(al->s, al->name);
		if (err)
			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
			       al->name);
		kfree(al);
	}

	mutex_unlock(&slab_mutex);
	resiliency_test();
	return 0;
}

__initcall(slab_sysfs_init);
#endif /* CONFIG_SYSFS */

/*
 * The /proc/slabinfo ABI
 */
#ifdef CONFIG_SLUB_DEBUG
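/*
 * Fill in the /proc/slabinfo counters by walking every node: active objects
 * are the total objects minus the free objects counted on partial slabs.
 */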
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
	unsigned long nr_slabs = 0;
	unsigned long nr_objs = 0;
	unsigned long nr_free = 0;
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n) {
		nr_slabs += node_nr_slabs(n);
		nr_objs += node_nr_objs(n);
		nr_free += count_partial(n, count_free);
	}

	sinfo->active_objs = nr_objs - nr_free;
	sinfo->num_objs = nr_objs;
	sinfo->active_slabs = nr_slabs;
	sinfo->num_slabs = nr_slabs;
	sinfo->objects_per_slab = oo_objects(s->oo);
	sinfo->cache_order = oo_order(s->oo);
}

void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
{
}

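/*
 * SLUB does not support the cache tunables that SLAB accepts through writes
 * to /proc/slabinfo, so any write is rejected.
 */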
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
		       size_t count, loff_t *ppos)
{
	return -EIO;
}
#endif /* CONFIG_SLUB_DEBUG */