workqueue.c 103.3 KB
Newer Older
L
Linus Torvalds 已提交
1
/*
T
Tejun Heo 已提交
2
 * kernel/workqueue.c - generic async execution with shared worker pool
L
Linus Torvalds 已提交
3
 *
T
Tejun Heo 已提交
4
 * Copyright (C) 2002		Ingo Molnar
L
Linus Torvalds 已提交
5
 *
T
Tejun Heo 已提交
6 7 8 9 10
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
L
Linus Torvalds 已提交
11
 *
T
Tejun Heo 已提交
12
 * Made to use alloc_percpu by Christoph Lameter.
L
Linus Torvalds 已提交
13
 *
T
Tejun Heo 已提交
14 15
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
16
 *
T
Tejun Heo 已提交
17 18 19 20 21 22 23
 * This is the generic async execution mechanism.  Work items as are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There is one worker pool for each CPU and
 * one extra for works which are better served by workers which are
 * not bound to any specific CPU.
 *
 * Please read Documentation/workqueue.txt for details.
L
Linus Torvalds 已提交
24 25
 */

26
#include <linux/export.h>
L
Linus Torvalds 已提交
27 28 29 30 31 32 33 34 35 36
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
37
#include <linux/hardirq.h>
38
#include <linux/mempolicy.h>
39
#include <linux/freezer.h>
40 41
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
42
#include <linux/lockdep.h>
T
Tejun Heo 已提交
43
#include <linux/idr.h>
44
#include <linux/hashtable.h>
45

46
#include "workqueue_internal.h"
L
Linus Torvalds 已提交
47

T
Tejun Heo 已提交
48
enum {
49 50
	/*
	 * worker_pool flags
51
	 *
52
	 * A bound pool is either associated or disassociated with its CPU.
53 54 55 56 57 58
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
59
	 * be executing on any CPU.  The pool behaves as an unbound one.
60 61
	 *
	 * Note that DISASSOCIATED can be flipped only while holding
62 63
	 * assoc_mutex to avoid changing binding state while
	 * create_worker() is in progress.
64
	 */
65
	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
66
	POOL_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
67
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
68
	POOL_FREEZING		= 1 << 3,	/* freeze in progress */
69

T
Tejun Heo 已提交
70 71 72 73
	/* worker flags */
	WORKER_STARTED		= 1 << 0,	/* started */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
74
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
75
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
76
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
77

78
	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_UNBOUND |
79
				  WORKER_CPU_INTENSIVE,
80

81
	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */
82

T
Tejun Heo 已提交
83
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
84

85 86 87
	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

88 89 90
	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
91 92 93 94 95 96 97 98
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breath after fail */

	/*
	 * Rescue workers are used only on emergencies and shared by
	 * all cpus.  Give -20.
	 */
	RESCUER_NICE_LEVEL	= -20,
99
	HIGHPRI_NICE_LEVEL	= -20,
T
Tejun Heo 已提交
100
};
L
Linus Torvalds 已提交
101 102

/*
T
Tejun Heo 已提交
103 104
 * Structure fields follow one of the following exclusion rules.
 *
105 106
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
T
Tejun Heo 已提交
107
 *
108 109 110
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
111
 * L: pool->lock protected.  Access with pool->lock held.
T
Tejun Heo 已提交
112
 *
113 114 115 116
 * X: During normal operation, modification requires pool->lock and should
 *    be done only from local cpu.  Either disabling preemption on local
 *    cpu or grabbing pool->lock is enough for read access.  If
 *    POOL_DISASSOCIATED is set, it's identical to L.
117
 *
118 119
 * F: wq->flush_mutex protected.
 *
T
Tejun Heo 已提交
120
 * W: workqueue_lock protected.
L
Linus Torvalds 已提交
121 122
 */

123
/* struct worker is defined in workqueue_internal.h */
T
Tejun Heo 已提交
124

125
struct worker_pool {
126
	spinlock_t		lock;		/* the pool lock */
127
	unsigned int		cpu;		/* I: the associated cpu */
T
Tejun Heo 已提交
128
	int			id;		/* I: pool ID */
129
	unsigned int		flags;		/* X: flags */
130 131 132

	struct list_head	worklist;	/* L: list of pending works */
	int			nr_workers;	/* L: total number of workers */
133 134

	/* nr_idle includes the ones off idle_list for rebinding */
135 136 137 138 139 140
	int			nr_idle;	/* L: currently idle ones */

	struct list_head	idle_list;	/* X: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

141 142 143 144
	/* workers are chained either in busy_hash or idle_list */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

145
	struct mutex		assoc_mutex;	/* protect POOL_DISASSOCIATED */
146
	struct ida		worker_ida;	/* L: for worker IDs */
147 148 149 150 151 152 153

	/*
	 * The current concurrency level.  As it's likely to be accessed
	 * from other CPUs during try_to_wake_up(), put it in a separate
	 * cacheline.
	 */
	atomic_t		nr_running ____cacheline_aligned_in_smp;
154 155
} ____cacheline_aligned_in_smp;

L
Linus Torvalds 已提交
156
/*
157 158 159 160
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits.
L
Linus Torvalds 已提交
161
 */
162
struct pool_workqueue {
163
	struct worker_pool	*pool;		/* I: the associated pool */
T
Tejun Heo 已提交
164
	struct workqueue_struct *wq;		/* I: the owning workqueue */
165 166 167 168
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
169
	int			nr_active;	/* L: nr of active works */
170
	int			max_active;	/* L: max active works */
171
	struct list_head	delayed_works;	/* L: delayed works */
172
	struct list_head	pwqs_node;	/* I: node on wq->pwqs */
173
	struct list_head	mayday_node;	/* W: node on wq->maydays */
174
} __aligned(1 << WORK_STRUCT_FLAG_BITS);
L
Linus Torvalds 已提交
175

176 177 178 179 180 181 182 183 184
/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* F: list of flushers */
	int			flush_color;	/* F: flush color waiting for */
	struct completion	done;		/* flush completion */
};

L
Linus Torvalds 已提交
185 186 187 188 189
/*
 * The externally visible workqueue abstraction is an array of
 * per-CPU workqueues:
 */
struct workqueue_struct {
190
	unsigned int		flags;		/* W: WQ_* flags */
191
	union {
192 193
		struct pool_workqueue __percpu		*pcpu;
		struct pool_workqueue			*single;
194
		unsigned long				v;
195
	} pool_wq;				/* I: pwq's */
196
	struct list_head	pwqs;		/* I: all pwqs of this wq */
T
Tejun Heo 已提交
197
	struct list_head	list;		/* W: list of all workqueues */
198 199 200 201

	struct mutex		flush_mutex;	/* protects wq flushing */
	int			work_color;	/* F: current work color */
	int			flush_color;	/* F: current flush color */
202
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
203 204 205 206
	struct wq_flusher	*first_flusher;	/* F: first flusher */
	struct list_head	flusher_queue;	/* F: flush waiters */
	struct list_head	flusher_overflow; /* F: flush overflow list */

207
	struct list_head	maydays;	/* W: pwqs requesting rescue */
208 209
	struct worker		*rescuer;	/* I: rescue worker */

210
	int			nr_drainers;	/* W: drain in progress */
211
	int			saved_max_active; /* W: saved pwq max_active */
212
#ifdef CONFIG_LOCKDEP
T
Tejun Heo 已提交
213
	struct lockdep_map	lockdep_map;
214
#endif
215
	char			name[];		/* I: workqueue name */
L
Linus Torvalds 已提交
216 217
};

218 219
static struct kmem_cache *pwq_cache;

220 221
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_wq);
V
Valentin Ilie 已提交
222
struct workqueue_struct *system_highpri_wq __read_mostly;
223
EXPORT_SYMBOL_GPL(system_highpri_wq);
V
Valentin Ilie 已提交
224
struct workqueue_struct *system_long_wq __read_mostly;
225
EXPORT_SYMBOL_GPL(system_long_wq);
V
Valentin Ilie 已提交
226
struct workqueue_struct *system_unbound_wq __read_mostly;
227
EXPORT_SYMBOL_GPL(system_unbound_wq);
V
Valentin Ilie 已提交
228
struct workqueue_struct *system_freezable_wq __read_mostly;
229
EXPORT_SYMBOL_GPL(system_freezable_wq);
230

231 232 233
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

234
#define for_each_std_worker_pool(pool, cpu)				\
T
Tejun Heo 已提交
235 236
	for ((pool) = &std_worker_pools(cpu)[0];			\
	     (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
237

238 239
#define for_each_busy_worker(worker, i, pool)				\
	hash_for_each(pool->busy_hash, i, worker, hentry)
240

241 242
static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
				unsigned int sw)
243 244 245 246 247 248 249 250 251 252
{
	if (cpu < nr_cpu_ids) {
		if (sw & 1) {
			cpu = cpumask_next(cpu, mask);
			if (cpu < nr_cpu_ids)
				return cpu;
		}
		if (sw & 2)
			return WORK_CPU_UNBOUND;
	}
253
	return WORK_CPU_END;
254 255
}

256 257 258
/*
 * CPU iterators
 *
259
 * An extra cpu number is defined using an invalid cpu number
260
 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
261 262
 * specific CPU.  The following iterators are similar to for_each_*_cpu()
 * iterators but also considers the unbound CPU.
263
 *
264 265
 * for_each_wq_cpu()		: possible CPUs + WORK_CPU_UNBOUND
 * for_each_online_wq_cpu()	: online CPUs + WORK_CPU_UNBOUND
266
 */
267 268
#define for_each_wq_cpu(cpu)						\
	for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3);		\
269
	     (cpu) < WORK_CPU_END;					\
270
	     (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
271

272 273
#define for_each_online_wq_cpu(cpu)					\
	for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3);		\
274
	     (cpu) < WORK_CPU_END;					\
275
	     (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
276

T
Tejun Heo 已提交
277 278 279 280 281 282 283 284
/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @id: integer used for iteration
 */
#define for_each_pool(pool, id)						\
	idr_for_each_entry(&worker_pool_idr, pool, id)

285 286 287 288 289 290 291
/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 */
#define for_each_pwq(pwq, wq)						\
	list_for_each_entry((pwq), &(wq)->pwqs, pwqs_node)
292

293 294 295 296
#ifdef CONFIG_DEBUG_OBJECTS_WORK

static struct debug_obj_descr work_debug_descr;

297 298 299 300 301
static void *work_debug_hint(void *addr)
{
	return ((struct work_struct *) addr)->func;
}

302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static int work_fixup_init(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_init(work, &work_debug_descr);
		return 1;
	default:
		return 0;
	}
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown object is activated (might be a statically initialized object)
 */
static int work_fixup_activate(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {

	case ODEBUG_STATE_NOTAVAILABLE:
		/*
		 * This is not really a fixup. The work struct was
		 * statically initialized. We just make sure that it
		 * is tracked in the object tracker.
		 */
337
		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372
			debug_object_init(work, &work_debug_descr);
			debug_object_activate(work, &work_debug_descr);
			return 0;
		}
		WARN_ON_ONCE(1);
		return 0;

	case ODEBUG_STATE_ACTIVE:
		WARN_ON(1);

	default:
		return 0;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static int work_fixup_free(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_free(work, &work_debug_descr);
		return 1;
	default:
		return 0;
	}
}

static struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
373
	.debug_hint	= work_debug_hint,
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
	.fixup_init	= work_fixup_init,
	.fixup_activate	= work_fixup_activate,
	.fixup_free	= work_fixup_free,
};

static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}

static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}

void __init_work(struct work_struct *work, int onstack)
{
	if (onstack)
		debug_object_init_on_stack(work, &work_debug_descr);
	else
		debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif

409 410
/* Serializes the accesses to the list of workqueues. */
static DEFINE_SPINLOCK(workqueue_lock);
L
Linus Torvalds 已提交
411
static LIST_HEAD(workqueues);
412
static bool workqueue_freezing;		/* W: have wqs started freezing? */
T
Tejun Heo 已提交
413

414
/*
415 416
 * The CPU and unbound standard worker pools.  The unbound ones have
 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
417
 */
418 419
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
				     cpu_std_worker_pools);
T
Tejun Heo 已提交
420
static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
421

T
Tejun Heo 已提交
422 423 424 425
/* idr of all pools */
static DEFINE_MUTEX(worker_pool_idr_mutex);
static DEFINE_IDR(worker_pool_idr);

T
Tejun Heo 已提交
426
static int worker_thread(void *__worker);
L
Linus Torvalds 已提交
427

T
Tejun Heo 已提交
428
static struct worker_pool *std_worker_pools(int cpu)
429
{
430
	if (cpu != WORK_CPU_UNBOUND)
T
Tejun Heo 已提交
431
		return per_cpu(cpu_std_worker_pools, cpu);
432
	else
T
Tejun Heo 已提交
433
		return unbound_std_worker_pools;
434 435
}

T
Tejun Heo 已提交
436 437
static int std_worker_pool_pri(struct worker_pool *pool)
{
T
Tejun Heo 已提交
438
	return pool - std_worker_pools(pool->cpu);
T
Tejun Heo 已提交
439 440
}

T
Tejun Heo 已提交
441 442 443 444 445 446 447 448 449 450 451 452 453
/* allocate ID and assign it to @pool */
static int worker_pool_assign_id(struct worker_pool *pool)
{
	int ret;

	mutex_lock(&worker_pool_idr_mutex);
	idr_pre_get(&worker_pool_idr, GFP_KERNEL);
	ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
	mutex_unlock(&worker_pool_idr_mutex);

	return ret;
}

454 455 456 457 458 459 460 461 462
/*
 * Lookup worker_pool by id.  The idr currently is built during boot and
 * never modified.  Don't worry about locking for now.
 */
static struct worker_pool *worker_pool_by_id(int pool_id)
{
	return idr_find(&worker_pool_idr, pool_id);
}

463 464
static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
{
T
Tejun Heo 已提交
465
	struct worker_pool *pools = std_worker_pools(cpu);
466

T
Tejun Heo 已提交
467
	return &pools[highpri];
468 469
}

470 471
static struct pool_workqueue *get_pwq(unsigned int cpu,
				      struct workqueue_struct *wq)
472
{
473
	if (!(wq->flags & WQ_UNBOUND)) {
474
		if (likely(cpu < nr_cpu_ids))
475
			return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
476
	} else if (likely(cpu == WORK_CPU_UNBOUND))
477
		return wq->pool_wq.single;
478
	return NULL;
479 480
}

481 482 483 484 485 486 487 488 489 490 491 492 493 494 495
static unsigned int work_color_to_flags(int color)
{
	return color << WORK_STRUCT_COLOR_SHIFT;
}

static int get_work_color(struct work_struct *work)
{
	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
		((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

static int work_next_color(int color)
{
	return (color + 1) % WORK_NR_COLORS;
}
L
Linus Torvalds 已提交
496

497
/*
498 499
 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
 * contain the pointer to the queued pwq.  Once execution starts, the flag
500
 * is cleared and the high bits contain OFFQ flags and pool ID.
501
 *
502 503
 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
 * and clear_work_data() can be used to set the pwq, pool or clear
504 505
 * work->data.  These functions should only be called while the work is
 * owned - ie. while the PENDING bit is set.
506
 *
507
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
508
 * corresponding to a work.  Pool is available once the work has been
509
 * queued anywhere after initialization until it is sync canceled.  pwq is
510
 * available only while the work item is queued.
511
 *
512 513 514 515
 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
 * canceled.  While being canceled, a work item may have its PENDING set
 * but stay off timer and worklist for arbitrarily long and nobody should
 * try to steal the PENDING bit.
516
 */
517 518
static inline void set_work_data(struct work_struct *work, unsigned long data,
				 unsigned long flags)
519
{
520
	WARN_ON_ONCE(!work_pending(work));
521 522
	atomic_long_set(&work->data, data | flags | work_static(work));
}
523

524
static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
525 526
			 unsigned long extra_flags)
{
527 528
	set_work_data(work, (unsigned long)pwq,
		      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
529 530
}

531 532 533 534 535 536 537
static void set_work_pool_and_keep_pending(struct work_struct *work,
					   int pool_id)
{
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
		      WORK_STRUCT_PENDING);
}

538 539
static void set_work_pool_and_clear_pending(struct work_struct *work,
					    int pool_id)
540
{
541 542 543 544 545 546 547
	/*
	 * The following wmb is paired with the implied mb in
	 * test_and_set_bit(PENDING) and ensures all updates to @work made
	 * here are visible to and precede any updates by the next PENDING
	 * owner.
	 */
	smp_wmb();
548
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
549
}
550

551
static void clear_work_data(struct work_struct *work)
L
Linus Torvalds 已提交
552
{
553 554
	smp_wmb();	/* see set_work_pool_and_clear_pending() */
	set_work_data(work, WORK_STRUCT_NO_POOL, 0);
L
Linus Torvalds 已提交
555 556
}

557
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
558
{
559
	unsigned long data = atomic_long_read(&work->data);
560

561
	if (data & WORK_STRUCT_PWQ)
562 563 564
		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
	else
		return NULL;
565 566
}

567 568 569 570 571 572 573
/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Return the worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
574
{
575
	unsigned long data = atomic_long_read(&work->data);
576 577
	struct worker_pool *pool;
	int pool_id;
578

579 580
	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
581
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool;
582

583 584
	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
	if (pool_id == WORK_OFFQ_POOL_NONE)
585 586
		return NULL;

587 588 589 590 591 592 593 594 595 596 597 598 599 600
	pool = worker_pool_by_id(pool_id);
	WARN_ON_ONCE(!pool);
	return pool;
}

/**
 * get_work_pool_id - return the worker pool ID a given work is associated with
 * @work: the work item of interest
 *
 * Return the worker_pool ID @work was last associated with.
 * %WORK_OFFQ_POOL_NONE if none.
 */
static int get_work_pool_id(struct work_struct *work)
{
601 602
	unsigned long data = atomic_long_read(&work->data);

603 604
	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
605
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
606

607
	return data >> WORK_OFFQ_POOL_SHIFT;
608 609
}

610 611
static void mark_work_canceling(struct work_struct *work)
{
612
	unsigned long pool_id = get_work_pool_id(work);
613

614 615
	pool_id <<= WORK_OFFQ_POOL_SHIFT;
	set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
616 617 618 619 620 621
}

static bool work_is_canceling(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

622
	return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
623 624
}

625
/*
626 627
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
628
 * they're being called with pool->lock held.
629 630
 */

631
static bool __need_more_worker(struct worker_pool *pool)
632
{
633
	return !atomic_read(&pool->nr_running);
634 635
}

636
/*
637 638
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
639 640
 *
 * Note that, because unbound workers never contribute to nr_running, this
641
 * function will always return %true for unbound pools as long as the
642
 * worklist isn't empty.
643
 */
644
static bool need_more_worker(struct worker_pool *pool)
645
{
646
	return !list_empty(&pool->worklist) && __need_more_worker(pool);
647
}
648

649
/* Can I start working?  Called from busy but !running workers. */
650
static bool may_start_working(struct worker_pool *pool)
651
{
652
	return pool->nr_idle;
653 654 655
}

/* Do I need to keep working?  Called from currently running workers. */
656
static bool keep_working(struct worker_pool *pool)
657
{
658 659
	return !list_empty(&pool->worklist) &&
		atomic_read(&pool->nr_running) <= 1;
660 661 662
}

/* Do we need a new worker?  Called from manager. */
663
static bool need_to_create_worker(struct worker_pool *pool)
664
{
665
	return need_more_worker(pool) && !may_start_working(pool);
666
}
667

668
/* Do I need to be the manager? */
669
static bool need_to_manage_workers(struct worker_pool *pool)
670
{
671
	return need_to_create_worker(pool) ||
672
		(pool->flags & POOL_MANAGE_WORKERS);
673 674 675
}

/* Do we have too many workers and should some go away? */
676
static bool too_many_workers(struct worker_pool *pool)
677
{
678
	bool managing = pool->flags & POOL_MANAGING_WORKERS;
679 680
	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
	int nr_busy = pool->nr_workers - nr_idle;
681

682 683 684 685 686 687 688
	/*
	 * nr_idle and idle_list may disagree if idle rebinding is in
	 * progress.  Never return %true if idle_list is empty.
	 */
	if (list_empty(&pool->idle_list))
		return false;

689
	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
690 691
}

692
/*
693 694 695
 * Wake up functions.
 */

696
/* Return the first worker.  Safe with preemption disabled */
697
static struct worker *first_worker(struct worker_pool *pool)
698
{
699
	if (unlikely(list_empty(&pool->idle_list)))
700 701
		return NULL;

702
	return list_first_entry(&pool->idle_list, struct worker, entry);
703 704 705 706
}

/**
 * wake_up_worker - wake up an idle worker
707
 * @pool: worker pool to wake worker from
708
 *
709
 * Wake up the first idle worker of @pool.
710 711
 *
 * CONTEXT:
712
 * spin_lock_irq(pool->lock).
713
 */
714
static void wake_up_worker(struct worker_pool *pool)
715
{
716
	struct worker *worker = first_worker(pool);
717 718 719 720 721

	if (likely(worker))
		wake_up_process(worker->task);
}

722
/**
723 724 725 726 727 728 729 730 731 732 733 734 735 736
 * wq_worker_waking_up - a worker is waking up
 * @task: task waking up
 * @cpu: CPU @task is waking up to
 *
 * This function is called during try_to_wake_up() when a worker is
 * being awoken.
 *
 * CONTEXT:
 * spin_lock_irq(rq->lock)
 */
void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
{
	struct worker *worker = kthread_data(task);

737
	if (!(worker->flags & WORKER_NOT_RUNNING)) {
738
		WARN_ON_ONCE(worker->pool->cpu != cpu);
739
		atomic_inc(&worker->pool->nr_running);
740
	}
741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 * @cpu: CPU in question, must be the current CPU number
 *
 * This function is called during schedule() when a busy worker is
 * going to sleep.  Worker on the same cpu can be woken up by
 * returning pointer to its task.
 *
 * CONTEXT:
 * spin_lock_irq(rq->lock)
 *
 * RETURNS:
 * Worker task on @cpu to wake up, %NULL if none.
 */
struct task_struct *wq_worker_sleeping(struct task_struct *task,
				       unsigned int cpu)
{
	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
762
	struct worker_pool *pool;
763

764 765 766 767 768
	/*
	 * Rescuers, which may not have all the fields set up like normal
	 * workers, also reach here, let's not access anything before
	 * checking NOT_RUNNING.
	 */
769
	if (worker->flags & WORKER_NOT_RUNNING)
770 771
		return NULL;

772 773
	pool = worker->pool;

774
	/* this can only happen on the local cpu */
775 776
	if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
		return NULL;
777 778 779 780 781 782

	/*
	 * The counterpart of the following dec_and_test, implied mb,
	 * worklist not empty test sequence is in insert_work().
	 * Please read comment there.
	 *
783 784 785
	 * NOT_RUNNING is clear.  This means that we're bound to and
	 * running on the local cpu w/ rq lock held and preemption
	 * disabled, which in turn means that none else could be
786
	 * manipulating idle_list, so dereferencing idle_list without pool
787
	 * lock is safe.
788
	 */
789 790
	if (atomic_dec_and_test(&pool->nr_running) &&
	    !list_empty(&pool->worklist))
791
		to_wakeup = first_worker(pool);
792 793 794 795 796
	return to_wakeup ? to_wakeup->task : NULL;
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
797
 * @worker: self
798 799 800
 * @flags: flags to set
 * @wakeup: wakeup an idle worker if necessary
 *
801 802 803
 * Set @flags in @worker->flags and adjust nr_running accordingly.  If
 * nr_running becomes zero and @wakeup is %true, an idle worker is
 * woken up.
804
 *
805
 * CONTEXT:
806
 * spin_lock_irq(pool->lock)
807 808 809 810
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags,
				    bool wakeup)
{
811
	struct worker_pool *pool = worker->pool;
812

813 814
	WARN_ON_ONCE(worker->task != current);

815 816 817 818 819 820 821 822
	/*
	 * If transitioning into NOT_RUNNING, adjust nr_running and
	 * wake up an idle worker as necessary if requested by
	 * @wakeup.
	 */
	if ((flags & WORKER_NOT_RUNNING) &&
	    !(worker->flags & WORKER_NOT_RUNNING)) {
		if (wakeup) {
823
			if (atomic_dec_and_test(&pool->nr_running) &&
824
			    !list_empty(&pool->worklist))
825
				wake_up_worker(pool);
826
		} else
827
			atomic_dec(&pool->nr_running);
828 829
	}

830 831 832 833
	worker->flags |= flags;
}

/**
834
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
835
 * @worker: self
836 837
 * @flags: flags to clear
 *
838
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
839
 *
840
 * CONTEXT:
841
 * spin_lock_irq(pool->lock)
842 843 844
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
845
	struct worker_pool *pool = worker->pool;
846 847
	unsigned int oflags = worker->flags;

848 849
	WARN_ON_ONCE(worker->task != current);

850
	worker->flags &= ~flags;
851

852 853 854 855 856
	/*
	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
	 * of multiple flags, not a single flag.
	 */
857 858
	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
859
			atomic_inc(&pool->nr_running);
860 861
}

862 863
/**
 * find_worker_executing_work - find worker which is executing a work
864
 * @pool: pool of interest
865 866
 * @work: work to find worker for
 *
867 868
 * Find a worker which is executing @work on @pool by searching
 * @pool->busy_hash which is keyed by the address of @work.  For a worker
869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887
 * to match, its current execution should match the address of @work and
 * its work function.  This is to avoid unwanted dependency between
 * unrelated work executions through a work item being recycled while still
 * being executed.
 *
 * This is a bit tricky.  A work item may be freed once its execution
 * starts and nothing prevents the freed area from being recycled for
 * another work item.  If the same work item address ends up being reused
 * before the original execution finishes, workqueue will identify the
 * recycled work item as currently executing and make it wait until the
 * current execution finishes, introducing an unwanted dependency.
 *
 * This function checks the work item address, work function and workqueue
 * to avoid false positives.  Note that this isn't complete as one may
 * construct a work function which can introduce dependency onto itself
 * through a recycled work item.  Well, if somebody wants to shoot oneself
 * in the foot that badly, there's only so much we can do, and if such
 * deadlock actually occurs, it should be easy to locate the culprit work
 * function.
888 889
 *
 * CONTEXT:
890
 * spin_lock_irq(pool->lock).
891 892 893 894
 *
 * RETURNS:
 * Pointer to worker which is executing @work if found, NULL
 * otherwise.
895
 */
896
static struct worker *find_worker_executing_work(struct worker_pool *pool,
897
						 struct work_struct *work)
898
{
899 900
	struct worker *worker;

901
	hash_for_each_possible(pool->busy_hash, worker, hentry,
902 903 904
			       (unsigned long)work)
		if (worker->current_work == work &&
		    worker->current_func == work->func)
905 906 907
			return worker;

	return NULL;
908 909
}

910 911 912 913 914 915 916 917 918 919 920 921 922 923 924
/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out paramter for nested worklist walking
 *
 * Schedule linked works starting from @work to @head.  Work series to
 * be scheduled starts at @work and includes any consecutive work with
 * WORK_STRUCT_LINKED set in its predecessor.
 *
 * If @nextp is not NULL, it's updated to point to the next work of
 * the last scheduled work.  This allows move_linked_works() to be
 * nested inside outer list_for_each_entry_safe().
 *
 * CONTEXT:
925
 * spin_lock_irq(pool->lock).
926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
			      struct work_struct **nextp)
{
	struct work_struct *n;

	/*
	 * Linked worklist will always end before the end of the list,
	 * use NULL for list head.
	 */
	list_for_each_entry_safe_from(work, n, NULL, entry) {
		list_move_tail(&work->entry, head);
		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
			break;
	}

	/*
	 * If we're already inside safe list traversal and have moved
	 * multiple works to the scheduled queue, the next position
	 * needs to be updated.
	 */
	if (nextp)
		*nextp = n;
}

951
static void pwq_activate_delayed_work(struct work_struct *work)
952
{
953
	struct pool_workqueue *pwq = get_work_pwq(work);
954 955

	trace_workqueue_activate_work(work);
956
	move_linked_works(work, &pwq->pool->worklist, NULL);
957
	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
958
	pwq->nr_active++;
959 960
}

961
static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
962
{
963
	struct work_struct *work = list_first_entry(&pwq->delayed_works,
964 965
						    struct work_struct, entry);

966
	pwq_activate_delayed_work(work);
967 968
}

969
/**
970 971
 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
 * @pwq: pwq of interest
972 973 974
 * @color: color of work which left the queue
 *
 * A work either has completed or is removed from pending queue,
975
 * decrement nr_in_flight of its pwq and handle workqueue flushing.
976 977
 *
 * CONTEXT:
978
 * spin_lock_irq(pool->lock).
979
 */
980
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
981 982 983 984 985
{
	/* ignore uncolored works */
	if (color == WORK_NO_COLOR)
		return;

986
	pwq->nr_in_flight[color]--;
987

988 989
	pwq->nr_active--;
	if (!list_empty(&pwq->delayed_works)) {
990
		/* one down, submit a delayed one */
991 992
		if (pwq->nr_active < pwq->max_active)
			pwq_activate_first_delayed(pwq);
993 994 995
	}

	/* is flush in progress and are we at the flushing tip? */
996
	if (likely(pwq->flush_color != color))
997 998 999
		return;

	/* are there still in-flight works? */
1000
	if (pwq->nr_in_flight[color])
1001 1002
		return;

1003 1004
	/* this pwq is done, clear flush_color */
	pwq->flush_color = -1;
1005 1006

	/*
1007
	 * If this was the last pwq, wake up the first flusher.  It
1008 1009
	 * will handle the rest.
	 */
1010 1011
	if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
		complete(&pwq->wq->first_flusher->done);
1012 1013
}

1014
/**
1015
 * try_to_grab_pending - steal work item from worklist and disable irq
1016 1017
 * @work: work item to steal
 * @is_dwork: @work is a delayed_work
1018
 * @flags: place to store irq state
1019 1020 1021 1022 1023 1024 1025
 *
 * Try to grab PENDING bit of @work.  This function can handle @work in any
 * stable state - idle, on timer or on worklist.  Return values are
 *
 *  1		if @work was pending and we successfully stole PENDING
 *  0		if @work was idle and we claimed PENDING
 *  -EAGAIN	if PENDING couldn't be grabbed at the moment, safe to busy-retry
1026 1027
 *  -ENOENT	if someone else is canceling @work, this state may persist
 *		for arbitrarily long
1028
 *
1029
 * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
1030 1031 1032
 * interrupted while holding PENDING and @work off queue, irq must be
 * disabled on entry.  This, combined with delayed_work->timer being
 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
1033 1034 1035 1036
 *
 * On successful return, >= 0, irq is disabled and the caller is
 * responsible for releasing it using local_irq_restore(*@flags).
 *
1037
 * This function is safe to call from any context including IRQ handler.
1038
 */
1039 1040
static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
			       unsigned long *flags)
1041
{
1042
	struct worker_pool *pool;
1043
	struct pool_workqueue *pwq;
1044

1045 1046
	local_irq_save(*flags);

1047 1048 1049 1050
	/* try to steal the timer if it exists */
	if (is_dwork) {
		struct delayed_work *dwork = to_delayed_work(work);

1051 1052 1053 1054 1055
		/*
		 * dwork->timer is irqsafe.  If del_timer() fails, it's
		 * guaranteed that the timer is not queued anywhere and not
		 * running on the local CPU.
		 */
1056 1057 1058 1059 1060
		if (likely(del_timer(&dwork->timer)))
			return 1;
	}

	/* try to claim PENDING the normal way */
1061 1062 1063 1064 1065 1066 1067
	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
		return 0;

	/*
	 * The queueing is in progress, or it is already queued. Try to
	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
	 */
1068 1069
	pool = get_work_pool(work);
	if (!pool)
1070
		goto fail;
1071

1072
	spin_lock(&pool->lock);
1073
	/*
1074 1075 1076 1077 1078
	 * work->data is guaranteed to point to pwq only while the work
	 * item is queued on pwq->wq, and both updating work->data to point
	 * to pwq on queueing and to pool on dequeueing are done under
	 * pwq->pool->lock.  This in turn guarantees that, if work->data
	 * points to pwq which is associated with a locked pool, the work
1079 1080
	 * item is currently queued on that pool.
	 */
1081 1082
	pwq = get_work_pwq(work);
	if (pwq && pwq->pool == pool) {
1083 1084 1085 1086 1087
		debug_work_deactivate(work);

		/*
		 * A delayed work item cannot be grabbed directly because
		 * it might have linked NO_COLOR work items which, if left
1088
		 * on the delayed_list, will confuse pwq->nr_active
1089 1090 1091 1092
		 * management later on and cause stall.  Make sure the work
		 * item is activated before grabbing.
		 */
		if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1093
			pwq_activate_delayed_work(work);
1094 1095

		list_del_init(&work->entry);
1096
		pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
1097

1098
		/* work->data points to pwq iff queued, point to pool */
1099 1100 1101 1102
		set_work_pool_and_keep_pending(work, pool->id);

		spin_unlock(&pool->lock);
		return 1;
1103
	}
1104
	spin_unlock(&pool->lock);
1105 1106 1107 1108 1109
fail:
	local_irq_restore(*flags);
	if (work_is_canceling(work))
		return -ENOENT;
	cpu_relax();
1110
	return -EAGAIN;
1111 1112
}

T
Tejun Heo 已提交
1113
/**
1114
 * insert_work - insert a work into a pool
1115
 * @pwq: pwq @work belongs to
T
Tejun Heo 已提交
1116 1117 1118 1119
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
1120
 * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
1121
 * work_struct flags.
T
Tejun Heo 已提交
1122 1123
 *
 * CONTEXT:
1124
 * spin_lock_irq(pool->lock).
T
Tejun Heo 已提交
1125
 */
1126 1127
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
			struct list_head *head, unsigned int extra_flags)
O
Oleg Nesterov 已提交
1128
{
1129
	struct worker_pool *pool = pwq->pool;
1130

T
Tejun Heo 已提交
1131
	/* we own @work, set data and link */
1132
	set_work_pwq(work, pwq, extra_flags);
1133
	list_add_tail(&work->entry, head);
1134 1135 1136 1137 1138 1139 1140 1141

	/*
	 * Ensure either worker_sched_deactivated() sees the above
	 * list_add_tail() or we see zero nr_running to avoid workers
	 * lying around lazily while there are works to be processed.
	 */
	smp_mb();

1142 1143
	if (__need_more_worker(pool))
		wake_up_worker(pool);
O
Oleg Nesterov 已提交
1144 1145
}

1146 1147
/*
 * Test whether @work is being queued from another work executing on the
1148
 * same workqueue.
1149 1150 1151
 */
static bool is_chained_work(struct workqueue_struct *wq)
{
1152 1153 1154 1155 1156 1157 1158
	struct worker *worker;

	worker = current_wq_worker();
	/*
	 * Return %true iff I'm a worker execuing a work item on @wq.  If
	 * I'm @worker, it's safe to dereference it without locking.
	 */
1159
	return worker && worker->current_pwq->wq == wq;
1160 1161
}

T
Tejun Heo 已提交
1162
static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
L
Linus Torvalds 已提交
1163 1164
			 struct work_struct *work)
{
1165
	struct pool_workqueue *pwq;
1166
	struct list_head *worklist;
1167
	unsigned int work_flags;
1168
	unsigned int req_cpu = cpu;
1169 1170 1171 1172 1173 1174 1175 1176

	/*
	 * While a work item is PENDING && off queue, a task trying to
	 * steal the PENDING will busy-loop waiting for it to either get
	 * queued or lose PENDING.  Grabbing PENDING and queueing should
	 * happen with IRQ disabled.
	 */
	WARN_ON_ONCE(!irqs_disabled());
L
Linus Torvalds 已提交
1177

1178
	debug_work_activate(work);
1179

1180
	/* if dying, only works from the same workqueue are allowed */
1181
	if (unlikely(wq->flags & WQ_DRAINING) &&
1182
	    WARN_ON_ONCE(!is_chained_work(wq)))
1183 1184
		return;

1185
	/* determine the pwq to use */
1186
	if (!(wq->flags & WQ_UNBOUND)) {
1187
		struct worker_pool *last_pool;
1188

1189
		if (cpu == WORK_CPU_UNBOUND)
1190 1191
			cpu = raw_smp_processor_id();

1192
		/*
1193 1194 1195 1196
		 * It's multi cpu.  If @work was previously on a different
		 * cpu, it might still be running there, in which case the
		 * work needs to be queued on that cpu to guarantee
		 * non-reentrancy.
1197
		 */
1198
		pwq = get_pwq(cpu, wq);
1199
		last_pool = get_work_pool(work);
1200

1201
		if (last_pool && last_pool != pwq->pool) {
1202 1203
			struct worker *worker;

1204
			spin_lock(&last_pool->lock);
1205

1206
			worker = find_worker_executing_work(last_pool, work);
1207

1208 1209
			if (worker && worker->current_pwq->wq == wq) {
				pwq = get_pwq(last_pool->cpu, wq);
1210
			} else {
1211
				/* meh... not running there, queue here */
1212
				spin_unlock(&last_pool->lock);
1213
				spin_lock(&pwq->pool->lock);
1214
			}
1215
		} else {
1216
			spin_lock(&pwq->pool->lock);
1217
		}
1218
	} else {
1219 1220
		pwq = get_pwq(WORK_CPU_UNBOUND, wq);
		spin_lock(&pwq->pool->lock);
1221 1222
	}

1223 1224
	/* pwq determined, queue */
	trace_workqueue_queue_work(req_cpu, pwq, work);
1225

1226
	if (WARN_ON(!list_empty(&work->entry))) {
1227
		spin_unlock(&pwq->pool->lock);
1228 1229
		return;
	}
1230

1231 1232
	pwq->nr_in_flight[pwq->work_color]++;
	work_flags = work_color_to_flags(pwq->work_color);
1233

1234
	if (likely(pwq->nr_active < pwq->max_active)) {
1235
		trace_workqueue_activate_work(work);
1236 1237
		pwq->nr_active++;
		worklist = &pwq->pool->worklist;
1238 1239
	} else {
		work_flags |= WORK_STRUCT_DELAYED;
1240
		worklist = &pwq->delayed_works;
1241
	}
1242

1243
	insert_work(pwq, work, worklist, work_flags);
1244

1245
	spin_unlock(&pwq->pool->lock);
L
Linus Torvalds 已提交
1246 1247
}

1248
/**
1249 1250
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
1251 1252 1253
 * @wq: workqueue to use
 * @work: work to queue
 *
1254
 * Returns %false if @work was already on a queue, %true otherwise.
L
Linus Torvalds 已提交
1255
 *
1256 1257
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away.
L
Linus Torvalds 已提交
1258
 */
1259 1260
bool queue_work_on(int cpu, struct workqueue_struct *wq,
		   struct work_struct *work)
L
Linus Torvalds 已提交
1261
{
1262
	bool ret = false;
1263
	unsigned long flags;
1264

1265
	local_irq_save(flags);
1266

1267
	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
T
Tejun Heo 已提交
1268
		__queue_work(cpu, wq, work);
1269
		ret = true;
1270
	}
1271

1272
	local_irq_restore(flags);
L
Linus Torvalds 已提交
1273 1274
	return ret;
}
1275
EXPORT_SYMBOL_GPL(queue_work_on);
L
Linus Torvalds 已提交
1276

1277
/**
1278
 * queue_work - queue work on a workqueue
1279 1280 1281
 * @wq: workqueue to use
 * @work: work to queue
 *
1282
 * Returns %false if @work was already on a queue, %true otherwise.
1283
 *
1284 1285
 * We queue the work to the CPU on which it was submitted, but if the CPU dies
 * it can be processed by another CPU.
1286
 */
1287
bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1288
{
1289
	return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1290
}
1291
EXPORT_SYMBOL_GPL(queue_work);
1292

1293
void delayed_work_timer_fn(unsigned long __data)
L
Linus Torvalds 已提交
1294
{
1295
	struct delayed_work *dwork = (struct delayed_work *)__data;
L
Linus Torvalds 已提交
1296

1297
	/* should have been called from irqsafe timer with irq already off */
1298
	__queue_work(dwork->cpu, dwork->wq, &dwork->work);
L
Linus Torvalds 已提交
1299
}
1300
EXPORT_SYMBOL(delayed_work_timer_fn);
L
Linus Torvalds 已提交
1301

1302 1303
static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
				struct delayed_work *dwork, unsigned long delay)
L
Linus Torvalds 已提交
1304
{
1305 1306 1307 1308 1309
	struct timer_list *timer = &dwork->timer;
	struct work_struct *work = &dwork->work;

	WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
		     timer->data != (unsigned long)dwork);
1310 1311
	WARN_ON_ONCE(timer_pending(timer));
	WARN_ON_ONCE(!list_empty(&work->entry));
1312

1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323
	/*
	 * If @delay is 0, queue @dwork->work immediately.  This is for
	 * both optimization and correctness.  The earliest @timer can
	 * expire is on the closest next tick and delayed_work users depend
	 * on that there's no such delay when @delay is 0.
	 */
	if (!delay) {
		__queue_work(cpu, wq, &dwork->work);
		return;
	}

1324
	timer_stats_timer_set_start_info(&dwork->timer);
L
Linus Torvalds 已提交
1325

1326
	dwork->wq = wq;
1327
	dwork->cpu = cpu;
1328 1329 1330 1331 1332 1333
	timer->expires = jiffies + delay;

	if (unlikely(cpu != WORK_CPU_UNBOUND))
		add_timer_on(timer, cpu);
	else
		add_timer(timer);
L
Linus Torvalds 已提交
1334 1335
}

1336 1337 1338 1339
/**
 * queue_delayed_work_on - queue work on specific CPU after delay
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
1340
 * @dwork: work to queue
1341 1342
 * @delay: number of jiffies to wait before queueing
 *
1343 1344 1345
 * Returns %false if @work was already on a queue, %true otherwise.  If
 * @delay is zero and @dwork is idle, it will be scheduled for immediate
 * execution.
1346
 */
1347 1348
bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
			   struct delayed_work *dwork, unsigned long delay)
1349
{
1350
	struct work_struct *work = &dwork->work;
1351
	bool ret = false;
1352
	unsigned long flags;
1353

1354 1355
	/* read the comment in __queue_work() */
	local_irq_save(flags);
1356

1357
	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1358
		__queue_delayed_work(cpu, wq, dwork, delay);
1359
		ret = true;
1360
	}
1361

1362
	local_irq_restore(flags);
1363 1364
	return ret;
}
1365
EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1366

1367 1368 1369 1370 1371 1372
/**
 * queue_delayed_work - queue work on a workqueue after delay
 * @wq: workqueue to use
 * @dwork: delayable work to queue
 * @delay: number of jiffies to wait before queueing
 *
1373
 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
1374
 */
1375
bool queue_delayed_work(struct workqueue_struct *wq,
1376 1377
			struct delayed_work *dwork, unsigned long delay)
{
1378
	return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1379 1380
}
EXPORT_SYMBOL_GPL(queue_delayed_work);
1381

1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396
/**
 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
 * modify @dwork's timer so that it expires after @delay.  If @delay is
 * zero, @work is guaranteed to be scheduled immediately regardless of its
 * current state.
 *
 * Returns %false if @dwork was idle and queued, %true if @dwork was
 * pending and its timer was modified.
 *
1397
 * This function is safe to call from any context including IRQ handler.
1398 1399 1400 1401 1402 1403 1404
 * See try_to_grab_pending() for details.
 */
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
			 struct delayed_work *dwork, unsigned long delay)
{
	unsigned long flags;
	int ret;
1405

1406 1407 1408
	do {
		ret = try_to_grab_pending(&dwork->work, true, &flags);
	} while (unlikely(ret == -EAGAIN));
1409

1410 1411 1412
	if (likely(ret >= 0)) {
		__queue_delayed_work(cpu, wq, dwork, delay);
		local_irq_restore(flags);
1413
	}
1414 1415

	/* -ENOENT from try_to_grab_pending() becomes %true */
1416 1417
	return ret;
}
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433
EXPORT_SYMBOL_GPL(mod_delayed_work_on);

/**
 * mod_delayed_work - modify delay of or queue a delayed work
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * mod_delayed_work_on() on local CPU.
 */
bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
		      unsigned long delay)
{
	return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}
EXPORT_SYMBOL_GPL(mod_delayed_work);
L
Linus Torvalds 已提交
1434

T
Tejun Heo 已提交
1435 1436 1437 1438 1439 1440 1441 1442
/**
 * worker_enter_idle - enter idle state
 * @worker: worker which is entering idle state
 *
 * @worker is entering idle state.  Update stats and idle timer if
 * necessary.
 *
 * LOCKING:
1443
 * spin_lock_irq(pool->lock).
T
Tejun Heo 已提交
1444 1445
 */
static void worker_enter_idle(struct worker *worker)
L
Linus Torvalds 已提交
1446
{
1447
	struct worker_pool *pool = worker->pool;
T
Tejun Heo 已提交
1448

1449 1450 1451 1452
	if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
	    WARN_ON_ONCE(!list_empty(&worker->entry) &&
			 (worker->hentry.next || worker->hentry.pprev)))
		return;
T
Tejun Heo 已提交
1453

1454 1455
	/* can't use worker_set_flags(), also called from start_worker() */
	worker->flags |= WORKER_IDLE;
1456
	pool->nr_idle++;
1457
	worker->last_active = jiffies;
T
Tejun Heo 已提交
1458 1459

	/* idle_list is LIFO */
1460
	list_add(&worker->entry, &pool->idle_list);
1461

1462 1463
	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1464

1465
	/*
1466
	 * Sanity check nr_running.  Because wq_unbind_fn() releases
1467
	 * pool->lock between setting %WORKER_UNBOUND and zapping
1468 1469
	 * nr_running, the warning may trigger spuriously.  Check iff
	 * unbind is not in progress.
1470
	 */
1471
	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
1472
		     pool->nr_workers == pool->nr_idle &&
1473
		     atomic_read(&pool->nr_running));
T
Tejun Heo 已提交
1474 1475 1476 1477 1478 1479 1480 1481 1482
}

/**
 * worker_leave_idle - leave idle state
 * @worker: worker which is leaving idle state
 *
 * @worker is leaving idle state.  Update stats.
 *
 * LOCKING:
1483
 * spin_lock_irq(pool->lock).
T
Tejun Heo 已提交
1484 1485 1486
 */
static void worker_leave_idle(struct worker *worker)
{
1487
	struct worker_pool *pool = worker->pool;
T
Tejun Heo 已提交
1488

1489 1490
	if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
		return;
1491
	worker_clr_flags(worker, WORKER_IDLE);
1492
	pool->nr_idle--;
T
Tejun Heo 已提交
1493 1494 1495
	list_del_init(&worker->entry);
}

1496
/**
1497 1498 1499 1500
 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
 * @pool: target worker_pool
 *
 * Bind %current to the cpu of @pool if it is associated and lock @pool.
1501 1502 1503 1504 1505 1506
 *
 * Works which are scheduled while the cpu is online must at least be
 * scheduled to a worker which is bound to the cpu so that if they are
 * flushed from cpu callbacks while cpu is going down, they are
 * guaranteed to execute on the cpu.
 *
1507
 * This function is to be used by unbound workers and rescuers to bind
1508 1509 1510
 * themselves to the target cpu and may race with cpu going down or
 * coming online.  kthread_bind() can't be used because it may put the
 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1511
 * verbatim as it's best effort and blocking and pool may be
1512 1513
 * [dis]associated in the meantime.
 *
1514
 * This function tries set_cpus_allowed() and locks pool and verifies the
1515
 * binding against %POOL_DISASSOCIATED which is set during
1516 1517 1518
 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
 * enters idle state or fetches works without dropping lock, it can
 * guarantee the scheduling requirement described in the first paragraph.
1519 1520
 *
 * CONTEXT:
1521
 * Might sleep.  Called without any lock but returns with pool->lock
1522 1523 1524
 * held.
 *
 * RETURNS:
1525
 * %true if the associated pool is online (@worker is successfully
1526 1527
 * bound), %false if offline.
 */
1528
static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
1529
__acquires(&pool->lock)
1530 1531
{
	while (true) {
1532
		/*
1533 1534 1535
		 * The following call may fail, succeed or succeed
		 * without actually migrating the task to the cpu if
		 * it races with cpu hotunplug operation.  Verify
1536
		 * against POOL_DISASSOCIATED.
1537
		 */
1538
		if (!(pool->flags & POOL_DISASSOCIATED))
1539
			set_cpus_allowed_ptr(current, get_cpu_mask(pool->cpu));
1540

1541
		spin_lock_irq(&pool->lock);
1542
		if (pool->flags & POOL_DISASSOCIATED)
1543
			return false;
1544
		if (task_cpu(current) == pool->cpu &&
1545
		    cpumask_equal(&current->cpus_allowed,
1546
				  get_cpu_mask(pool->cpu)))
1547
			return true;
1548
		spin_unlock_irq(&pool->lock);
1549

1550 1551 1552 1553 1554 1555
		/*
		 * We've raced with CPU hot[un]plug.  Give it a breather
		 * and retry migration.  cond_resched() is required here;
		 * otherwise, we might deadlock against cpu_stop trying to
		 * bring down the CPU on non-preemptive kernel.
		 */
1556
		cpu_relax();
1557
		cond_resched();
1558 1559 1560
	}
}

1561
/*
1562
 * Rebind an idle @worker to its CPU.  worker_thread() will test
1563
 * list_empty(@worker->entry) before leaving idle and call this function.
1564 1565 1566
 */
static void idle_worker_rebind(struct worker *worker)
{
1567
	/* CPU may go down again inbetween, clear UNBOUND only on success */
1568
	if (worker_maybe_bind_and_lock(worker->pool))
1569
		worker_clr_flags(worker, WORKER_UNBOUND);
1570

1571 1572
	/* rebind complete, become available again */
	list_add(&worker->entry, &worker->pool->idle_list);
1573
	spin_unlock_irq(&worker->pool->lock);
1574 1575
}

1576
/*
1577
 * Function for @worker->rebind.work used to rebind unbound busy workers to
1578 1579 1580
 * the associated cpu which is coming back online.  This is scheduled by
 * cpu up but can race with other cpu hotplug operations and may be
 * executed twice without intervening cpu down.
1581
 */
1582
static void busy_worker_rebind_fn(struct work_struct *work)
1583 1584 1585
{
	struct worker *worker = container_of(work, struct worker, rebind_work);

1586
	if (worker_maybe_bind_and_lock(worker->pool))
1587
		worker_clr_flags(worker, WORKER_UNBOUND);
1588

1589
	spin_unlock_irq(&worker->pool->lock);
1590 1591
}

1592
/**
1593 1594
 * rebind_workers - rebind all workers of a pool to the associated CPU
 * @pool: pool of interest
1595
 *
1596
 * @pool->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
1597 1598
 * is different for idle and busy ones.
 *
1599 1600 1601 1602
 * Idle ones will be removed from the idle_list and woken up.  They will
 * add themselves back after completing rebind.  This ensures that the
 * idle_list doesn't contain any unbound workers when re-bound busy workers
 * try to perform local wake-ups for concurrency management.
1603
 *
1604 1605 1606 1607
 * Busy workers can rebind after they finish their current work items.
 * Queueing the rebind work item at the head of the scheduled list is
 * enough.  Note that nr_running will be properly bumped as busy workers
 * rebind.
1608
 *
1609 1610 1611 1612
 * On return, all non-manager workers are scheduled for rebind - see
 * manage_workers() for the manager special case.  Any idle worker
 * including the manager will not appear on @idle_list until rebind is
 * complete, making local wake-ups safe.
1613
 */
1614
static void rebind_workers(struct worker_pool *pool)
1615
{
1616
	struct worker *worker, *n;
1617 1618
	int i;

1619 1620
	lockdep_assert_held(&pool->assoc_mutex);
	lockdep_assert_held(&pool->lock);
1621

1622
	/* dequeue and kick idle ones */
1623 1624 1625 1626 1627 1628
	list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
		/*
		 * idle workers should be off @pool->idle_list until rebind
		 * is complete to avoid receiving premature local wake-ups.
		 */
		list_del_init(&worker->entry);
1629

1630 1631 1632 1633 1634 1635
		/*
		 * worker_thread() will see the above dequeuing and call
		 * idle_worker_rebind().
		 */
		wake_up_process(worker->task);
	}
1636

1637
	/* rebind busy workers */
1638
	for_each_busy_worker(worker, i, pool) {
1639 1640
		struct work_struct *rebind_work = &worker->rebind_work;
		struct workqueue_struct *wq;
1641

1642 1643 1644
		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
				     work_data_bits(rebind_work)))
			continue;
1645

1646
		debug_work_activate(rebind_work);
1647

1648 1649
		/*
		 * wq doesn't really matter but let's keep @worker->pool
1650
		 * and @pwq->pool consistent for sanity.
1651 1652 1653 1654 1655 1656
		 */
		if (std_worker_pool_pri(worker->pool))
			wq = system_highpri_wq;
		else
			wq = system_wq;

1657
		insert_work(get_pwq(pool->cpu, wq), rebind_work,
1658 1659
			    worker->scheduled.next,
			    work_color_to_flags(WORK_NO_COLOR));
1660
	}
1661 1662
}

static struct worker *alloc_worker(void)
{
	struct worker *worker;

	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
	if (worker) {
		INIT_LIST_HEAD(&worker->entry);
1670
		INIT_LIST_HEAD(&worker->scheduled);
1671
		INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1672 1673
		/* on creation a worker is in !idle && prep state */
		worker->flags = WORKER_PREP;
	}
	return worker;
}

/**
 * create_worker - create a new workqueue worker
1680
 * @pool: pool the new worker will belong to
T
Tejun Heo 已提交
1681
 *
1682
 * Create a new worker which is bound to @pool.  The returned worker
T
Tejun Heo 已提交
1683 1684 1685 1686 1687 1688 1689 1690 1691
 * can be started by calling start_worker() or destroyed using
 * destroy_worker().
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
 *
 * RETURNS:
 * Pointer to the newly created worker.
 */
1692
static struct worker *create_worker(struct worker_pool *pool)
T
Tejun Heo 已提交
1693
{
1694
	const char *pri = std_worker_pool_pri(pool) ? "H" : "";
T
Tejun Heo 已提交
1695
	struct worker *worker = NULL;
1696
	int id = -1;
T
Tejun Heo 已提交
1697

1698
	spin_lock_irq(&pool->lock);
1699
	while (ida_get_new(&pool->worker_ida, &id)) {
1700
		spin_unlock_irq(&pool->lock);
1701
		if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
T
Tejun Heo 已提交
1702
			goto fail;
1703
		spin_lock_irq(&pool->lock);
T
Tejun Heo 已提交
1704
	}
1705
	spin_unlock_irq(&pool->lock);
T
Tejun Heo 已提交
1706 1707 1708 1709 1710

	worker = alloc_worker();
	if (!worker)
		goto fail;

1711
	worker->pool = pool;
T
Tejun Heo 已提交
1712 1713
	worker->id = id;

1714
	if (pool->cpu != WORK_CPU_UNBOUND)
1715
		worker->task = kthread_create_on_node(worker_thread,
1716 1717
					worker, cpu_to_node(pool->cpu),
					"kworker/%u:%d%s", pool->cpu, id, pri);
1718 1719
	else
		worker->task = kthread_create(worker_thread, worker,
1720
					      "kworker/u:%d%s", id, pri);
T
Tejun Heo 已提交
1721 1722 1723
	if (IS_ERR(worker->task))
		goto fail;

1724
	if (std_worker_pool_pri(pool))
1725 1726
		set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);

1727
	/*
1728
	 * Determine CPU binding of the new worker depending on
1729
	 * %POOL_DISASSOCIATED.  The caller is responsible for ensuring the
1730 1731 1732 1733 1734
	 * flag remains stable across this function.  See the comments
	 * above the flag definition for details.
	 *
	 * As an unbound worker may later become a regular one if CPU comes
	 * online, make sure every worker has %PF_THREAD_BOUND set.
1735
	 */
1736
	if (!(pool->flags & POOL_DISASSOCIATED)) {
1737
		kthread_bind(worker->task, pool->cpu);
1738
	} else {
1739
		worker->task->flags |= PF_THREAD_BOUND;
1740
		worker->flags |= WORKER_UNBOUND;
1741
	}
T
Tejun Heo 已提交
1742 1743 1744 1745

	return worker;
fail:
	if (id >= 0) {
1746
		spin_lock_irq(&pool->lock);
1747
		ida_remove(&pool->worker_ida, id);
1748
		spin_unlock_irq(&pool->lock);
T
Tejun Heo 已提交
1749 1750 1751 1752 1753 1754 1755 1756 1757
	}
	kfree(worker);
	return NULL;
}

/**
 * start_worker - start a newly created worker
 * @worker: worker to start
 *
1758
 * Make the pool aware of @worker and start it.
T
Tejun Heo 已提交
1759 1760
 *
 * CONTEXT:
1761
 * spin_lock_irq(pool->lock).
T
Tejun Heo 已提交
1762 1763 1764
 */
static void start_worker(struct worker *worker)
{
1765
	worker->flags |= WORKER_STARTED;
1766
	worker->pool->nr_workers++;
T
Tejun Heo 已提交
1767
	worker_enter_idle(worker);
T
Tejun Heo 已提交
1768 1769 1770 1771 1772 1773 1774
	wake_up_process(worker->task);
}

/**
 * destroy_worker - destroy a workqueue worker
 * @worker: worker to be destroyed
 *
1775
 * Destroy @worker and adjust @pool stats accordingly.
T
Tejun Heo 已提交
1776 1777
 *
 * CONTEXT:
1778
 * spin_lock_irq(pool->lock) which is released and regrabbed.
T
Tejun Heo 已提交
1779 1780 1781
 */
static void destroy_worker(struct worker *worker)
{
1782
	struct worker_pool *pool = worker->pool;
T
Tejun Heo 已提交
1783 1784 1785
	int id = worker->id;

	/* sanity check frenzy */
1786 1787 1788
	if (WARN_ON(worker->current_work) ||
	    WARN_ON(!list_empty(&worker->scheduled)))
		return;
T
Tejun Heo 已提交
1789

T
Tejun Heo 已提交
1790
	if (worker->flags & WORKER_STARTED)
1791
		pool->nr_workers--;
T
Tejun Heo 已提交
1792
	if (worker->flags & WORKER_IDLE)
1793
		pool->nr_idle--;
T
Tejun Heo 已提交
1794 1795

	list_del_init(&worker->entry);
1796
	worker->flags |= WORKER_DIE;
T
Tejun Heo 已提交
1797

1798
	spin_unlock_irq(&pool->lock);
T
Tejun Heo 已提交
1799

T
Tejun Heo 已提交
1800 1801 1802
	kthread_stop(worker->task);
	kfree(worker);

1803
	spin_lock_irq(&pool->lock);
1804
	ida_remove(&pool->worker_ida, id);
T
Tejun Heo 已提交
1805 1806
}

1807
static void idle_worker_timeout(unsigned long __pool)
1808
{
1809
	struct worker_pool *pool = (void *)__pool;
1810

1811
	spin_lock_irq(&pool->lock);
1812

1813
	if (too_many_workers(pool)) {
1814 1815 1816 1817
		struct worker *worker;
		unsigned long expires;

		/* idle_list is kept in LIFO order, check the last one */
1818
		worker = list_entry(pool->idle_list.prev, struct worker, entry);
1819 1820 1821
		expires = worker->last_active + IDLE_WORKER_TIMEOUT;

		if (time_before(jiffies, expires))
1822
			mod_timer(&pool->idle_timer, expires);
1823 1824
		else {
			/* it's been idle for too long, wake up manager */
1825
			pool->flags |= POOL_MANAGE_WORKERS;
1826
			wake_up_worker(pool);
1827
		}
1828 1829
	}

1830
	spin_unlock_irq(&pool->lock);
1831
}
1832

1833
static void send_mayday(struct work_struct *work)
1834
{
1835 1836
	struct pool_workqueue *pwq = get_work_pwq(work);
	struct workqueue_struct *wq = pwq->wq;
1837 1838

	lockdep_assert_held(&workqueue_lock);
1839 1840

	if (!(wq->flags & WQ_RESCUER))
1841
		return;
1842 1843

	/* mayday mayday mayday */
1844 1845
	if (list_empty(&pwq->mayday_node)) {
		list_add_tail(&pwq->mayday_node, &wq->maydays);
1846
		wake_up_process(wq->rescuer->task);
1847
	}
1848 1849
}

1850
static void pool_mayday_timeout(unsigned long __pool)
1851
{
1852
	struct worker_pool *pool = (void *)__pool;
1853 1854
	struct work_struct *work;

1855 1856
	spin_lock_irq(&workqueue_lock);		/* for wq->maydays */
	spin_lock(&pool->lock);
1857

1858
	if (need_to_create_worker(pool)) {
1859 1860 1861 1862 1863 1864
		/*
		 * We've been trying to create a new worker but
		 * haven't been successful.  We might be hitting an
		 * allocation deadlock.  Send distress signals to
		 * rescuers.
		 */
1865
		list_for_each_entry(work, &pool->worklist, entry)
1866
			send_mayday(work);
	}
1868

1869 1870
	spin_unlock(&pool->lock);
	spin_unlock_irq(&workqueue_lock);
1871

1872
	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
L
Linus Torvalds 已提交
1873 1874
}

1875 1876
/**
 * maybe_create_worker - create a new worker if necessary
1877
 * @pool: pool to create a new worker for
1878
 *
1879
 * Create a new worker for @pool if necessary.  @pool is guaranteed to
1880 1881
 * have at least one idle worker on return from this function.  If
 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1882
 * sent to all rescuers with works scheduled on @pool to resolve
1883 1884 1885 1886 1887 1888
 * possible allocation deadlock.
 *
 * On return, need_to_create_worker() is guaranteed to be false and
 * may_start_working() true.
 *
 * LOCKING:
1889
 * spin_lock_irq(pool->lock) which may be released and regrabbed
1890 1891 1892 1893
 * multiple times.  Does GFP_KERNEL allocations.  Called only from
 * manager.
 *
 * RETURNS:
1894
 * false if no action was taken and pool->lock stayed locked, true
1895 1896
 * otherwise.
 */
1897
static bool maybe_create_worker(struct worker_pool *pool)
1898 1899
__releases(&pool->lock)
__acquires(&pool->lock)
{
1901
	if (!need_to_create_worker(pool))
1902 1903
		return false;
restart:
1904
	spin_unlock_irq(&pool->lock);
1905

1906
	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1907
	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1908 1909 1910 1911

	while (true) {
		struct worker *worker;

1912
		worker = create_worker(pool);
1913
		if (worker) {
1914
			del_timer_sync(&pool->mayday_timer);
1915
			spin_lock_irq(&pool->lock);
1916
			start_worker(worker);
1917 1918
			if (WARN_ON_ONCE(need_to_create_worker(pool)))
				goto restart;
1919 1920 1921
			return true;
		}

1922
		if (!need_to_create_worker(pool))
1923
			break;
L
Linus Torvalds 已提交
1924

1925 1926
		__set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(CREATE_COOLDOWN);
1927

1928
		if (!need_to_create_worker(pool))
1929 1930 1931
			break;
	}

1932
	del_timer_sync(&pool->mayday_timer);
1933
	spin_lock_irq(&pool->lock);
1934
	if (need_to_create_worker(pool))
1935 1936 1937 1938 1939 1940
		goto restart;
	return true;
}

/**
 * maybe_destroy_worker - destroy workers which have been idle for a while
1941
 * @pool: pool to destroy workers for
1942
 *
1943
 * Destroy @pool workers which have been idle for longer than
1944 1945 1946
 * IDLE_WORKER_TIMEOUT.
 *
 * LOCKING:
1947
 * spin_lock_irq(pool->lock) which may be released and regrabbed
1948 1949 1950
 * multiple times.  Called only from manager.
 *
 * RETURNS:
1951
 * false if no action was taken and pool->lock stayed locked, true
1952 1953
 * otherwise.
 */
1954
static bool maybe_destroy_workers(struct worker_pool *pool)
1955 1956
{
	bool ret = false;
L
Linus Torvalds 已提交
1957

1958
	while (too_many_workers(pool)) {
1959 1960
		struct worker *worker;
		unsigned long expires;
1961

1962
		worker = list_entry(pool->idle_list.prev, struct worker, entry);
1963
		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1964

1965
		if (time_before(jiffies, expires)) {
1966
			mod_timer(&pool->idle_timer, expires);
1967
			break;
1968
		}
L
Linus Torvalds 已提交
1969

1970 1971
		destroy_worker(worker);
		ret = true;
L
Linus Torvalds 已提交
1972
	}
1973

1974
	return ret;
1975 1976
}

/**
 * manage_workers - manage worker pool
 * @worker: self
 *
 * Assume the manager role and manage the worker pool @worker belongs
 * to.  At any given time, there can be only zero or one manager per
 * pool.  The exclusion is handled automatically by this function.
 *
 * The caller can safely start processing works on false return.  On
 * true return, it's guaranteed that need_to_create_worker() is false
 * and may_start_working() is true.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.
 *
 * RETURNS:
 * %false if no action was taken and pool->lock stayed locked, %true
 * otherwise (pool->lock may have been released and regrabbed).
 */
static bool manage_workers(struct worker *worker)
{
1999
	struct worker_pool *pool = worker->pool;
2000
	bool ret = false;
2001

2002
	if (pool->flags & POOL_MANAGING_WORKERS)
2003
		return ret;
2004

2005
	pool->flags |= POOL_MANAGING_WORKERS;
2006

2007 2008 2009 2010 2011 2012
	/*
	 * To simplify both worker management and CPU hotplug, hold off
	 * management while hotplug is in progress.  CPU hotplug path can't
	 * grab %POOL_MANAGING_WORKERS to achieve this because that can
	 * lead to idle worker depletion (all become busy thinking someone
	 * else is managing) which in turn can result in deadlock under
2013
	 * extreme circumstances.  Use @pool->assoc_mutex to synchronize
2014 2015
	 * manager against CPU hotplug.
	 *
2016
	 * assoc_mutex would always be free unless CPU hotplug is in
2017
	 * progress.  trylock first without dropping @pool->lock.
2018
	 */
2019
	if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
2020
		spin_unlock_irq(&pool->lock);
2021
		mutex_lock(&pool->assoc_mutex);
2022 2023
		/*
		 * CPU hotplug could have happened while we were waiting
2024
		 * for assoc_mutex.  Hotplug itself can't handle us
2025
		 * because manager isn't either on idle or busy list, and
2026
		 * @pool's state and ours could have deviated.
2027
		 *
2028
		 * As hotplug is now excluded via assoc_mutex, we can
2029
		 * simply try to bind.  It will succeed or fail depending
2030
		 * on @pool's current state.  Try it and adjust
2031 2032
		 * %WORKER_UNBOUND accordingly.
		 */
2033
		if (worker_maybe_bind_and_lock(pool))
2034 2035 2036
			worker->flags &= ~WORKER_UNBOUND;
		else
			worker->flags |= WORKER_UNBOUND;
2037

2038 2039
		ret = true;
	}
2040

2041
	pool->flags &= ~POOL_MANAGE_WORKERS;
2042 2043

	/*
2044 2045
	 * Destroy and then create so that may_start_working() is true
	 * on return.
2046
	 */
2047 2048
	ret |= maybe_destroy_workers(pool);
	ret |= maybe_create_worker(pool);
2049

2050
	pool->flags &= ~POOL_MANAGING_WORKERS;
2051
	mutex_unlock(&pool->assoc_mutex);
2052
	return ret;
2053 2054
}

/**
 * process_one_work - process single work
 * @worker: self
 * @work: work to process
 *
 * Process @work.  This function contains all the logic necessary to
 * process a single work including synchronization against and
 * interaction with other workers on the same cpu, queueing and
 * flushing.  As long as context requirement is met, any worker can
 * call this function to process a work.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock) which is released and regrabbed.
 */
static void process_one_work(struct worker *worker, struct work_struct *work)
2070 2071
__releases(&pool->lock)
__acquires(&pool->lock)
2072
{
2073
	struct pool_workqueue *pwq = get_work_pwq(work);
2074
	struct worker_pool *pool = worker->pool;
2075
	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
2076
	int work_color;
2077
	struct worker *collision;
2078 2079 2080 2081 2082 2083 2084 2085
#ifdef CONFIG_LOCKDEP
	/*
	 * It is permissible to free the struct work_struct from
	 * inside the function that is called from it, this we need to
	 * take into account for lockdep too.  To avoid bogus "held
	 * lock freed" warnings as well as problems when looking into
	 * work->lockdep_map, make a copy and use that here.
	 */
2086 2087 2088
	struct lockdep_map lockdep_map;

	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
2089
#endif
2090 2091 2092
	/*
	 * Ensure we're on the correct CPU.  DISASSOCIATED test is
	 * necessary to avoid spurious warnings from rescuers servicing the
2093
	 * unbound or a disassociated pool.
2094
	 */
2095
	WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2096
		     !(pool->flags & POOL_DISASSOCIATED) &&
2097
		     raw_smp_processor_id() != pool->cpu);
2098

2099 2100 2101 2102 2103 2104
	/*
	 * A single work shouldn't be executed concurrently by
	 * multiple workers on a single cpu.  Check whether anyone is
	 * already processing the work.  If so, defer the work to the
	 * currently executing one.
	 */
2105
	collision = find_worker_executing_work(pool, work);
2106 2107 2108 2109 2110
	if (unlikely(collision)) {
		move_linked_works(work, &collision->scheduled, NULL);
		return;
	}

2111
	/* claim and dequeue */
2112
	debug_work_deactivate(work);
2113
	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
	worker->current_work = work;
2115
	worker->current_func = work->func;
2116
	worker->current_pwq = pwq;
2117
	work_color = get_work_color(work);
2118

2119 2120
	list_del_init(&work->entry);

2121 2122 2123 2124 2125 2126 2127
	/*
	 * CPU intensive works don't participate in concurrency
	 * management.  They're the scheduler's responsibility.
	 */
	if (unlikely(cpu_intensive))
		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);

2128
	/*
2129
	 * Unbound pool isn't concurrency managed and work items should be
2130 2131
	 * executed ASAP.  Wake up another worker if necessary.
	 */
2132 2133
	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
		wake_up_worker(pool);
2134

2135
	/*
2136
	 * Record the last pool and clear PENDING which should be the last
2137
	 * update to @work.  Also, do this inside @pool->lock so that
2138 2139
	 * PENDING and queued state changes happen together while IRQ is
	 * disabled.
2140
	 */
2141
	set_work_pool_and_clear_pending(work, pool->id);
2142

2143
	spin_unlock_irq(&pool->lock);
2144

2145
	lock_map_acquire_read(&pwq->wq->lockdep_map);
2146
	lock_map_acquire(&lockdep_map);
2147
	trace_workqueue_execute_start(work);
2148
	worker->current_func(work);
2149 2150 2151 2152 2153
	/*
	 * While we must be careful to not use "work" after this, the trace
	 * point will only record its address.
	 */
	trace_workqueue_execute_end(work);
2154
	lock_map_release(&lockdep_map);
2155
	lock_map_release(&pwq->wq->lockdep_map);
2156 2157

	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
V
Valentin Ilie 已提交
2158 2159
		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
		       "     last function: %pf\n",
2160 2161
		       current->comm, preempt_count(), task_pid_nr(current),
		       worker->current_func);
2162 2163 2164 2165
		debug_show_held_locks(current);
		dump_stack();
	}

2166
	spin_lock_irq(&pool->lock);
2167

2168 2169 2170 2171
	/* clear cpu intensive status */
	if (unlikely(cpu_intensive))
		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

2172
	/* we're done with it, release */
2173
	hash_del(&worker->hentry);
T
Tejun Heo 已提交
2174
	worker->current_work = NULL;
2175
	worker->current_func = NULL;
2176 2177
	worker->current_pwq = NULL;
	pwq_dec_nr_in_flight(pwq, work_color);
2178 2179
}

2180 2181 2182 2183 2184 2185 2186 2187 2188
/**
 * process_scheduled_works - process scheduled works
 * @worker: self
 *
 * Process all scheduled works.  Please note that the scheduled list
 * may change while processing a work, so this function repeatedly
 * fetches a work from the top and executes it.
 *
 * CONTEXT:
2189
 * spin_lock_irq(pool->lock) which may be released and regrabbed
2190 2191 2192
 * multiple times.
 */
static void process_scheduled_works(struct worker *worker)
L
Linus Torvalds 已提交
2193
{
2194 2195
	while (!list_empty(&worker->scheduled)) {
		struct work_struct *work = list_first_entry(&worker->scheduled,
L
Linus Torvalds 已提交
2196
						struct work_struct, entry);
T
Tejun Heo 已提交
2197
		process_one_work(worker, work);
L
Linus Torvalds 已提交
2198 2199 2200
	}
}

/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The worker thread function.  There are NR_CPU_WORKER_POOLS dynamic pools
 * of these per CPU.  These workers process all works regardless of
 * their specific target workqueue.  The only exception is works which
 * belong to workqueues with a rescuer, which will be explained in
 * rescuer_thread().
 */
static int worker_thread(void *__worker)
L
Linus Torvalds 已提交
2212
{
T
Tejun Heo 已提交
2213
	struct worker *worker = __worker;
2214
	struct worker_pool *pool = worker->pool;
L
Linus Torvalds 已提交
2215

2216 2217
	/* tell the scheduler that this is a workqueue worker */
	worker->task->flags |= PF_WQ_WORKER;
T
Tejun Heo 已提交
2218
woke_up:
2219
	spin_lock_irq(&pool->lock);
L
Linus Torvalds 已提交
2220

2221 2222
	/* we are off idle list if destruction or rebind is requested */
	if (unlikely(list_empty(&worker->entry))) {
2223
		spin_unlock_irq(&pool->lock);
2224

2225
		/* if DIE is set, destruction is requested */
2226 2227 2228 2229 2230
		if (worker->flags & WORKER_DIE) {
			worker->task->flags &= ~PF_WQ_WORKER;
			return 0;
		}

2231
		/* otherwise, rebind */
2232 2233
		idle_worker_rebind(worker);
		goto woke_up;
T
Tejun Heo 已提交
2234
	}
2235

T
Tejun Heo 已提交
2236
	worker_leave_idle(worker);
2237
recheck:
2238
	/* no more worker necessary? */
2239
	if (!need_more_worker(pool))
2240 2241 2242
		goto sleep;

	/* do we need to manage? */
2243
	if (unlikely(!may_start_working(pool)) && manage_workers(worker))
2244 2245
		goto recheck;

T
Tejun Heo 已提交
2246 2247 2248 2249 2250
	/*
	 * ->scheduled list can only be filled while a worker is
	 * preparing to process a work or actually processing it.
	 * Make sure nobody diddled with it while I was sleeping.
	 */
2251
	WARN_ON_ONCE(!list_empty(&worker->scheduled));
T
Tejun Heo 已提交
2252

2253 2254 2255 2256 2257 2258 2259 2260
	/*
	 * When control reaches this point, we're guaranteed to have
	 * at least one idle worker or that someone else has already
	 * assumed the manager role.
	 */
	worker_clr_flags(worker, WORKER_PREP);

	do {
T
Tejun Heo 已提交
2261
		struct work_struct *work =
2262
			list_first_entry(&pool->worklist,
T
Tejun Heo 已提交
2263 2264 2265 2266 2267 2268
					 struct work_struct, entry);

		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
			/* optimization path, not strictly necessary */
			process_one_work(worker, work);
			if (unlikely(!list_empty(&worker->scheduled)))
2269
				process_scheduled_works(worker);
T
Tejun Heo 已提交
2270 2271 2272
		} else {
			move_linked_works(work, &worker->scheduled, NULL);
			process_scheduled_works(worker);
2273
		}
2274
	} while (keep_working(pool));
2275 2276

	worker_set_flags(worker, WORKER_PREP, false);
2277
sleep:
2278
	if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
2279
		goto recheck;
2280

T
Tejun Heo 已提交
2281
	/*
2282 2283 2284 2285 2286
	 * pool->lock is held and there's no work to process and no need to
	 * manage, sleep.  Workers are woken up only while holding
	 * pool->lock or from local cpu, so setting the current state
	 * before releasing pool->lock is enough to prevent losing any
	 * event.
T
Tejun Heo 已提交
2287 2288 2289
	 */
	worker_enter_idle(worker);
	__set_current_state(TASK_INTERRUPTIBLE);
2290
	spin_unlock_irq(&pool->lock);
T
Tejun Heo 已提交
2291 2292
	schedule();
	goto woke_up;
L
Linus Torvalds 已提交
2293 2294
}

/**
 * rescuer_thread - the rescuer thread function
 * @__rescuer: self
 *
 * Workqueue rescuer thread function.  There's one rescuer for each
 * workqueue which has WQ_RESCUER set.
 *
 * Regular work processing on a pool may block trying to create a new
 * worker which uses GFP_KERNEL allocation which has a slight chance of
 * developing into deadlock if some works currently on the same queue
 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
 * the problem rescuer solves.
 *
 * When such a condition is possible, the pool summons rescuers of all
 * workqueues which have works queued on the pool and lets them process
 * those works so that forward progress can be guaranteed.
 *
 * This should happen rarely.
 */
static int rescuer_thread(void *__rescuer)
2315
{
2316 2317
	struct worker *rescuer = __rescuer;
	struct workqueue_struct *wq = rescuer->rescue_wq;
2318 2319 2320
	struct list_head *scheduled = &rescuer->scheduled;

	set_user_nice(current, RESCUER_NICE_LEVEL);
2321 2322 2323 2324 2325 2326

	/*
	 * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
	 * doesn't participate in concurrency management.
	 */
	rescuer->task->flags |= PF_WQ_WORKER;
2327 2328 2329
repeat:
	set_current_state(TASK_INTERRUPTIBLE);

2330 2331
	if (kthread_should_stop()) {
		__set_current_state(TASK_RUNNING);
2332
		rescuer->task->flags &= ~PF_WQ_WORKER;
2333
		return 0;
2334
	}
2335

2336 2337 2338 2339 2340 2341
	/* see whether any pwq is asking for help */
	spin_lock_irq(&workqueue_lock);

	while (!list_empty(&wq->maydays)) {
		struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
					struct pool_workqueue, mayday_node);
2342
		struct worker_pool *pool = pwq->pool;
2343 2344 2345
		struct work_struct *work, *n;

		__set_current_state(TASK_RUNNING);
2346 2347 2348
		list_del_init(&pwq->mayday_node);

		spin_unlock_irq(&workqueue_lock);
2349 2350

		/* migrate to the target cpu if possible */
2351
		worker_maybe_bind_and_lock(pool);
2352
		rescuer->pool = pool;
2353 2354 2355 2356 2357

		/*
		 * Slurp in all works issued via this workqueue and
		 * process'em.
		 */
2358
		WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
2359
		list_for_each_entry_safe(work, n, &pool->worklist, entry)
2360
			if (get_work_pwq(work) == pwq)
2361 2362 2363
				move_linked_works(work, scheduled, &n);

		process_scheduled_works(rescuer);
2364 2365

		/*
2366
		 * Leave this pool.  If keep_working() is %true, notify a
2367 2368 2369
		 * regular worker; otherwise, we end up with 0 concurrency
		 * and stalling the execution.
		 */
2370 2371
		if (keep_working(pool))
			wake_up_worker(pool);
2372

2373
		rescuer->pool = NULL;
2374 2375
		spin_unlock(&pool->lock);
		spin_lock(&workqueue_lock);
2376 2377
	}

2378 2379
	spin_unlock_irq(&workqueue_lock);

2380 2381
	/* rescuers should never participate in concurrency management */
	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2382 2383
	schedule();
	goto repeat;
}

struct wq_barrier {
	struct work_struct	work;
	struct completion	done;
};

static void wq_barrier_func(struct work_struct *work)
{
	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
	complete(&barr->done);
}

T
Tejun Heo 已提交
2397 2398
/**
 * insert_wq_barrier - insert a barrier work
2399
 * @pwq: pwq to insert barrier into
T
Tejun Heo 已提交
2400
 * @barr: wq_barrier to insert
2401 2402
 * @target: target work to attach @barr to
 * @worker: worker currently executing @target, NULL if @target is not executing
T
Tejun Heo 已提交
2403
 *
2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415
 * @barr is linked to @target such that @barr is completed only after
 * @target finishes execution.  Please note that the ordering
 * guarantee is observed only with respect to @target and on the local
 * cpu.
 *
 * Currently, a queued barrier can't be canceled.  This is because
 * try_to_grab_pending() can't determine whether the work to be
 * grabbed is at the head of the queue and thus can't clear LINKED
 * flag of the previous work while there must be a valid next work
 * after a work with LINKED flag set.
 *
 * Note that when @worker is non-NULL, @target may be modified
2416
 * underneath us, so we can't reliably determine pwq from @target.
T
Tejun Heo 已提交
2417 2418
 *
 * CONTEXT:
2419
 * spin_lock_irq(pool->lock).
T
Tejun Heo 已提交
2420
 */
2421
static void insert_wq_barrier(struct pool_workqueue *pwq,
2422 2423
			      struct wq_barrier *barr,
			      struct work_struct *target, struct worker *worker)
O
Oleg Nesterov 已提交
2424
{
2425 2426 2427
	struct list_head *head;
	unsigned int linked = 0;

2428
	/*
2429
	 * debugobject calls are safe here even with pool->lock locked
2430 2431 2432 2433
	 * as we know for sure that this will not trigger any of the
	 * checks and call back into the fixup functions where we
	 * might deadlock.
	 */
	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2435
	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
O
Oleg Nesterov 已提交
2436
	init_completion(&barr->done);
2437

2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452
	/*
	 * If @target is currently being executed, schedule the
	 * barrier to the worker; otherwise, put it after @target.
	 */
	if (worker)
		head = worker->scheduled.next;
	else {
		unsigned long *bits = work_data_bits(target);

		head = target->entry.next;
		/* there can already be other linked works, inherit and set */
		linked = *bits & WORK_STRUCT_LINKED;
		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
	}

2453
	debug_work_activate(&barr->work);
2454
	insert_work(pwq, &barr->work, head,
2455
		    work_color_to_flags(WORK_NO_COLOR) | linked);
O
Oleg Nesterov 已提交
2456 2457
}

2458
/**
2459
 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
2460 2461 2462 2463
 * @wq: workqueue being flushed
 * @flush_color: new flush color, < 0 for no-op
 * @work_color: new work color, < 0 for no-op
 *
2464
 * Prepare pwqs for workqueue flushing.
2465
 *
2466 2467 2468 2469 2470
 * If @flush_color is non-negative, flush_color on all pwqs should be
 * -1.  If no pwq has in-flight commands at the specified color, all
 * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
 * has in flight commands, its pwq->flush_color is set to
 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
2471 2472 2473 2474 2475 2476 2477
 * wakeup logic is armed and %true is returned.
 *
 * The caller should have initialized @wq->first_flusher prior to
 * calling this function with non-negative @flush_color.  If
 * @flush_color is negative, no flush color update is done and %false
 * is returned.
 *
2478
 * If @work_color is non-negative, all pwqs should have the same
2479 2480 2481 2482 2483 2484 2485 2486 2487 2488
 * work_color which is previous to @work_color and all will be
 * advanced to @work_color.
 *
 * CONTEXT:
 * mutex_lock(wq->flush_mutex).
 *
 * RETURNS:
 * %true if @flush_color >= 0 and there's something to flush.  %false
 * otherwise.
 */
2489
static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2490
				      int flush_color, int work_color)
L
Linus Torvalds 已提交
2491
{
2492
	bool wait = false;
2493
	struct pool_workqueue *pwq;
L
Linus Torvalds 已提交
2494

2495
	if (flush_color >= 0) {
2496
		WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
2497
		atomic_set(&wq->nr_pwqs_to_flush, 1);
L
Linus Torvalds 已提交
2498
	}
2499

2500
	for_each_pwq(pwq, wq) {
2501
		struct worker_pool *pool = pwq->pool;
O
Oleg Nesterov 已提交
2502

2503
		spin_lock_irq(&pool->lock);
2504

2505
		if (flush_color >= 0) {
2506
			WARN_ON_ONCE(pwq->flush_color != -1);
O
Oleg Nesterov 已提交
2507

2508 2509 2510
			if (pwq->nr_in_flight[flush_color]) {
				pwq->flush_color = flush_color;
				atomic_inc(&wq->nr_pwqs_to_flush);
2511 2512 2513
				wait = true;
			}
		}
L
Linus Torvalds 已提交
2514

2515
		if (work_color >= 0) {
2516
			WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
2517
			pwq->work_color = work_color;
2518
		}
L
Linus Torvalds 已提交
2519

2520
		spin_unlock_irq(&pool->lock);
L
Linus Torvalds 已提交
2521
	}
2522

2523
	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
2524
		complete(&wq->first_flusher->done);
2525

2526
	return wait;
L
Linus Torvalds 已提交
2527 2528
}

2529
/**
L
Linus Torvalds 已提交
2530
 * flush_workqueue - ensure that any scheduled work has run to completion.
2531
 * @wq: workqueue to flush
L
Linus Torvalds 已提交
2532 2533 2534 2535
 *
 * Forces execution of the workqueue and blocks until its completion.
 * This is typically used in driver shutdown handlers.
 *
O
Oleg Nesterov 已提交
2536 2537
 * We sleep until all works which were queued on entry have been handled,
 * but we are not livelocked by new incoming ones.
L
Linus Torvalds 已提交
2538
 */
2539
void flush_workqueue(struct workqueue_struct *wq)
L
Linus Torvalds 已提交
2540
{
2541 2542 2543 2544 2545 2546
	struct wq_flusher this_flusher = {
		.list = LIST_HEAD_INIT(this_flusher.list),
		.flush_color = -1,
		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
	};
	int next_color;
L
Linus Torvalds 已提交
2547

2548 2549
	lock_map_acquire(&wq->lockdep_map);
	lock_map_release(&wq->lockdep_map);
2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563

	mutex_lock(&wq->flush_mutex);

	/*
	 * Start-to-wait phase
	 */
	next_color = work_next_color(wq->work_color);

	if (next_color != wq->flush_color) {
		/*
		 * Color space is not full.  The current work_color
		 * becomes our flush_color and work_color is advanced
		 * by one.
		 */
2564
		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
2565 2566 2567 2568 2569
		this_flusher.flush_color = wq->work_color;
		wq->work_color = next_color;

		if (!wq->first_flusher) {
			/* no flush in progress, become the first flusher */
2570
			WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2571 2572 2573

			wq->first_flusher = &this_flusher;

2574
			if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
2575 2576 2577 2578 2579 2580 2581 2582
						       wq->work_color)) {
				/* nothing to flush, done */
				wq->flush_color = next_color;
				wq->first_flusher = NULL;
				goto out_unlock;
			}
		} else {
			/* wait in queue */
2583
			WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
2584
			list_add_tail(&this_flusher.list, &wq->flusher_queue);
2585
			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610
		}
	} else {
		/*
		 * Oops, color space is full, wait on overflow queue.
		 * The next flush completion will assign us
		 * flush_color and transfer to flusher_queue.
		 */
		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
	}

	mutex_unlock(&wq->flush_mutex);

	wait_for_completion(&this_flusher.done);

	/*
	 * Wake-up-and-cascade phase
	 *
	 * First flushers are responsible for cascading flushes and
	 * handling overflow.  Non-first flushers can simply return.
	 */
	if (wq->first_flusher != &this_flusher)
		return;

	mutex_lock(&wq->flush_mutex);

2611 2612 2613 2614
	/* we might have raced, check again with mutex held */
	if (wq->first_flusher != &this_flusher)
		goto out_unlock;

2615 2616
	wq->first_flusher = NULL;

2617 2618
	WARN_ON_ONCE(!list_empty(&this_flusher.list));
	WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630

	while (true) {
		struct wq_flusher *next, *tmp;

		/* complete all the flushers sharing the current flush color */
		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
			if (next->flush_color != wq->flush_color)
				break;
			list_del_init(&next->list);
			complete(&next->done);
		}

2631 2632
		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
			     wq->flush_color != work_next_color(wq->work_color));
2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651

		/* this flush_color is finished, advance by one */
		wq->flush_color = work_next_color(wq->flush_color);

		/* one color has been freed, handle overflow queue */
		if (!list_empty(&wq->flusher_overflow)) {
			/*
			 * Assign the same color to all overflowed
			 * flushers, advance work_color and append to
			 * flusher_queue.  This is the start-to-wait
			 * phase for these overflowed flushers.
			 */
			list_for_each_entry(tmp, &wq->flusher_overflow, list)
				tmp->flush_color = wq->work_color;

			wq->work_color = work_next_color(wq->work_color);

			list_splice_tail_init(&wq->flusher_overflow,
					      &wq->flusher_queue);
2652
			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2653 2654 2655
		}

		if (list_empty(&wq->flusher_queue)) {
2656
			WARN_ON_ONCE(wq->flush_color != wq->work_color);
2657 2658 2659 2660 2661
			break;
		}

		/*
		 * Need to flush more colors.  Make the next flusher
2662
		 * the new first flusher and arm pwqs.
2663
		 */
2664 2665
		WARN_ON_ONCE(wq->flush_color == wq->work_color);
		WARN_ON_ONCE(wq->flush_color != next->flush_color);
2666 2667 2668 2669

		list_del_init(&next->list);
		wq->first_flusher = next;

2670
		if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681
			break;

		/*
		 * Meh... this color is already done, clear first
		 * flusher and repeat cascading.
		 */
		wq->first_flusher = NULL;
	}

out_unlock:
	mutex_unlock(&wq->flush_mutex);
L
Linus Torvalds 已提交
2682
}
2683
EXPORT_SYMBOL_GPL(flush_workqueue);
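
/*
 * Illustrative sketch, not part of the original file: a driver with a
 * private workqueue typically flushes it at a quiescent point such as
 * suspend so that every work item queued so far has finished before the
 * hardware is powered down.  "mydrv_wq" and "mydrv_suspend" are
 * hypothetical names used only for this example.
 *
 *	static struct workqueue_struct *mydrv_wq;
 *
 *	static int mydrv_suspend(struct device *dev)
 *	{
 *		flush_workqueue(mydrv_wq);
 *		return 0;
 *	}
 */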

2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698
/**
 * drain_workqueue - drain a workqueue
 * @wq: workqueue to drain
 *
 * Wait until the workqueue becomes empty.  While draining is in progress,
 * only chain queueing is allowed.  IOW, only currently pending or running
 * work items on @wq can queue further work items on it.  @wq is flushed
 * repeatedly until it becomes empty.  The number of flushes is determined
 * by the depth of chaining and should be relatively short.  Whine if it
 * takes too long.
 */
void drain_workqueue(struct workqueue_struct *wq)
{
	unsigned int flush_cnt = 0;
2699
	struct pool_workqueue *pwq;
2700 2701 2702 2703 2704 2705

	/*
	 * __queue_work() needs to test whether there are drainers, is much
	 * hotter than drain_workqueue() and already looks at @wq->flags.
	 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
	 */
2706
	spin_lock_irq(&workqueue_lock);
2707 2708
	if (!wq->nr_drainers++)
		wq->flags |= WQ_DRAINING;
2709
	spin_unlock_irq(&workqueue_lock);
2710 2711 2712
reflush:
	flush_workqueue(wq);

2713
	for_each_pwq(pwq, wq) {
2714
		bool drained;
2715

2716 2717 2718
		spin_lock_irq(&pwq->pool->lock);
		drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
		spin_unlock_irq(&pwq->pool->lock);
2719 2720

		if (drained)
2721 2722 2723 2724
			continue;

		if (++flush_cnt == 10 ||
		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
			pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
				wq->name, flush_cnt);
2727 2728 2729
		goto reflush;
	}

2730
	spin_lock_irq(&workqueue_lock);
2731 2732
	if (!--wq->nr_drainers)
		wq->flags &= ~WQ_DRAINING;
2733
	spin_unlock_irq(&workqueue_lock);
2734 2735 2736
}
EXPORT_SYMBOL_GPL(drain_workqueue);

2737
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2738
{
2739
	struct worker *worker = NULL;
2740
	struct worker_pool *pool;
2741
	struct pool_workqueue *pwq;
2742 2743

	might_sleep();
2744 2745
	pool = get_work_pool(work);
	if (!pool)
2746
		return false;
2747

2748
	spin_lock_irq(&pool->lock);
2749
	/* see the comment in try_to_grab_pending() with the same code */
2750 2751 2752
	pwq = get_work_pwq(work);
	if (pwq) {
		if (unlikely(pwq->pool != pool))
T
Tejun Heo 已提交
2753
			goto already_gone;
2754
	} else {
2755
		worker = find_worker_executing_work(pool, work);
2756
		if (!worker)
T
Tejun Heo 已提交
2757
			goto already_gone;
2758
		pwq = worker->current_pwq;
2759
	}
2760

2761
	insert_wq_barrier(pwq, barr, work, worker);
2762
	spin_unlock_irq(&pool->lock);
2763

2764 2765 2766 2767 2768 2769
	/*
	 * If @max_active is 1 or rescuer is in use, flushing another work
	 * item on the same workqueue may lead to deadlock.  Make sure the
	 * flusher is not running on the same workqueue by verifying write
	 * access.
	 */
2770 2771
	if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
		lock_map_acquire(&pwq->wq->lockdep_map);
2772
	else
2773 2774
		lock_map_acquire_read(&pwq->wq->lockdep_map);
	lock_map_release(&pwq->wq->lockdep_map);
2775

2776
	return true;
T
Tejun Heo 已提交
2777
already_gone:
2778
	spin_unlock_irq(&pool->lock);
2779
	return false;
2780
}
2781 2782 2783 2784 2785

/**
 * flush_work - wait for a work to finish executing the last queueing instance
 * @work: the work to flush
 *
2786 2787
 * Wait until @work has finished execution.  @work is guaranteed to be idle
 * on return if it hasn't been requeued since flush started.
2788 2789 2790 2791 2792 2793 2794 2795 2796
 *
 * RETURNS:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_work(struct work_struct *work)
{
	struct wq_barrier barr;

2797 2798 2799
	lock_map_acquire(&work->lockdep_map);
	lock_map_release(&work->lockdep_map);

2800
	if (start_flush_work(work, &barr)) {
2801 2802 2803
		wait_for_completion(&barr.done);
		destroy_work_on_stack(&barr.work);
		return true;
2804
	} else {
2805
		return false;
2806 2807
	}
}
2808
EXPORT_SYMBOL_GPL(flush_work);
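
/*
 * Illustrative sketch, not part of the original file: once the source of
 * new queueings has been shut off (e.g. the interrupt is disabled),
 * flush_work() waits for the last queued instance before the embedding
 * object is torn down.  "struct my_obj" is a hypothetical structure used
 * only for this example.
 *
 *	static void my_obj_quiesce(struct my_obj *obj)
 *	{
 *		disable_irq(obj->irq);
 *		flush_work(&obj->work);
 *	}
 */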

2810
static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2811
{
2812
	unsigned long flags;
2813 2814 2815
	int ret;

	do {
2816 2817 2818 2819 2820 2821
		ret = try_to_grab_pending(work, is_dwork, &flags);
		/*
		 * If someone else is canceling, wait for the same event it
		 * would be waiting for before retrying.
		 */
		if (unlikely(ret == -ENOENT))
2822
			flush_work(work);
2823 2824
	} while (unlikely(ret < 0));

2825 2826 2827 2828
	/* tell other tasks trying to grab @work to back off */
	mark_work_canceling(work);
	local_irq_restore(flags);

2829
	flush_work(work);
2830
	clear_work_data(work);
2831 2832 2833
	return ret;
}

2834
/**
2835 2836
 * cancel_work_sync - cancel a work and wait for it to finish
 * @work: the work to cancel
2837
 *
2838 2839 2840 2841
 * Cancel @work and wait for its execution to finish.  This function
 * can be used even if the work re-queues itself or migrates to
 * another workqueue.  On return from this function, @work is
 * guaranteed to be not pending or executing on any CPU.
2842
 *
2843 2844
 * cancel_work_sync(&delayed_work->work) must not be used for
 * delayed_work's.  Use cancel_delayed_work_sync() instead.
2845
 *
2846
 * The caller must ensure that the workqueue on which @work was last
2847
 * queued can't be destroyed before this function returns.
2848 2849 2850
 *
 * RETURNS:
 * %true if @work was pending, %false otherwise.
2851
 */
2852
bool cancel_work_sync(struct work_struct *work)
2853
{
2854
	return __cancel_work_timer(work, false);
O
Oleg Nesterov 已提交
2855
}
2856
EXPORT_SYMBOL_GPL(cancel_work_sync);
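
/*
 * Illustrative sketch, not part of the original file: cancel_work_sync()
 * is the usual choice in teardown paths where the work item may still be
 * pending, running, or may re-queue itself.  "struct my_obj" is a
 * hypothetical structure used only for this example.
 *
 *	static void my_obj_destroy(struct my_obj *obj)
 *	{
 *		cancel_work_sync(&obj->work);
 *		kfree(obj);
 *	}
 */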

2858
/**
2859 2860
 * flush_delayed_work - wait for a dwork to finish executing the last queueing
 * @dwork: the delayed work to flush
2861
 *
2862 2863 2864
 * Delayed timer is cancelled and the pending work is queued for
 * immediate execution.  Like flush_work(), this function only
 * considers the last queueing instance of @dwork.
2865
 *
2866 2867 2868
 * RETURNS:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
2869
 */
2870 2871
bool flush_delayed_work(struct delayed_work *dwork)
{
2872
	local_irq_disable();
2873
	if (del_timer_sync(&dwork->timer))
2874
		__queue_work(dwork->cpu, dwork->wq, &dwork->work);
2875
	local_irq_enable();
2876 2877 2878 2879
	return flush_work(&dwork->work);
}
EXPORT_SYMBOL(flush_delayed_work);

/**
 * cancel_delayed_work - cancel a delayed work
 * @dwork: delayed_work to cancel
 *
 * Kill off a pending delayed_work.  Returns %true if @dwork was pending
 * and canceled; %false if it wasn't pending.  Note that the work callback
 * function may still be running on return, unless it returns %true and the
 * work doesn't re-arm itself.  Explicitly flush or use
 * cancel_delayed_work_sync() to wait on it.
 *
 * This function is safe to call from any context including IRQ handler.
 */
bool cancel_delayed_work(struct delayed_work *dwork)
2893
{
2894 2895 2896 2897 2898 2899 2900 2901 2902 2903
	unsigned long flags;
	int ret;

	do {
		ret = try_to_grab_pending(&dwork->work, true, &flags);
	} while (unlikely(ret == -EAGAIN));

	if (unlikely(ret < 0))
		return false;

2904 2905
	set_work_pool_and_clear_pending(&dwork->work,
					get_work_pool_id(&dwork->work));
2906
	local_irq_restore(flags);
2907
	return ret;
2908
}
2909
EXPORT_SYMBOL(cancel_delayed_work);
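
/*
 * Illustrative sketch, not part of the original file: because
 * cancel_delayed_work() may return while the callback is still running,
 * it fits paths that only want to stop or push back a future execution,
 * e.g. kicking a timeout further into the future.  "my_timeout_work" and
 * MY_TIMEOUT are hypothetical names used only for this example.
 *
 *	static struct delayed_work my_timeout_work;
 *
 *	static void my_kick_timeout(void)
 *	{
 *		cancel_delayed_work(&my_timeout_work);
 *		schedule_delayed_work(&my_timeout_work, MY_TIMEOUT);
 *	}
 */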

2911 2912 2913 2914 2915 2916 2917 2918 2919 2920
/**
 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 * @dwork: the delayed work cancel
 *
 * This is cancel_work_sync() for delayed works.
 *
 * RETURNS:
 * %true if @dwork was pending, %false otherwise.
 */
bool cancel_delayed_work_sync(struct delayed_work *dwork)
2921
{
2922
	return __cancel_work_timer(&dwork->work, true);
2923
}
2924
EXPORT_SYMBOL(cancel_delayed_work_sync);
L
Linus Torvalds 已提交
2925

2926
/**
2927 2928 2929 2930 2931 2932
 * schedule_work_on - put work task on a specific cpu
 * @cpu: cpu to put the work task on
 * @work: job to be done
 *
 * This puts a job on a specific cpu
 */
2933
bool schedule_work_on(int cpu, struct work_struct *work)
2934
{
2935
	return queue_work_on(cpu, system_wq, work);
2936 2937 2938
}
EXPORT_SYMBOL(schedule_work_on);

2939 2940 2941 2942
/**
 * schedule_work - put work task in global workqueue
 * @work: job to be done
 *
2943 2944
 * Returns %false if @work was already on the kernel-global workqueue and
 * %true otherwise.
2945 2946 2947 2948
 *
 * This puts a job in the kernel-global workqueue if it was not already
 * queued and leaves it in the same position on the kernel-global
 * workqueue otherwise.
2949
 */
2950
bool schedule_work(struct work_struct *work)
L
Linus Torvalds 已提交
2951
{
2952
	return queue_work(system_wq, work);
L
Linus Torvalds 已提交
2953
}
2954
EXPORT_SYMBOL(schedule_work);
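
/*
 * Illustrative sketch, not part of the original file: a classic use of
 * schedule_work() is deferring the heavyweight part of an interrupt to
 * process context.  "my_irq_work_fn" and "my_irq_handler" are
 * hypothetical names used only for this example.
 *
 *	static void my_irq_work_fn(struct work_struct *work);
 *	static DECLARE_WORK(my_irq_work, my_irq_work_fn);
 *
 *	static irqreturn_t my_irq_handler(int irq, void *data)
 *	{
 *		schedule_work(&my_irq_work);
 *		return IRQ_HANDLED;
 *	}
 */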

2956 2957 2958
/**
 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
 * @cpu: cpu to use
2959
 * @dwork: job to be done
2960 2961 2962 2963 2964
 * @delay: number of jiffies to wait
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue on the specified CPU.
 */
2965 2966
bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
			      unsigned long delay)
L
Linus Torvalds 已提交
2967
{
2968
	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
L
Linus Torvalds 已提交
2969
}
2970
EXPORT_SYMBOL(schedule_delayed_work_on);
L
Linus Torvalds 已提交
2971

2972 2973
/**
 * schedule_delayed_work - put work task in global workqueue after delay
2974 2975
 * @dwork: job to be done
 * @delay: number of jiffies to wait or 0 for immediate execution
2976 2977 2978 2979
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue.
 */
2980
bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
L
Linus Torvalds 已提交
2981
{
2982
	return queue_delayed_work(system_wq, dwork, delay);
L
Linus Torvalds 已提交
2983
}
2984
EXPORT_SYMBOL(schedule_delayed_work);
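
/*
 * Illustrative sketch, not part of the original file: a self-rearming
 * poll loop built on schedule_delayed_work().  The handler polls the
 * device and then re-arms itself.  "my_poll_fn" and the one-second
 * period are hypothetical.
 *
 *	static void my_poll_fn(struct work_struct *work);
 *	static DECLARE_DELAYED_WORK(my_poll_work, my_poll_fn);
 *
 *	static void my_poll_fn(struct work_struct *work)
 *	{
 *		my_poll_device();
 *		schedule_delayed_work(&my_poll_work, msecs_to_jiffies(1000));
 *	}
 */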

2986
/**
2987
 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2988 2989
 * @func: the function to call
 *
2990 2991
 * schedule_on_each_cpu() executes @func on each online CPU using the
 * system workqueue and blocks until all CPUs have completed.
2992
 * schedule_on_each_cpu() is very slow.
2993 2994 2995
 *
 * RETURNS:
 * 0 on success, -errno on failure.
2996
 */
2997
int schedule_on_each_cpu(work_func_t func)
2998 2999
{
	int cpu;
3000
	struct work_struct __percpu *works;
3001

3002 3003
	works = alloc_percpu(struct work_struct);
	if (!works)
3004
		return -ENOMEM;
3005

3006 3007
	get_online_cpus();

3008
	for_each_online_cpu(cpu) {
3009 3010 3011
		struct work_struct *work = per_cpu_ptr(works, cpu);

		INIT_WORK(work, func);
3012
		schedule_work_on(cpu, work);
3013
	}
3014 3015 3016 3017

	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(works, cpu));

3018
	put_online_cpus();
3019
	free_percpu(works);
3020 3021 3022
	return 0;
}
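
/*
 * Illustrative sketch, not part of the original file: a caller that must
 * run a short function on every online CPU, for example to drain per-cpu
 * caches, and wait for all of them to finish.  "my_drain_cpu_cache" is a
 * hypothetical function used only for this example.
 *
 *	static void my_drain_cpu_cache(struct work_struct *unused)
 *	{
 *		my_drain_this_cpu();
 *	}
 *
 *	int ret = schedule_on_each_cpu(my_drain_cpu_cache);
 */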

3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046
/**
 * flush_scheduled_work - ensure that any scheduled work has run to completion.
 *
 * Forces execution of the kernel-global workqueue and blocks until its
 * completion.
 *
 * Think twice before calling this function!  It's very easy to get into
 * trouble if you don't take great care.  Either of the following situations
 * will lead to deadlock:
 *
 *	One of the work items currently on the workqueue needs to acquire
 *	a lock held by your code or its caller.
 *
 *	Your code is running in the context of a work routine.
 *
 * They will be detected by lockdep when they occur, but the first might not
 * occur very often.  It depends on what work items are on the workqueue and
 * what locks they need, which you have no control over.
 *
 * In most situations flushing the entire workqueue is overkill; you merely
 * need to know that a particular work item isn't queued and isn't running.
 * In such cases you should use cancel_delayed_work_sync() or
 * cancel_work_sync() instead.
 */
L
Linus Torvalds 已提交
3047 3048
void flush_scheduled_work(void)
{
3049
	flush_workqueue(system_wq);
L
Linus Torvalds 已提交
3050
}
3051
EXPORT_SYMBOL(flush_scheduled_work);
L
Linus Torvalds 已提交
3052

3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064
/**
 * execute_in_process_context - reliably execute the routine with user context
 * @fn:		the function to execute
 * @ew:		guaranteed storage for the execute work structure (must
 *		be available when the work executes)
 *
 * Executes the function immediately if process context is available,
 * otherwise schedules the function for delayed execution.
 *
 * Returns:	0 - function was executed
 *		1 - function was scheduled for execution
 */
3065
int execute_in_process_context(work_func_t fn, struct execute_work *ew)
3066 3067
{
	if (!in_interrupt()) {
3068
		fn(&ew->work);
3069 3070 3071
		return 0;
	}

3072
	INIT_WORK(&ew->work, fn);
3073 3074 3075 3076 3077 3078
	schedule_work(&ew->work);

	return 1;
}
EXPORT_SYMBOL_GPL(execute_in_process_context);
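
/*
 * Illustrative sketch, not part of the original file: a release path that
 * may be entered from interrupt context can use
 * execute_in_process_context() to run immediately when possible and fall
 * back to the workqueue otherwise.  "my_release" and "struct my_obj"
 * (embedding a struct execute_work named "ew") are hypothetical.
 *
 *	static void my_release(struct work_struct *work)
 *	{
 *		struct my_obj *obj = container_of(work, struct my_obj, ew.work);
 *
 *		kfree(obj);
 *	}
 *
 *	execute_in_process_context(my_release, &obj->ew);
 */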

int keventd_up(void)
{
3081
	return system_wq != NULL;
L
Linus Torvalds 已提交
3082 3083
}

3084
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
T
Tejun Heo 已提交
3085
{
3086
	bool highpri = wq->flags & WQ_HIGHPRI;
3087 3088 3089
	int cpu;

	if (!(wq->flags & WQ_UNBOUND)) {
3090
		wq->pool_wq.pcpu = alloc_percpu(struct pool_workqueue);
3091 3092 3093 3094 3095
		if (!wq->pool_wq.pcpu)
			return -ENOMEM;

		for_each_possible_cpu(cpu) {
			struct pool_workqueue *pwq = get_pwq(cpu, wq);
3096

3097
			pwq->pool = get_std_worker_pool(cpu, highpri);
3098 3099 3100 3101 3102 3103 3104 3105 3106 3107
			list_add_tail(&pwq->pwqs_node, &wq->pwqs);
		}
	} else {
		struct pool_workqueue *pwq;

		pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL);
		if (!pwq)
			return -ENOMEM;

		wq->pool_wq.single = pwq;
3108
		pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri);
3109 3110 3111 3112
		list_add_tail(&pwq->pwqs_node, &wq->pwqs);
	}

	return 0;
T
Tejun Heo 已提交
3113 3114
}

3115
static void free_pwqs(struct workqueue_struct *wq)
T
Tejun Heo 已提交
3116
{
3117
	if (!(wq->flags & WQ_UNBOUND))
3118
		free_percpu(wq->pool_wq.pcpu);
3119 3120
	else
		kmem_cache_free(pwq_cache, wq->pool_wq.single);
T
Tejun Heo 已提交
3121 3122
}

3123 3124
static int wq_clamp_max_active(int max_active, unsigned int flags,
			       const char *name)
3125
{
3126 3127 3128
	int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;

	if (max_active < 1 || max_active > lim)
		pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
			max_active, name, 1, lim);
3131

3132
	return clamp_val(max_active, 1, lim);
3133 3134
}

3135
struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3136 3137 3138
					       unsigned int flags,
					       int max_active,
					       struct lock_class_key *key,
3139
					       const char *lock_name, ...)
L
Linus Torvalds 已提交
3140
{
3141
	va_list args, args1;
L
Linus Torvalds 已提交
3142
	struct workqueue_struct *wq;
3143
	struct pool_workqueue *pwq;
3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157
	size_t namelen;

	/* determine namelen, allocate wq and format name */
	va_start(args, lock_name);
	va_copy(args1, args);
	namelen = vsnprintf(NULL, 0, fmt, args) + 1;

	wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
	if (!wq)
		goto err;

	vsnprintf(wq->name, namelen, fmt, args1);
	va_end(args);
	va_end(args1);
L
Linus Torvalds 已提交
3158

3159 3160 3161 3162 3163 3164 3165
	/*
	 * Workqueues which may be used during memory reclaim should
	 * have a rescuer to guarantee forward progress.
	 */
	if (flags & WQ_MEM_RECLAIM)
		flags |= WQ_RESCUER;

3166
	max_active = max_active ?: WQ_DFL_ACTIVE;
3167
	max_active = wq_clamp_max_active(max_active, flags, wq->name);
3168

3169
	/* init wq */
3170
	wq->flags = flags;
3171
	wq->saved_max_active = max_active;
3172
	mutex_init(&wq->flush_mutex);
3173
	atomic_set(&wq->nr_pwqs_to_flush, 0);
3174
	INIT_LIST_HEAD(&wq->pwqs);
3175 3176
	INIT_LIST_HEAD(&wq->flusher_queue);
	INIT_LIST_HEAD(&wq->flusher_overflow);
3177
	INIT_LIST_HEAD(&wq->maydays);
3178

3179
	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3180
	INIT_LIST_HEAD(&wq->list);
3181

3182
	if (alloc_and_link_pwqs(wq) < 0)
3183 3184
		goto err;

3185
	for_each_pwq(pwq, wq) {
3186 3187 3188 3189 3190
		BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
		pwq->wq = wq;
		pwq->flush_color = -1;
		pwq->max_active = max_active;
		INIT_LIST_HEAD(&pwq->delayed_works);
3191
		INIT_LIST_HEAD(&pwq->mayday_node);
3192
	}
T
Tejun Heo 已提交
3193

3194 3195 3196 3197 3198 3199 3200
	if (flags & WQ_RESCUER) {
		struct worker *rescuer;

		wq->rescuer = rescuer = alloc_worker();
		if (!rescuer)
			goto err;

3201 3202
		rescuer->rescue_wq = wq;
		rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3203
					       wq->name);
3204 3205 3206 3207 3208
		if (IS_ERR(rescuer->task))
			goto err;

		rescuer->task->flags |= PF_THREAD_BOUND;
		wake_up_process(rescuer->task);
3209 3210
	}

3211 3212 3213 3214 3215
	/*
	 * workqueue_lock protects global freeze state and workqueues
	 * list.  Grab it, set max_active accordingly and add the new
	 * workqueue to workqueues list.
	 */
3216
	spin_lock_irq(&workqueue_lock);
3217

3218
	if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3219 3220
		for_each_pwq(pwq, wq)
			pwq->max_active = 0;
3221

T
Tejun Heo 已提交
3222
	list_add(&wq->list, &workqueues);
3223

3224
	spin_unlock_irq(&workqueue_lock);
T
Tejun Heo 已提交
3225

3226
	return wq;
T
Tejun Heo 已提交
3227 3228
err:
	if (wq) {
3229
		free_pwqs(wq);
3230
		kfree(wq->rescuer);
T
Tejun Heo 已提交
3231 3232 3233
		kfree(wq);
	}
	return NULL;
3234
}
3235
EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
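
/*
 * Illustrative sketch, not part of the original file: callers normally go
 * through the alloc_workqueue() wrapper, which supplies the lockdep key
 * and lands here.  "mydrv" and "some_work" are hypothetical names used
 * only for this example.
 *
 *	struct workqueue_struct *wq;
 *
 *	wq = alloc_workqueue("mydrv", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
 *	if (!wq)
 *		return -ENOMEM;
 *	queue_work(wq, &some_work);
 */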

3237 3238 3239 3240 3241 3242 3243 3244
/**
 * destroy_workqueue - safely terminate a workqueue
 * @wq: target workqueue
 *
 * Safely destroy a workqueue. All work currently pending will be done first.
 */
void destroy_workqueue(struct workqueue_struct *wq)
{
3245
	struct pool_workqueue *pwq;
3246

3247 3248
	/* drain it before proceeding with destruction */
	drain_workqueue(wq);
3249

3250
	/* sanity checks */
3251
	for_each_pwq(pwq, wq) {
3252 3253 3254 3255 3256 3257 3258 3259 3260 3261
		int i;

		for (i = 0; i < WORK_NR_COLORS; i++)
			if (WARN_ON(pwq->nr_in_flight[i]))
				return;
		if (WARN_ON(pwq->nr_active) ||
		    WARN_ON(!list_empty(&pwq->delayed_works)))
			return;
	}

3262 3263 3264 3265
	/*
	 * wq list is used to freeze wq, remove from list after
	 * flushing is complete in case freeze races us.
	 */
3266
	spin_lock_irq(&workqueue_lock);
3267
	list_del(&wq->list);
3268
	spin_unlock_irq(&workqueue_lock);
3269

3270 3271
	if (wq->flags & WQ_RESCUER) {
		kthread_stop(wq->rescuer->task);
3272
		kfree(wq->rescuer);
3273 3274
	}

3275
	free_pwqs(wq);
3276 3277 3278 3279
	kfree(wq);
}
EXPORT_SYMBOL_GPL(destroy_workqueue);

3280
/**
3281 3282
 * pwq_set_max_active - adjust max_active of a pwq
 * @pwq: target pool_workqueue
3283 3284
 * @max_active: new max_active value.
 *
3285
 * Set @pwq->max_active to @max_active and activate delayed works if
3286 3287 3288
 * increased.
 *
 * CONTEXT:
3289
 * spin_lock_irq(pool->lock).
3290
 */
3291
static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3292
{
3293
	pwq->max_active = max_active;
3294

3295 3296 3297
	while (!list_empty(&pwq->delayed_works) &&
	       pwq->nr_active < pwq->max_active)
		pwq_activate_first_delayed(pwq);
3298 3299
}

3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311
/**
 * workqueue_set_max_active - adjust max_active of a workqueue
 * @wq: target workqueue
 * @max_active: new max_active value.
 *
 * Set max_active of @wq to @max_active.
 *
 * CONTEXT:
 * Don't call from IRQ context.
 */
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
	struct pool_workqueue *pwq;

	max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);

	spin_lock_irq(&workqueue_lock);

	wq->saved_max_active = max_active;

	for_each_pwq(pwq, wq) {
		struct worker_pool *pool = pwq->pool;

		spin_lock(&pool->lock);

		if (!(wq->flags & WQ_FREEZABLE) ||
		    !(pool->flags & POOL_FREEZING))
			pwq_set_max_active(pwq, max_active);

		spin_unlock(&pool->lock);
	}

	spin_unlock_irq(&workqueue_lock);
}
EXPORT_SYMBOL_GPL(workqueue_set_max_active);
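
/*
 * Illustrative sketch (not part of the original file): temporarily
 * throttling a workqueue to one in-flight work item and later restoring
 * a wider limit.  example_throttle_wq and its caller are hypothetical.
 */
static void __maybe_unused example_throttle_wq(struct workqueue_struct *wq)
{
	/* serialize execution, e.g. while a shared resource is contended */
	workqueue_set_max_active(wq, 1);

	/* ... queue and flush work as usual ... */

	/* restore the default concurrency limit afterwards */
	workqueue_set_max_active(wq, WQ_DFL_ACTIVE);
}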

/**
 * workqueue_congested - test whether a workqueue is congested
 * @cpu: CPU in question
 * @wq: target workqueue
 *
 * Test whether @wq's cpu workqueue for @cpu is congested.  There is
 * no synchronization around this function and the test result is
 * unreliable and only useful as advisory hints or for debugging.
 *
 * RETURNS:
 * %true if congested, %false otherwise.
 */
bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
{
	struct pool_workqueue *pwq = get_pwq(cpu, wq);

	return !list_empty(&pwq->delayed_works);
}
EXPORT_SYMBOL_GPL(workqueue_congested);
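
/*
 * Illustrative sketch (not part of the original file): treating the
 * congestion test purely as an advisory hint when deciding whether to
 * queue optional background work on a given CPU.  The example_* name is
 * hypothetical.
 */
static bool __maybe_unused example_queue_unless_congested(unsigned int cpu,
							   struct workqueue_struct *wq,
							   struct work_struct *work)
{
	/* advisory only: the answer may change right after the test */
	if (workqueue_congested(cpu, wq))
		return false;

	return queue_work_on(cpu, wq, work);
}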

/**
 * work_busy - test whether a work is currently pending or running
 * @work: the work to be tested
 *
 * Test whether @work is currently pending or running.  There is no
 * synchronization around this function and the test result is
 * unreliable and only useful as advisory hints or for debugging.
 *
 * RETURNS:
 * OR'd bitmask of WORK_BUSY_* bits.
 */
unsigned int work_busy(struct work_struct *work)
{
	struct worker_pool *pool = get_work_pool(work);
	unsigned long flags;
	unsigned int ret = 0;

	if (work_pending(work))
		ret |= WORK_BUSY_PENDING;

	if (pool) {
		spin_lock_irqsave(&pool->lock, flags);
		if (find_worker_executing_work(pool, work))
			ret |= WORK_BUSY_RUNNING;
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(work_busy);
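
/*
 * Illustrative sketch (not part of the original file): decoding the
 * advisory bitmask returned by work_busy() for debugging output.  The
 * example_* name is hypothetical.
 */
static void __maybe_unused example_report_work_state(struct work_struct *work)
{
	unsigned int busy = work_busy(work);

	pr_info("work %p: pending=%d running=%d\n", work,
		!!(busy & WORK_BUSY_PENDING),
		!!(busy & WORK_BUSY_RUNNING));
}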

/*
 * CPU hotplug.
 *
 * There are two challenges in supporting CPU hotplug.  Firstly, there
 * are a lot of assumptions on strong associations among work, pwq and
 * pool which make migrating pending and scheduled works very
 * difficult to implement without impacting hot paths.  Secondly,
 * worker pools serve a mix of short, long and very long running works,
 * making blocked draining impractical.
 *
 * This is solved by allowing the pools to be disassociated from the
 * CPU, running as unbound pools, and reattaching them later if the CPU
 * comes back online.
 */

static void wq_unbind_fn(struct work_struct *work)
{
	int cpu = smp_processor_id();
	struct worker_pool *pool;
	struct worker *worker;
	int i;

	for_each_std_worker_pool(pool, cpu) {
		WARN_ON_ONCE(cpu != smp_processor_id());

		mutex_lock(&pool->assoc_mutex);
		spin_lock_irq(&pool->lock);

		/*
		 * We've claimed all manager positions.  Make all workers
		 * unbound and set DISASSOCIATED.  Before this, all workers
		 * except for the ones which are still executing works from
		 * before the last CPU down must be on the cpu.  After
		 * this, they may become diasporas.
		 */
		list_for_each_entry(worker, &pool->idle_list, entry)
			worker->flags |= WORKER_UNBOUND;

		for_each_busy_worker(worker, i, pool)
			worker->flags |= WORKER_UNBOUND;

		pool->flags |= POOL_DISASSOCIATED;

		spin_unlock_irq(&pool->lock);
		mutex_unlock(&pool->assoc_mutex);
	}

	/*
	 * Call schedule() so that we cross rq->lock and thus can guarantee
	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
	 * as scheduler callbacks may be invoked from other cpus.
	 */
	schedule();

	/*
	 * Sched callbacks are disabled now.  Zap nr_running.  After this,
	 * nr_running stays zero and need_more_worker() and keep_working()
	 * are always true as long as the worklist is not empty.  Pools on
	 * @cpu now behave as unbound (in terms of concurrency management)
	 * pools which are served by workers tied to the CPU.
	 *
	 * On return from this function, the current worker would trigger
	 * unbound chain execution of pending work items if other workers
	 * didn't already.
	 */
	for_each_std_worker_pool(pool, cpu)
		atomic_set(&pool->nr_running, 0);
}

/*
 * Workqueues should be brought up before normal priority CPU notifiers.
 * This will be registered as a high priority CPU notifier.
 */
static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
					       unsigned long action,
					       void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct worker_pool *pool;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		for_each_std_worker_pool(pool, cpu) {
			struct worker *worker;

			if (pool->nr_workers)
				continue;

			worker = create_worker(pool);
			if (!worker)
				return NOTIFY_BAD;

			spin_lock_irq(&pool->lock);
			start_worker(worker);
			spin_unlock_irq(&pool->lock);
		}
		break;

	case CPU_DOWN_FAILED:
	case CPU_ONLINE:
		for_each_std_worker_pool(pool, cpu) {
			mutex_lock(&pool->assoc_mutex);
			spin_lock_irq(&pool->lock);

			pool->flags &= ~POOL_DISASSOCIATED;
			rebind_workers(pool);

			spin_unlock_irq(&pool->lock);
			mutex_unlock(&pool->assoc_mutex);
		}
		break;
	}
	return NOTIFY_OK;
}

/*
 * Workqueues should be brought down after normal priority CPU notifiers.
 * This will be registered as a low priority CPU notifier.
 */
static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
						 unsigned long action,
						 void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct work_struct unbind_work;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/* unbinding should happen on the local CPU */
		INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
		queue_work_on(cpu, system_highpri_wq, &unbind_work);
		flush_work(&unbind_work);
		break;
	}
	return NOTIFY_OK;
}

#ifdef CONFIG_SMP

struct work_for_cpu {
	struct work_struct work;
	long (*fn)(void *);
	void *arg;
	long ret;
};

static void work_for_cpu_fn(struct work_struct *work)
{
	struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);

	wfc->ret = wfc->fn(wfc->arg);
}

/**
 * work_on_cpu - run a function in user context on a particular cpu
 * @cpu: the cpu to run on
 * @fn: the function to run
 * @arg: the function arg
 *
 *
 * This will return the value @fn returns.
 * It is up to the caller to ensure that the cpu doesn't go offline.
 * The caller must not hold any locks which would prevent @fn from completing.
 */
long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
{
	struct work_for_cpu wfc = { .fn = fn, .arg = arg };

	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
	schedule_work_on(cpu, &wfc.work);
	flush_work(&wfc.work);
	return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
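
/*
 * Illustrative sketch (not part of the original file): running a helper
 * on a specific CPU and collecting its return value through
 * work_on_cpu().  The example_* names are hypothetical; the caller is
 * assumed to keep @cpu online (e.g. via get_online_cpus()).
 */
static long example_double_on_cpu_fn(void *arg)
{
	long *val = arg;

	/* runs in a bound worker's process context on the requested CPU */
	return *val * 2;
}

static long __maybe_unused example_double_on_cpu(unsigned int cpu, long *val)
{
	return work_on_cpu(cpu, example_double_on_cpu_fn, val);
}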
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER

/**
 * freeze_workqueues_begin - begin freezing workqueues
 *
 * Start freezing workqueues.  After this function returns, all freezable
 * workqueues will queue new works to their pwq->delayed_works list
 * instead of pool->worklist.
 *
 * CONTEXT:
 * Grabs and releases workqueue_lock and pool->lock's.
 */
void freeze_workqueues_begin(void)
{
	struct worker_pool *pool;
	struct workqueue_struct *wq;
	struct pool_workqueue *pwq;
	int id;

	spin_lock_irq(&workqueue_lock);

	WARN_ON_ONCE(workqueue_freezing);
	workqueue_freezing = true;

	/* set FREEZING */
	for_each_pool(pool, id) {
		spin_lock(&pool->lock);
		WARN_ON_ONCE(pool->flags & POOL_FREEZING);
		pool->flags |= POOL_FREEZING;
		spin_unlock(&pool->lock);
	}

	/* suppress further executions by setting max_active to zero */
	list_for_each_entry(wq, &workqueues, list) {
		if (!(wq->flags & WQ_FREEZABLE))
			continue;

		for_each_pwq(pwq, wq) {
			spin_lock(&pwq->pool->lock);
			pwq->max_active = 0;
			spin_unlock(&pwq->pool->lock);
		}
	}

	spin_unlock_irq(&workqueue_lock);
}

/**
 * freeze_workqueues_busy - are freezable workqueues still busy?
 *
 * Check whether freezing is complete.  This function must be called
 * between freeze_workqueues_begin() and thaw_workqueues().
 *
 * CONTEXT:
 * Grabs and releases workqueue_lock.
 *
 * RETURNS:
 * %true if some freezable workqueues are still busy.  %false if freezing
 * is complete.
 */
bool freeze_workqueues_busy(void)
{
	bool busy = false;
	struct workqueue_struct *wq;
	struct pool_workqueue *pwq;

	spin_lock_irq(&workqueue_lock);

	WARN_ON_ONCE(!workqueue_freezing);

	list_for_each_entry(wq, &workqueues, list) {
		if (!(wq->flags & WQ_FREEZABLE))
			continue;
		/*
		 * nr_active is monotonically decreasing.  It's safe
		 * to peek without lock.
		 */
		for_each_pwq(pwq, wq) {
			WARN_ON_ONCE(pwq->nr_active < 0);
			if (pwq->nr_active) {
				busy = true;
				goto out_unlock;
			}
		}
	}
out_unlock:
	spin_unlock_irq(&workqueue_lock);
	return busy;
}

/**
 * thaw_workqueues - thaw workqueues
 *
 * Thaw workqueues.  Normal queueing is restored and all collected
 * frozen works are transferred to their respective pool worklists.
 *
 * CONTEXT:
 * Grabs and releases workqueue_lock and pool->lock's.
 */
void thaw_workqueues(void)
{
	struct workqueue_struct *wq;
	struct pool_workqueue *pwq;
	struct worker_pool *pool;
	int id;

	spin_lock_irq(&workqueue_lock);

	if (!workqueue_freezing)
		goto out_unlock;

	/* clear FREEZING */
	for_each_pool(pool, id) {
		spin_lock(&pool->lock);
		WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
		pool->flags &= ~POOL_FREEZING;
		spin_unlock(&pool->lock);
	}

	/* restore max_active and repopulate worklist */
	list_for_each_entry(wq, &workqueues, list) {
		if (!(wq->flags & WQ_FREEZABLE))
			continue;

		for_each_pwq(pwq, wq) {
			spin_lock(&pwq->pool->lock);
			pwq_set_max_active(pwq, wq->saved_max_active);
			spin_unlock(&pwq->pool->lock);
		}
	}

	/* kick workers */
	for_each_pool(pool, id) {
		spin_lock(&pool->lock);
		wake_up_worker(pool);
		spin_unlock(&pool->lock);
	}

	workqueue_freezing = false;
out_unlock:
	spin_unlock_irq(&workqueue_lock);
}
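
/*
 * Illustrative sketch (not part of the original file): the order in which
 * a suspend-style caller would be expected to drive the freezer hooks
 * above.  A real caller would typically retry with a timeout instead of
 * giving up after one busy check; the example_* name is hypothetical.
 */
static bool __maybe_unused example_try_freeze_wqs(void)
{
	freeze_workqueues_begin();

	/* freezable workqueues still have active work items; back out */
	if (freeze_workqueues_busy()) {
		thaw_workqueues();
		return false;
	}
	return true;
}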
#endif /* CONFIG_FREEZER */

static int __init init_workqueues(void)
{
	unsigned int cpu;

	/* make sure we have enough bits for OFFQ pool ID */
	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
		     WORK_CPU_END * NR_STD_WORKER_POOLS);

	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);

	/* initialize CPU pools */
	for_each_wq_cpu(cpu) {
		struct worker_pool *pool;

		for_each_std_worker_pool(pool, cpu) {
			spin_lock_init(&pool->lock);
			pool->cpu = cpu;
			pool->flags |= POOL_DISASSOCIATED;
			INIT_LIST_HEAD(&pool->worklist);
			INIT_LIST_HEAD(&pool->idle_list);
			hash_init(pool->busy_hash);

			init_timer_deferrable(&pool->idle_timer);
			pool->idle_timer.function = idle_worker_timeout;
			pool->idle_timer.data = (unsigned long)pool;

			setup_timer(&pool->mayday_timer, pool_mayday_timeout,
				    (unsigned long)pool);

			mutex_init(&pool->assoc_mutex);
			ida_init(&pool->worker_ida);

			/* alloc pool ID */
			BUG_ON(worker_pool_assign_id(pool));
		}
	}

	/* create the initial worker */
	for_each_online_wq_cpu(cpu) {
		struct worker_pool *pool;

		for_each_std_worker_pool(pool, cpu) {
			struct worker *worker;

			if (cpu != WORK_CPU_UNBOUND)
				pool->flags &= ~POOL_DISASSOCIATED;

			worker = create_worker(pool);
			BUG_ON(!worker);
			spin_lock_irq(&pool->lock);
			start_worker(worker);
			spin_unlock_irq(&pool->lock);
		}
	}

	system_wq = alloc_workqueue("events", 0, 0);
	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
	system_long_wq = alloc_workqueue("events_long", 0, 0);
	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
					    WQ_UNBOUND_MAX_ACTIVE);
	system_freezable_wq = alloc_workqueue("events_freezable",
					      WQ_FREEZABLE, 0);
	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
	       !system_unbound_wq || !system_freezable_wq);
	return 0;
}
early_initcall(init_workqueues);