#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

struct backing_dev_info default_backing_dev_info = {
	.name		= "default",
	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
	.state		= 0,
	.capabilities	= BDI_CAP_MAP_COPY,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

struct backing_dev_info noop_backing_dev_info = {
	.name		= "noop",
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;

/*
 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
 * locking.
 */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);

void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
{
	if (wb1 < wb2) {
		spin_lock(&wb1->list_lock);
		spin_lock_nested(&wb2->list_lock, 1);
	} else {
		spin_lock(&wb2->list_lock);
		spin_lock_nested(&wb1->list_lock, 1);
	}
}
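
/*
 * Note: bdi_lock_two() orders the two list_lock acquisitions by ascending
 * pointer address, so any two callers locking the same pair of bdi_writeback
 * structures take the locks in the same order and cannot deadlock against
 * each other (see bdi_destroy(), which uses this before splicing one bdi's
 * dirty lists into another's).
 */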

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	struct bdi_writeback *wb = &bdi->wb;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long bdi_thresh;
	unsigned long nr_dirty, nr_io, nr_more_io;
	struct inode *inode;

	nr_dirty = nr_io = nr_more_io = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
		nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_wb_list)
		nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
		nr_more_io++;
	spin_unlock(&wb->list_lock);

	global_dirty_limits(&background_thresh, &dirty_thresh);
	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

#define K(x) ((x) << (PAGE_SHIFT - 10))
	seq_printf(m,
		   "BdiWriteback:       %10lu kB\n"
		   "BdiReclaimable:     %10lu kB\n"
		   "BdiDirtyThresh:     %10lu kB\n"
		   "DirtyThresh:        %10lu kB\n"
		   "BackgroundThresh:   %10lu kB\n"
		   "BdiDirtied:         %10lu kB\n"
		   "BdiWritten:         %10lu kB\n"
		   "BdiWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:            %10lu\n"
		   "b_io:               %10lu\n"
		   "b_more_io:          %10lu\n"
		   "bdi_list:           %10u\n"
		   "state:              %10lx\n",
		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
		   K(bdi_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
		   (unsigned long) K(bdi->write_bandwidth),
		   nr_dirty,
		   nr_io,
		   nr_more_io,
		   !list_empty(&bdi->bdi_list), bdi->state);
#undef K

	return 0;
}

static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
	.open		= bdi_debug_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
	bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
					       bdi, &bdi_debug_stats_fops);
}
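
/*
 * With CONFIG_DEBUG_FS enabled, the above exposes the per-bdi writeback
 * counters under <debugfs>/bdi/<device name>/stats (typically something like
 * /sys/kernel/debug/bdi/8:0/stats for a block device, although the debugfs
 * mount point may differ).
 */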

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove(bdi->debug_stats);
	debugfs_remove(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

static ssize_t read_ahead_kb_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

#define K(pages) ((pages) << (PAGE_SHIFT - 10))

#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *page)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);	\
}

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)
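
/*
 * min_ratio and max_ratio are exposed through sysfs (/sys/class/bdi/<dev>/)
 * alongside read_ahead_kb; they bound the share of the global dirty threshold
 * this bdi may consume, as implemented by bdi_set_min_ratio() and
 * bdi_set_max_ratio() in mm/page-writeback.c.
 */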

#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)

static struct device_attribute bdi_dev_attrs[] = {
	__ATTR_RW(read_ahead_kb),
	__ATTR_RW(min_ratio),
	__ATTR_RW(max_ratio),
	__ATTR_NULL,
};

static __init int bdi_class_init(void)
{
	bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(bdi_class))
		return PTR_ERR(bdi_class);

	bdi_class->dev_attrs = bdi_dev_attrs;
	bdi_debug_init();
	return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
	int err;

	err = bdi_init(&default_backing_dev_info);
	if (!err)
		bdi_register(&default_backing_dev_info, NULL, "default");
	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);

int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
	return wb_has_dirty_io(&bdi->wb);
}

static void wakeup_timer_fn(unsigned long data)
{
	struct backing_dev_info *bdi = (struct backing_dev_info *)data;

	spin_lock_bh(&bdi->wb_lock);
	if (bdi->wb.task) {
		trace_writeback_wake_thread(bdi);
		wake_up_process(bdi->wb.task);
	} else if (bdi->dev) {
		/*
		 * When bdi threads are inactive for a long time, they are
		 * killed. In this case we have to wake up the forker thread
		 * which should create and run the bdi thread.
		 */
		trace_writeback_wake_forker_thread(bdi);
		wake_up_process(default_backing_dev_info.wb.task);
	}
	spin_unlock_bh(&bdi->wb_lock);
}

/*
 * This function is used when the first inode for this bdi is marked dirty. It
 * wakes up the corresponding bdi thread, which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}

/*
 * Calculate the longest interval (jiffies) bdi threads are allowed to be
 * inactive.
 */
static unsigned long bdi_longest_inactive(void)
{
	unsigned long interval;

	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
	return max(5UL * 60 * HZ, interval);
}

/*
 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
 * shutdown
 */
static void bdi_clear_pending(struct backing_dev_info *bdi)
{
	clear_bit(BDI_pending, &bdi->state);
	smp_mb__after_clear_bit();
	wake_up_bit(&bdi->state, BDI_pending);
}

static int bdi_forker_thread(void *ptr)
{
	struct bdi_writeback *me = ptr;

	current->flags |= PF_SWAPWRITE;
	set_freezable();

	/*
	 * Our parent may run at a different priority, just set us to normal
	 */
	set_user_nice(current, 0);

	for (;;) {
		struct task_struct *task = NULL;
		struct backing_dev_info *bdi;
		enum {
			NO_ACTION,   /* Nothing to do */
			FORK_THREAD, /* Fork bdi thread */
			KILL_THREAD, /* Kill inactive bdi thread */
		} action = NO_ACTION;

		/*
		 * Temporary measure, we want to make sure we don't see
		 * dirty data on the default backing_dev_info
		 */
		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
			del_timer(&me->wakeup_timer);
			wb_do_writeback(me, 0);
		}

		spin_lock_bh(&bdi_lock);
		/*
		 * In the following loop we are going to check whether we have
		 * some work to do without any synchronization with tasks
		 * waking us up to do work for them. Set the task state here
		 * so that we don't miss wakeups after verifying conditions.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		list_for_each_entry(bdi, &bdi_list, bdi_list) {
			bool have_dirty_io;

			if (!bdi_cap_writeback_dirty(bdi) ||
			     bdi_cap_flush_forker(bdi))
				continue;

			WARN(!test_bit(BDI_registered, &bdi->state),
			     "bdi %p/%s is not registered!\n", bdi, bdi->name);

			have_dirty_io = !list_empty(&bdi->work_list) ||
					wb_has_dirty_io(&bdi->wb);

			/*
			 * If the bdi has work to do, but the thread does not
			 * exist - create it.
			 */
			if (!bdi->wb.task && have_dirty_io) {
				/*
				 * Set the pending bit - if someone tries to
				 * unregister this bdi, the unregister path
				 * will wait on this bit.
				 */
				set_bit(BDI_pending, &bdi->state);
				action = FORK_THREAD;
				break;
			}

			spin_lock(&bdi->wb_lock);

			/*
			 * If there is no work to do and the bdi thread was
			 * inactive long enough - kill it. The wb_lock is taken
			 * to make sure no-one adds more work to this bdi and
			 * wakes the bdi thread up.
			 */
			if (bdi->wb.task && !have_dirty_io &&
			    time_after(jiffies, bdi->wb.last_active +
						bdi_longest_inactive())) {
				task = bdi->wb.task;
				bdi->wb.task = NULL;
				spin_unlock(&bdi->wb_lock);
				set_bit(BDI_pending, &bdi->state);
				action = KILL_THREAD;
				break;
			}
			spin_unlock(&bdi->wb_lock);
		}
		spin_unlock_bh(&bdi_lock);

		/* Keep working if default bdi still has things to do */
		if (!list_empty(&me->bdi->work_list))
			__set_current_state(TASK_RUNNING);

		switch (action) {
		case FORK_THREAD:
			__set_current_state(TASK_RUNNING);
			task = kthread_create(bdi_writeback_thread, &bdi->wb,
					      "flush-%s", dev_name(bdi->dev));
			if (IS_ERR(task)) {
				/*
				 * If thread creation fails, force writeout of
				 * the bdi from the forker thread. Hopefully
				 * 1024 is large enough for efficient IO.
				 */
				writeback_inodes_wb(&bdi->wb, 1024,
						    WB_REASON_FORKER_THREAD);
			} else {
				/*
				 * The spinlock makes sure we do not lose
				 * wake-ups when racing with 'bdi_queue_work()'.
				 * And as soon as the bdi thread is visible, we
				 * can start it.
				 */
				spin_lock_bh(&bdi->wb_lock);
				bdi->wb.task = task;
				spin_unlock_bh(&bdi->wb_lock);
				wake_up_process(task);
			}
			bdi_clear_pending(bdi);
			break;

		case KILL_THREAD:
			__set_current_state(TASK_RUNNING);
			kthread_stop(task);
			bdi_clear_pending(bdi);
			break;

		case NO_ACTION:
			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
				/*
				 * There is no dirty data. The only thing we
				 * should now care about is checking for
				 * inactive bdi threads and killing them. Thus,
				 * let's sleep for a longer time, save energy
				 * and be friendly to battery-driven devices.
				 */
				schedule_timeout(bdi_longest_inactive());
			else
				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
			try_to_freeze();
			break;
		}
	}

	return 0;
}
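
/*
 * In short, each pass of the forker loop above does one of three things:
 * flush work queued against the default bdi, fork a "flush-<device>" thread
 * for a bdi that has dirty IO but no thread, or kill a thread that has been
 * idle longer than bdi_longest_inactive(); otherwise it sleeps until the next
 * writeback interval (or longer when nothing is dirty).
 */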

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

int bdi_register(struct backing_dev_info *bdi, struct device *parent,
		const char *fmt, ...)
{
	va_list args;
	struct device *dev;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	va_start(args, fmt);
	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
	va_end(args);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	bdi->dev = dev;

	/*
	 * Just start the forker thread for our default backing_dev_info,
	 * and add other bdi's to the list. They will get a thread created
	 * on-demand when they need it.
	 */
	if (bdi_cap_flush_forker(bdi)) {
		struct bdi_writeback *wb = &bdi->wb;

		wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
						dev_name(dev));
		if (IS_ERR(wb->task))
			return PTR_ERR(wb->task);
	}

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(BDI_registered, &bdi->state);

	spin_lock_bh(&bdi_lock);
	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}
EXPORT_SYMBOL(bdi_register);

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}
EXPORT_SYMBOL(bdi_register_dev);
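
/*
 * Example (illustrative): the block layer registers a queue's bdi under the
 * disk's device number, e.g. from the add_disk() path:
 *
 *	bdi_register_dev(&disk->queue->backing_dev_info, disk_devt(disk));
 *
 * which is what produces names such as "8:0" under /sys/class/bdi/.
 */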

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
	struct task_struct *task;

	if (!bdi_cap_writeback_dirty(bdi))
		return;

	/*
	 * Make sure nobody finds us on the bdi_list anymore
	 */
	bdi_remove_from_list(bdi);

	/*
	 * If setup is pending, wait for that to complete first
	 */
	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
			TASK_UNINTERRUPTIBLE);

	/*
	 * Finally, kill the kernel thread. We don't need to be RCU
	 * safe anymore, since the bdi is gone from visibility.
	 */
	spin_lock_bh(&bdi->wb_lock);
	task = bdi->wb.task;
	bdi->wb.task = NULL;
	spin_unlock_bh(&bdi->wb_lock);

	if (task)
		kthread_stop(task);
}

/*
 * This bdi is going away now, make sure that no super_blocks point to it
 */
static void bdi_prune_sb(struct backing_dev_info *bdi)
{
	struct super_block *sb;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (sb->s_bdi == bdi)
			sb->s_bdi = &default_backing_dev_info;
	}
	spin_unlock(&sb_lock);
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	struct device *dev = bdi->dev;

	if (dev) {
		bdi_set_min_ratio(bdi, 0);
		trace_writeback_bdi_unregister(bdi);
		bdi_prune_sb(bdi);
		del_timer_sync(&bdi->wb.wakeup_timer);

		if (!bdi_cap_flush_forker(bdi))
			bdi_wb_shutdown(bdi);
		bdi_debug_unregister(bdi);

		spin_lock_bh(&bdi->wb_lock);
		bdi->dev = NULL;
		spin_unlock_bh(&bdi->wb_lock);

		device_unregister(dev);
	}
}
EXPORT_SYMBOL(bdi_unregister);

static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	spin_lock_init(&wb->list_lock);
	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))

int bdi_init(struct backing_dev_info *bdi)
{
	int i, err;

	bdi->dev = NULL;

	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	spin_lock_init(&bdi->wb_lock);
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->work_list);

	bdi_wb_init(&bdi->wb, bdi);

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
		if (err)
			goto err;
	}

	bdi->dirty_exceeded = 0;

	bdi->bw_time_stamp = jiffies;
	bdi->written_stamp = 0;

	bdi->balanced_dirty_ratelimit = INIT_BW;
	bdi->dirty_ratelimit = INIT_BW;
	bdi->write_bandwidth = INIT_BW;
	bdi->avg_write_bandwidth = INIT_BW;

	err = fprop_local_init_percpu(&bdi->completions);

	if (err) {
err:
		while (i--)
			percpu_counter_destroy(&bdi->bdi_stat[i]);
	}

	return err;
}
EXPORT_SYMBOL(bdi_init);

void bdi_destroy(struct backing_dev_info *bdi)
{
	int i;

	/*
	 * Splice our entries to the default_backing_dev_info, if this
	 * bdi disappears
	 */
	if (bdi_has_dirty_io(bdi)) {
		struct bdi_writeback *dst = &default_backing_dev_info.wb;

		bdi_lock_two(&bdi->wb, dst);
		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
		list_splice(&bdi->wb.b_io, &dst->b_io);
		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
		spin_unlock(&bdi->wb.list_lock);
		spin_unlock(&dst->list_lock);
	}

	bdi_unregister(bdi);

	/*
	 * If bdi_unregister() had already been called earlier, the
	 * wakeup_timer could still be armed because bdi_prune_sb()
	 * can race with the bdi_wakeup_thread_delayed() calls from
	 * __mark_inode_dirty().
	 */
	del_timer_sync(&bdi->wb.wakeup_timer);

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
		percpu_counter_destroy(&bdi->bdi_stat[i]);

	fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

/*
 * For use from filesystems to quickly init and register a bdi associated
 * with dirty writeback
 */
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
			   unsigned int cap)
{
	char tmp[32];
	int err;

	bdi->name = name;
	bdi->capabilities = cap;
	err = bdi_init(bdi);
	if (err)
		return err;

	sprintf(tmp, "%.28s%s", name, "-%d");
	err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
	if (err) {
		bdi_destroy(bdi);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL(bdi_setup_and_register);
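
/*
 * Example (illustrative): a filesystem embedding a backing_dev_info in its
 * per-mount private data might do something like the following at mount time
 * ("sbi" and "examplefs" are made-up names):
 *
 *	err = bdi_setup_and_register(&sbi->bdi, "examplefs", BDI_CAP_MAP_COPY);
 *	if (err)
 *		return err;
 *	sb->s_bdi = &sbi->bdi;
 *
 * bdi_destroy() undoes this on unmount.
 */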

static wait_queue_head_t congestion_wqh[2] = {
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
	};
static atomic_t nr_bdi_congested[2];

void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum bdi_state bit;
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	bit = sync ? BDI_sync_congested : BDI_async_congested;
	if (test_and_clear_bit(bit, &bdi->state))
		atomic_dec(&nr_bdi_congested[sync]);
	smp_mb__after_clear_bit();
	if (waitqueue_active(wqh))
		wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum bdi_state bit;

	bit = sync ? BDI_sync_congested : BDI_async_congested;
	if (!test_and_set_bit(bit, &bdi->state))
		atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);
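
/*
 * These two are normally not called directly by filesystems; the block layer
 * wraps them (e.g. blk_set_queue_congested()/blk_clear_queue_congested()) and
 * flips the bits as a request queue fills up and drains, which is what
 * congestion_wait() and wait_iff_congested() below key off.
 */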

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(congestion_wait);
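
/*
 * Example (illustrative): writeback and reclaim paths typically throttle
 * themselves with a short, bounded sleep when every queue is backed up:
 *
 *	congestion_wait(BLK_RW_ASYNC, HZ/10);
 *
 * i.e. sleep for up to 100ms, or until some backing device signals that it
 * has become uncongested, whichever comes first.
 */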

/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
 * @zone: A zone to check if it is heavily congested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * If a backing_dev (any backing_dev) is congested and the given @zone has
 * experienced recent congestion, this waits for up to @timeout jiffies for
 * either a BDI to exit congestion of the given @sync queue or a write to
 * complete.
 *
 * In the absence of zone congestion, cond_resched() is called to yield
 * the processor if necessary but otherwise does not sleep.
 *
 * The return value is 0 if the sleep is for the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(struct zone *zone, int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	/*
	 * If there is no congestion, or heavy congestion is not being
	 * encountered in the current zone, yield if necessary instead
	 * of sleeping on the congestion queue
	 */
	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
			!zone_is_reclaim_congested(zone)) {
		cond_resched();

		/* In case we scheduled, work out time remaining */
		ret = timeout - (jiffies - start);
		if (ret < 0)
			ret = 0;

		goto out;
	}

	/* Sleep until uncongested or a write happens */
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

out:
	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(wait_iff_congested);
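
/*
 * Example (illustrative): direct reclaim uses this instead of a plain
 * congestion_wait() so that it only sleeps when the zone it is scanning has
 * actually seen recent congestion:
 *
 *	wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 */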

int pdflush_proc_obsolete(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char kbuf[] = "0\n";

	if (*ppos) {
		*lenp = 0;
		return 0;
	}

	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
		return -EFAULT;
	printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
			table->procname);

	*lenp = 2;
	*ppos += *lenp;
	return 2;
}
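
/*
 * pdflush_proc_obsolete() is meant to be wired up as the proc handler of the
 * old pdflush-era sysctl (nr_pdflush_threads): reads always return "0" and a
 * one-time warning is printed, keeping old scripts working while the knob is
 * phased out.
 */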