#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

struct backing_dev_info default_backing_dev_info = {
	.name		= "default",
	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
	.state		= 0,
	.capabilities	= BDI_CAP_MAP_COPY,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

struct backing_dev_info noop_backing_dev_info = {
	.name		= "noop",
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;

/*
 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
 * locking.
 */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);

static struct task_struct *sync_supers_tsk;
static struct timer_list sync_supers_timer;

static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);

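/*
 * Take the list_locks of two writeback structures in a fixed (by address)
 * order, so that two CPUs locking the same pair can never deadlock.
 */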
void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
{
	if (wb1 < wb2) {
		spin_lock(&wb1->list_lock);
		spin_lock_nested(&wb2->list_lock, 1);
	} else {
		spin_lock(&wb2->list_lock);
		spin_lock_nested(&wb1->list_lock, 1);
	}
}

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	struct bdi_writeback *wb = &bdi->wb;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long bdi_thresh;
	unsigned long nr_dirty, nr_io, nr_more_io;
	struct inode *inode;

	nr_dirty = nr_io = nr_more_io = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
		nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_wb_list)
		nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
		nr_more_io++;
	spin_unlock(&wb->list_lock);

	global_dirty_limits(&background_thresh, &dirty_thresh);
	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

#define K(x) ((x) << (PAGE_SHIFT - 10))
	seq_printf(m,
		   "BdiWriteback:     %8lu kB\n"
		   "BdiReclaimable:   %8lu kB\n"
		   "BdiDirtyThresh:   %8lu kB\n"
		   "DirtyThresh:      %8lu kB\n"
		   "BackgroundThresh: %8lu kB\n"
		   "BdiWritten:       %8lu kB\n"
		   "b_dirty:          %8lu\n"
		   "b_io:             %8lu\n"
		   "b_more_io:        %8lu\n"
		   "bdi_list:         %8u\n"
		   "state:            %8lx\n",
		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
		   K(bdi_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
		   nr_dirty,
		   nr_io,
		   nr_more_io,
		   !list_empty(&bdi->bdi_list), bdi->state);
#undef K

	return 0;
}

static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
	.open		= bdi_debug_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
	bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
					       bdi, &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove(bdi->debug_stats);
	debugfs_remove(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

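/*
 * sysfs store handler for the per-bdi "read_ahead_kb" attribute: parse a
 * size in kilobytes and convert it to a page count for bdi->ra_pages.
 */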
static ssize_t read_ahead_kb_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	char *end;
	unsigned long read_ahead_kb;
	ssize_t ret = -EINVAL;

	read_ahead_kb = simple_strtoul(buf, &end, 10);
	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
		bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
		ret = count;
	}
	return ret;
}

#define K(pages) ((pages) << (PAGE_SHIFT - 10))

#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *page)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);	\
}

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

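/*
 * sysfs store handler for "min_ratio"; parsing happens here, validation and
 * the actual update are done by bdi_set_min_ratio().
 */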
static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	char *end;
	unsigned int ratio;
	ssize_t ret = -EINVAL;

	ratio = simple_strtoul(buf, &end, 10);
	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
		ret = bdi_set_min_ratio(bdi, ratio);
		if (!ret)
			ret = count;
	}
	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

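/*
 * sysfs store handler for "max_ratio"; the counterpart of min_ratio_store(),
 * delegating to bdi_set_max_ratio().
 */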
static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	char *end;
	unsigned int ratio;
	ssize_t ret = -EINVAL;

	ratio = simple_strtoul(buf, &end, 10);
	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
		ret = bdi_set_max_ratio(bdi, ratio);
		if (!ret)
			ret = count;
	}
	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)

static struct device_attribute bdi_dev_attrs[] = {
	__ATTR_RW(read_ahead_kb),
	__ATTR_RW(min_ratio),
	__ATTR_RW(max_ratio),
	__ATTR_NULL,
};

static __init int bdi_class_init(void)
{
	bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(bdi_class))
		return PTR_ERR(bdi_class);

	bdi_class->dev_attrs = bdi_dev_attrs;
	bdi_debug_init();
	return 0;
}
postcore_initcall(bdi_class_init);

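/*
 * Boot-time setup: start the sync_supers thread and its timer, then
 * initialize and register the default and noop bdis.
 */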
static int __init default_bdi_init(void)
{
	int err;

	sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
	BUG_ON(IS_ERR(sync_supers_tsk));

	setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
	bdi_arm_supers_timer();

	err = bdi_init(&default_backing_dev_info);
	if (!err)
		bdi_register(&default_backing_dev_info, NULL, "default");
	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);

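/*
 * Report whether the bdi's writeback structure has any inodes on its
 * b_dirty, b_io or b_more_io lists.
 */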
int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
	return wb_has_dirty_io(&bdi->wb);
}

/*
 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
 * or we risk deadlocking on ->s_umount. The longer term solution would be
 * to implement sync_supers_bdi() or similar and simply do it from the
 * bdi writeback thread individually.
 */
static int bdi_sync_supers(void *unused)
{
	set_user_nice(current, 0);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();

		/*
		 * Do this periodically, like kupdated() did before.
		 */
		sync_supers();
	}

	return 0;
}

void bdi_arm_supers_timer(void)
{
	unsigned long next;

	if (!dirty_writeback_interval)
		return;

	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
	mod_timer(&sync_supers_timer, round_jiffies_up(next));
}

static void sync_supers_timer_fn(unsigned long unused)
{
	wake_up_process(sync_supers_tsk);
	bdi_arm_supers_timer();
}

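/*
 * Per-bdi wakeup timer: if the bdi still has a flusher thread, kick it;
 * otherwise wake the forker thread so that a new flusher gets created.
 */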
static void wakeup_timer_fn(unsigned long data)
{
	struct backing_dev_info *bdi = (struct backing_dev_info *)data;

	spin_lock_bh(&bdi->wb_lock);
	if (bdi->wb.task) {
		trace_writeback_wake_thread(bdi);
		wake_up_process(bdi->wb.task);
	} else {
		/*
		 * When bdi tasks are inactive for a long time, they are
		 * killed. In this case we have to wake up the forker thread
		 * which should create and run the bdi thread.
		 */
		trace_writeback_wake_forker_thread(bdi);
		wake_up_process(default_backing_dev_info.wb.task);
	}
	spin_unlock_bh(&bdi->wb_lock);
}

/*
 * This function is used when the first inode for this bdi is marked dirty. It
 * wakes up the corresponding bdi thread which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}

/*
 * Calculate the longest interval (jiffies) bdi threads are allowed to be
 * inactive.
 */
static unsigned long bdi_longest_inactive(void)
{
	unsigned long interval;

	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
	return max(5UL * 60 * HZ, interval);
}

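/*
 * The forker thread runs on behalf of the default bdi: it creates per-bdi
 * flusher threads on demand when a bdi accumulates dirty data or queued
 * work, and kills threads that have been idle longer than
 * bdi_longest_inactive().
 */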
static int bdi_forker_thread(void *ptr)
{
	struct bdi_writeback *me = ptr;

	current->flags |= PF_SWAPWRITE;
	set_freezable();

	/*
	 * Our parent may run at a different priority, just set us to normal
	 */
	set_user_nice(current, 0);

	for (;;) {
		struct task_struct *task = NULL;
		struct backing_dev_info *bdi;
		enum {
			NO_ACTION,   /* Nothing to do */
			FORK_THREAD, /* Fork bdi thread */
			KILL_THREAD, /* Kill inactive bdi thread */
		} action = NO_ACTION;

		/*
		 * Temporary measure: we want to make sure we don't see
		 * dirty data on the default backing_dev_info.
		 */
		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
			del_timer(&me->wakeup_timer);
			wb_do_writeback(me, 0);
		}

		spin_lock_bh(&bdi_lock);
		set_current_state(TASK_INTERRUPTIBLE);

		list_for_each_entry(bdi, &bdi_list, bdi_list) {
			bool have_dirty_io;

			if (!bdi_cap_writeback_dirty(bdi) ||
			     bdi_cap_flush_forker(bdi))
				continue;

			WARN(!test_bit(BDI_registered, &bdi->state),
			     "bdi %p/%s is not registered!\n", bdi, bdi->name);

			have_dirty_io = !list_empty(&bdi->work_list) ||
					wb_has_dirty_io(&bdi->wb);

			/*
			 * If the bdi has work to do, but the thread does not
			 * exist - create it.
			 */
			if (!bdi->wb.task && have_dirty_io) {
				/*
				 * Set the pending bit - if someone tries to
				 * unregister this bdi, it will wait on this bit.
				 */
				set_bit(BDI_pending, &bdi->state);
				action = FORK_THREAD;
				break;
			}

			spin_lock(&bdi->wb_lock);

			/*
			 * If there is no work to do and the bdi thread was
			 * inactive long enough - kill it. The wb_lock is taken
			 * to make sure no-one adds more work to this bdi and
			 * wakes the bdi thread up.
			 */
			if (bdi->wb.task && !have_dirty_io &&
			    time_after(jiffies, bdi->wb.last_active +
						bdi_longest_inactive())) {
				task = bdi->wb.task;
				bdi->wb.task = NULL;
				spin_unlock(&bdi->wb_lock);
				set_bit(BDI_pending, &bdi->state);
				action = KILL_THREAD;
				break;
			}
			spin_unlock(&bdi->wb_lock);
		}
		spin_unlock_bh(&bdi_lock);

		/* Keep working if default bdi still has things to do */
		if (!list_empty(&me->bdi->work_list))
			__set_current_state(TASK_RUNNING);

		switch (action) {
		case FORK_THREAD:
			__set_current_state(TASK_RUNNING);
			task = kthread_create(bdi_writeback_thread, &bdi->wb,
					      "flush-%s", dev_name(bdi->dev));
			if (IS_ERR(task)) {
				/*
				 * If thread creation fails, force writeout of
				 * the bdi from the thread. Hopefully 1024 is
				 * large enough for efficient IO.
				 */
				writeback_inodes_wb(&bdi->wb, 1024);
			} else {
				/*
				 * The spinlock makes sure we do not lose
				 * wake-ups when racing with 'bdi_queue_work()'.
				 * And as soon as the bdi thread is visible, we
				 * can start it.
				 */
				spin_lock_bh(&bdi->wb_lock);
				bdi->wb.task = task;
				spin_unlock_bh(&bdi->wb_lock);
				wake_up_process(task);
			}
			break;

		case KILL_THREAD:
			__set_current_state(TASK_RUNNING);
			kthread_stop(task);
			break;

		case NO_ACTION:
			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
				/*
				 * There is no dirty data. The only thing we
				 * should now care about is checking for
				 * inactive bdi threads and killing them. Thus,
				 * let's sleep for a longer time, save energy
				 * and be friendly to battery-driven devices.
				 */
				schedule_timeout(bdi_longest_inactive());
			else
				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
			try_to_freeze();
			/* Back to the main loop */
			continue;
		}

		/*
		 * Clear the pending bit and wake up anybody waiting to tear us
		 * down.
		 */
		clear_bit(BDI_pending, &bdi->state);
		smp_mb__after_clear_bit();
		wake_up_bit(&bdi->state, BDI_pending);
	}

	return 0;
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu();
}

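/*
 * Create the device for a bdi, hook it up in sysfs and make it visible on
 * bdi_list. The default bdi additionally gets the forker thread started
 * here; all other bdis get a flusher thread created on demand later.
 */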
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
		const char *fmt, ...)
{
	va_list args;
	struct device *dev;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	va_start(args, fmt);
	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
	va_end(args);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	bdi->dev = dev;

	/*
	 * Just start the forker thread for our default backing_dev_info,
	 * and add other bdi's to the list. They will get a thread created
	 * on-demand when they need it.
	 */
	if (bdi_cap_flush_forker(bdi)) {
		struct bdi_writeback *wb = &bdi->wb;

		wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
						dev_name(dev));
		if (IS_ERR(wb->task))
			return PTR_ERR(wb->task);
	}

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(BDI_registered, &bdi->state);

	spin_lock_bh(&bdi_lock);
	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}
EXPORT_SYMBOL(bdi_register);

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}
EXPORT_SYMBOL(bdi_register_dev);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
	if (!bdi_cap_writeback_dirty(bdi))
		return;

	/*
	 * Make sure nobody finds us on the bdi_list anymore
	 */
	bdi_remove_from_list(bdi);

	/*
	 * If setup is pending, wait for that to complete first
	 */
	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
			TASK_UNINTERRUPTIBLE);

	/*
	 * Finally, kill the kernel thread. We don't need to be RCU
	 * safe anymore, since the bdi is gone from visibility. Force
	 * unfreeze of the thread before calling kthread_stop(), otherwise
	 * it would never exit if it is currently stuck in the refrigerator.
	 */
	if (bdi->wb.task) {
		thaw_process(bdi->wb.task);
		kthread_stop(bdi->wb.task);
	}
}

/*
 * This bdi is going away now, make sure that no super_blocks point to it
 */
static void bdi_prune_sb(struct backing_dev_info *bdi)
{
	struct super_block *sb;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (sb->s_bdi == bdi)
			sb->s_bdi = &default_backing_dev_info;
	}
	spin_unlock(&sb_lock);
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	if (bdi->dev) {
		trace_writeback_bdi_unregister(bdi);
		bdi_prune_sb(bdi);
		del_timer_sync(&bdi->wb.wakeup_timer);

		if (!bdi_cap_flush_forker(bdi))
			bdi_wb_shutdown(bdi);
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}
}
EXPORT_SYMBOL(bdi_unregister);

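/*
 * Initialize the writeback structure embedded in a bdi: empty inode lists,
 * no flusher thread yet, and a wakeup timer that is set up but not armed.
 */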
static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	spin_lock_init(&wb->list_lock);
	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))

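/*
 * Initialize a backing_dev_info: ratios, lists, the embedded writeback
 * structure, the per-cpu bdi stats and the completion proportions.
 * Returns 0 on success or a negative errno if a per-cpu allocation fails.
 */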
int bdi_init(struct backing_dev_info *bdi)
{
	int i, err;

	bdi->dev = NULL;

	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = PROP_FRAC_BASE;
	spin_lock_init(&bdi->wb_lock);
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->work_list);

	bdi_wb_init(&bdi->wb, bdi);

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
		if (err)
			goto err;
	}

	bdi->dirty_exceeded = 0;

	bdi->bw_time_stamp = jiffies;
	bdi->written_stamp = 0;

	bdi->write_bandwidth = INIT_BW;
	bdi->avg_write_bandwidth = INIT_BW;

	err = prop_local_init_percpu(&bdi->completions);

	if (err) {
err:
		while (i--)
			percpu_counter_destroy(&bdi->bdi_stat[i]);
	}

	return err;
}
EXPORT_SYMBOL(bdi_init);

void bdi_destroy(struct backing_dev_info *bdi)
{
	int i;

	/*
	 * Splice our entries to the default_backing_dev_info, if this
	 * bdi disappears
	 */
	if (bdi_has_dirty_io(bdi)) {
		struct bdi_writeback *dst = &default_backing_dev_info.wb;

		bdi_lock_two(&bdi->wb, dst);
		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
		list_splice(&bdi->wb.b_io, &dst->b_io);
		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
		spin_unlock(&bdi->wb.list_lock);
		spin_unlock(&dst->list_lock);
	}

	bdi_unregister(bdi);

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
		percpu_counter_destroy(&bdi->bdi_stat[i]);

	prop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

/*
 * For use from filesystems to quickly init and register a bdi associated
 * with dirty writeback
 */
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
			   unsigned int cap)
{
	char tmp[32];
	int err;

	bdi->name = name;
	bdi->capabilities = cap;
	err = bdi_init(bdi);
	if (err)
		return err;

	sprintf(tmp, "%.28s%s", name, "-%d");
	err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
	if (err) {
		bdi_destroy(bdi);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL(bdi_setup_and_register);
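
/*
 * Illustrative usage sketch (not from this file; 'myfs' and 'sbi' are
 * hypothetical names): a filesystem typically embeds a backing_dev_info in
 * its per-mount data and, at mount time, does something like
 *
 *	err = bdi_setup_and_register(&sbi->bdi, "myfs", BDI_CAP_MAP_COPY);
 *	if (err)
 *		return err;
 *	sb->s_bdi = &sbi->bdi;
 *
 * pairing it with bdi_destroy(&sbi->bdi) when the superblock is torn down.
 */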

static wait_queue_head_t congestion_wqh[2] = {
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
	};
static atomic_t nr_bdi_congested[2];

void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum bdi_state bit;
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	bit = sync ? BDI_sync_congested : BDI_async_congested;
	if (test_and_clear_bit(bit, &bdi->state))
		atomic_dec(&nr_bdi_congested[sync]);
	smp_mb__after_clear_bit();
	if (waitqueue_active(wqh))
		wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

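/*
 * Mark the bdi's sync or async queue congested and bump the global
 * nr_bdi_congested[] counter that wait_iff_congested() consults.
 */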
void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum bdi_state bit;

	bit = sync ? BDI_sync_congested : BDI_async_congested;
	if (!test_and_set_bit(bit, &bdi->state))
		atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(congestion_wait);
/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
 * @zone: A zone to check if it is heavily congested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * In the event of a congested backing_dev (any backing_dev) and the given
 * @zone has experienced recent congestion, this waits for up to @timeout
 * jiffies for either a BDI to exit congestion of the given @sync queue
 * or a write to complete.
 *
 * In the absence of zone congestion, cond_resched() is called to yield
 * the processor if necessary but otherwise does not sleep.
 *
 * The return value is 0 if the sleep is for the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(struct zone *zone, int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	/*
	 * If there is no congestion, or heavy congestion is not being
	 * encountered in the current zone, yield if necessary instead
	 * of sleeping on the congestion queue
	 */
	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
			!zone_is_reclaim_congested(zone)) {
		cond_resched();

		/* In case we scheduled, work out time remaining */
		ret = timeout - (jiffies - start);
		if (ret < 0)
			ret = 0;

		goto out;
	}

	/* Sleep until uncongested or a write happens */
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

out:
	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(wait_iff_congested);