// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
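
/*
 * Illustrative sketch only (not code used by the driver): the policy the
 * comment above describes works out to roughly
 *
 *	read_errors >>= hours_since_last_read_error;	// halve per idle hour
 *	if (++read_errors > MD_DEFAULT_MAX_CORRECTED_READ_ERRORS)
 *		// the personality ejects the rdev from the array
 *
 * so a device with only occasional corrected errors is never ejected,
 * while one that keeps failing faster than the decay is kicked out.
 * "hours_since_last_read_error" is a hypothetical name used here only
 * for illustration.
 */
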
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}
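
/*
 * Illustrative usage (administration from userspace, not part of this
 * file; "md0" is just an example array name).  The limits are in KB/sec
 * and can be set globally or per array, e.g.:
 *
 *	echo 50000  > /proc/sys/dev/raid/speed_limit_min
 *	echo 100000 > /sys/block/md0/md/sync_speed_max
 *
 * speed_min()/speed_max() above return the per-array value when one has
 * been set and fall back to the global sysctl default otherwise.
 */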
static int rdev_init_wb(struct md_rdev *rdev)
{
	if (rdev->bdev->bd_queue->nr_hw_queues == 1)
		return 0;

	spin_lock_init(&rdev->wb_list_lock);
	INIT_LIST_HEAD(&rdev->wb_list);
	init_waitqueue_head(&rdev->wb_io_wait);
	set_bit(WBCollisionCheck, &rdev->flags);

	return 1;
}

/*
 * Create wb_info_pool if rdev is the first multi-queue device flagged
 * with writemostly and write-behind mode is enabled.
 */
void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
			  bool is_suspend)
{
	if (mddev->bitmap_info.max_write_behind == 0)
		return;

	if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
		return;

	if (mddev->wb_info_pool == NULL) {
		unsigned int noio_flag;

		if (!is_suspend)
			mddev_suspend(mddev);
		noio_flag = memalloc_noio_save();
		mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
							sizeof(struct wb_info));
		memalloc_noio_restore(noio_flag);
		if (!mddev->wb_info_pool)
			pr_err("can't alloc memory pool for writemostly\n");
		if (!is_suspend)
			mddev_resume(mddev);
	}
}
EXPORT_SYMBOL_GPL(mddev_create_wb_pool);

/*
 * Destroy wb_info_pool if rdev is the last device flagged with WBCollisionCheck.
 */
static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
		return;

	if (mddev->wb_info_pool) {
		struct md_rdev *temp;
		int num = 0;

		/*
		 * Check if other rdevs need wb_info_pool.
		 */
		rdev_for_each(temp, mddev)
			if (temp != rdev &&
			    test_bit(WBCollisionCheck, &temp->flags))
				num++;
		if (!num) {
			mddev_suspend(rdev->mddev);
			mempool_destroy(mddev->wb_info_pool);
			mddev->wb_info_pool = NULL;
			mddev_resume(rdev->mddev);
		}
	}
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{  }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it.  This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
			    struct mddev *mddev)
{
	if (!mddev || !bioset_initialized(&mddev->bio_set))
		return bio_alloc(gfp_mask, nr_iovecs);

	return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

static struct bio *md_bio_alloc_sync(struct mddev *mddev)
{
	if (!mddev || !bioset_initialized(&mddev->sync_set))
		return bio_alloc(GFP_NOIO, 1);

	return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
}

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
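
/*
 * Illustrative sketch (userspace side, not part of this file): a monitor
 * such as "mdadm --monitor" can open /proc/mdstat and sleep in poll() or
 * select(); the file signals an exceptional condition when md_event_count
 * has changed, so the monitor only re-reads /proc/mdstat after one of the
 * events listed above has actually happened.
 */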

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
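
/*
 * Illustrative use of the iterator above (a sketch mirroring how the rest
 * of this file uses it):
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp)
 *		pr_debug("considering %s\n", mdname(mddev));
 *
 * A reference is held on the current mddev while the loop body runs and is
 * dropped on the next iteration; code that breaks out early keeps that
 * reference and must call mddev_put() itself, as the comment above says.
 */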

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (mddev->suspended)
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (mddev->suspend_lo >= mddev->suspend_hi)
		return false;
	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
		return false;
	if (bio_end_sector(bio) < mddev->suspend_lo)
		return false;
	return true;
}

void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	rcu_read_lock();
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	if (!mddev->pers->make_request(mddev, bio)) {
		atomic_dec(&mddev->active_io);
		wake_up(&mddev->sb_wait);
		goto check_suspended;
	}

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}
EXPORT_SYMBOL(md_handle_request);

static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	const int sgrp = op_stat_group(bio_op(bio));
	struct mddev *mddev = q->queuedata;
	unsigned int sectors;

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}

	blk_queue_split(q, &bio);

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);

	part_stat_lock();
	part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
	part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
	part_stat_unlock();

	return BLK_QC_T_NONE;
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (mddev->suspended++)
		return;
	synchronize_rcu();
	wake_up(&mddev->sb_wait);
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	smp_mb__after_atomic();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (--mddev->suspended)
		return;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

int mddev_congested(struct mddev *mddev, int bits)
{
	struct md_personality *pers = mddev->pers;
	int ret = 0;

	rcu_read_lock();
	if (mddev->suspended)
		ret = 1;
	else if (pers && pers->congested)
		ret = pers->congested(mddev, bits);
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	return mddev_congested(mddev, bits);
}

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bio_set_dev(bi, rdev->bdev);
			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock: other bios that have already passed the md_handle_request
	 * suspend check could be waiting for this flush_bio to clear, while the
	 * md_handle_request call below could in turn wait for those bios because
	 * of the suspend check.
	 */
	mddev->last_flush = mddev->start_flush;
	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}

/*
 * Manages consolidation of flushes and submitting any flushes needed for
 * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
 * being finished in another context.  Returns false if the flushing is
 * complete but still needs the I/O portion of the bio to be processed.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_after(mddev->last_flush, start),
			    mddev->lock);
	if (!ktime_after(mddev->last_flush, start)) {
		WARN_ON(mddev->flush_bio);
		mddev->flush_bio = bio;
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			return false;
		}
	}
	return true;
}
EXPORT_SYMBOL(md_flush_request);
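
/*
 * Illustrative caller-side sketch (how a personality's ->make_request is
 * typically expected to use the helper above):
 *
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
 *	    && md_flush_request(mddev, bio))
 *		return true;	// flush finished or being finished elsewhere
 *	// otherwise fall through and submit the data portion of the bio
 */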

static inline struct mddev *mddev_get(struct mddev *mddev)
L
Linus Torvalds 已提交
592 593 594 595 596
{
	atomic_inc(&mddev->active);
	return mddev;
}

597
static void mddev_delayed_delete(struct work_struct *ws);
598

599
static void mddev_put(struct mddev *mddev)
L
Linus Torvalds 已提交
600 601 602
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
603
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
604 605 606
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
607
		list_del_init(&mddev->all_mddevs);
608 609 610 611 612 613 614 615

		/*
		 * Call queue_work inside the spinlock so that
		 * flush_workqueue() after mddev_find will succeed in waiting
		 * for the work to be done.
		 */
		INIT_WORK(&mddev->del_work, mddev_delayed_delete);
		queue_work(md_misc_wq, &mddev->del_work);
616 617
	}
	spin_unlock(&all_mddevs_lock);
L
Linus Torvalds 已提交
618 619
}

620
static void md_safemode_timeout(struct timer_list *t);
621

622
void mddev_init(struct mddev *mddev)
623
{
624
	kobject_init(&mddev->kobj, &md_ktype);
625 626 627 628 629
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
630
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
631 632 633
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
634
	spin_lock_init(&mddev->lock);
635
	atomic_set(&mddev->flush_pending, 0);
636 637 638
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
639
	mddev->reshape_backwards = 0;
640
	mddev->last_sync_action = "none";
641 642 643 644
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
645
EXPORT_SYMBOL_GPL(mddev_init);
646

647
static struct mddev *mddev_find(dev_t unit)
L
Linus Torvalds 已提交
648
{
649
	struct mddev *mddev, *new = NULL;
L
Linus Torvalds 已提交
650

651 652 653
	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

L
Linus Torvalds 已提交
654 655
 retry:
	spin_lock(&all_mddevs_lock);
656 657 658 659 660 661 662 663 664 665 666 667

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
L
Linus Torvalds 已提交
668
			spin_unlock(&all_mddevs_lock);
669 670
			new->hold_active = UNTIL_IOCTL;
			return new;
L
Linus Torvalds 已提交
671
		}
672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}
689

690 691 692 693 694 695 696 697 698 699
			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
L
Linus Torvalds 已提交
700 701 702 703 704 705
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

706
	new = kzalloc(sizeof(*new), GFP_KERNEL);
L
Linus Torvalds 已提交
707 708 709 710 711 712 713 714 715
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

716
	mddev_init(new);
L
Linus Torvalds 已提交
717 718 719 720

	goto retry;
}

721 722
static struct attribute_group md_redundancy_group;

723
void mddev_unlock(struct mddev *mddev)
L
Linus Torvalds 已提交
724
{
725
	if (mddev->to_remove) {
726 727 728 729
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
730 731 732 733 734 735 736
		 * So set sysfs_active while the remove is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
737
		 */
738 739
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
740
		mddev->sysfs_active = 1;
741 742
		mutex_unlock(&mddev->reconfig_mutex);

N
NeilBrown 已提交
743 744 745 746 747 748 749 750 751 752
		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
753
		}
754
		mddev->sysfs_active = 0;
755 756
	} else
		mutex_unlock(&mddev->reconfig_mutex);
L
Linus Torvalds 已提交
757

C
Chris Dunlop 已提交
758 759
	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
760 761
	 */
	spin_lock(&pers_lock);
762
	md_wakeup_thread(mddev->thread);
763
	wake_up(&mddev->sb_wait);
764
	spin_unlock(&pers_lock);
L
Linus Torvalds 已提交
765
}
766
EXPORT_SYMBOL_GPL(mddev_unlock);
L
Linus Torvalds 已提交
767

768
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
769 770 771 772 773 774 775 776 777
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
778
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
779 780

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
L
Linus Torvalds 已提交
781
{
782
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
783

N
NeilBrown 已提交
784
	rdev_for_each(rdev, mddev)
L
Linus Torvalds 已提交
785 786
		if (rdev->bdev->bd_dev == dev)
			return rdev;
787

L
Linus Torvalds 已提交
788 789 790
	return NULL;
}

791
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
792 793 794 795 796 797 798 799 800
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}
801
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
802

803
static struct md_personality *find_pers(int level, char *clevel)
804
{
805
	struct md_personality *pers;
806 807
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
808
			return pers;
809 810 811
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
812 813 814
	return NULL;
}

815
/* return the offset of the super block in 512byte sectors */
816
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
L
Linus Torvalds 已提交
817
{
818
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
819
	return MD_NEW_SIZE_SECTORS(num_sectors);
L
Linus Torvalds 已提交
820 821
}

822
static int alloc_disk_sb(struct md_rdev *rdev)
L
Linus Torvalds 已提交
823 824
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
825
	if (!rdev->sb_page)
826
		return -ENOMEM;
L
Linus Torvalds 已提交
827 828 829
	return 0;
}

830
void md_rdev_clear(struct md_rdev *rdev)
L
Linus Torvalds 已提交
831 832
{
	if (rdev->sb_page) {
833
		put_page(rdev->sb_page);
L
Linus Torvalds 已提交
834 835
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
836
		rdev->sb_start = 0;
837
		rdev->sectors = 0;
L
Linus Torvalds 已提交
838
	}
839 840 841 842
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
843
	badblocks_exit(&rdev->badblocks);
L
Linus Torvalds 已提交
844
}
845
EXPORT_SYMBOL_GPL(md_rdev_clear);
L
Linus Torvalds 已提交
846

847
static void super_written(struct bio *bio)
848
{
849
	struct md_rdev *rdev = bio->bi_private;
850
	struct mddev *mddev = rdev->mddev;
851

852 853
	if (bio->bi_status) {
		pr_err("md: super_written gets error=%d\n", bio->bi_status);
854
		md_error(mddev, rdev);
855 856
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
857
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
858 859 860 861
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);
862

863 864
	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
865
	rdev_dec_pending(rdev, mddev);
N
Neil Brown 已提交
866
	bio_put(bio);
867 868
}

869
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
870 871 872 873 874 875 876 877
		   sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
878 879 880
	struct bio *bio;
	int ff = 0;

881 882 883
	if (!page)
		return;

884 885 886
	if (test_bit(Faulty, &rdev->flags))
		return;

887
	bio = md_bio_alloc_sync(mddev);
888

889 890
	atomic_inc(&rdev->nr_pending);

891
	bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
892
	bio->bi_iter.bi_sector = sector;
893 894 895
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;
896 897 898 899 900

	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		ff = MD_FAILFAST;
J
Jan Kara 已提交
901
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
902

903
	atomic_inc(&mddev->pending_writes);
904
	submit_bio(bio);
905 906
}

907
int md_super_wait(struct mddev *mddev)
908
{
T
Tejun Heo 已提交
909
	/* wait for all superblock writes that were scheduled to complete */
910
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
911
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
912 913
		return -EAGAIN;
	return 0;
914 915
}

916
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
M
Mike Christie 已提交
917
		 struct page *page, int op, int op_flags, bool metadata_op)
L
Linus Torvalds 已提交
918
{
919
	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
L
Linus Torvalds 已提交
920 921
	int ret;

922 923 924 925
	if (metadata_op && rdev->meta_bdev)
		bio_set_dev(bio, rdev->meta_bdev);
	else
		bio_set_dev(bio, rdev->bdev);
M
Mike Christie 已提交
926
	bio_set_op_attrs(bio, op, op_flags);
J
Jonathan Brassow 已提交
927
	if (metadata_op)
928
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
929 930 931
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
932
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
J
Jonathan Brassow 已提交
933
	else
934
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
L
Linus Torvalds 已提交
935
	bio_add_page(bio, page, size, 0);
936 937

	submit_bio_wait(bio);
L
Linus Torvalds 已提交
938

939
	ret = !bio->bi_status;
L
Linus Torvalds 已提交
940 941 942
	bio_put(bio);
	return ret;
}
943
EXPORT_SYMBOL_GPL(sync_page_io);
L
Linus Torvalds 已提交
944

945
static int read_disk_sb(struct md_rdev *rdev, int size)
L
Linus Torvalds 已提交
946 947
{
	char b[BDEVNAME_SIZE];
N
NeilBrown 已提交
948

L
Linus Torvalds 已提交
949 950 951
	if (rdev->sb_loaded)
		return 0;

M
Mike Christie 已提交
952
	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
L
Linus Torvalds 已提交
953 954 955 956 957
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
958 959
	pr_err("md: disabled device %s, could not read superblock.\n",
	       bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
960 961 962
	return -EINVAL;
}

963
static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
L
Linus Torvalds 已提交
964
{
965
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
A
Andre Noll 已提交
966 967 968
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
L
Linus Torvalds 已提交
969 970
}

971
static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
L
Linus Torvalds 已提交
972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

A
Andre Noll 已提交
993
	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
L
Linus Torvalds 已提交
994
abort:
995 996
	kfree(tmp1);
	kfree(tmp2);
L
Linus Torvalds 已提交
997 998 999
	return ret;
}

1000 1001 1002 1003 1004 1005
static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

1006
static unsigned int calc_sb_csum(mdp_super_t *sb)
L
Linus Torvalds 已提交
1007
{
1008 1009 1010
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
L
Linus Torvalds 已提交
1011 1012 1013 1014
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
L
Linus Torvalds 已提交
1031
	sb->sb_csum = disk_csum;
1032
#endif
L
Linus Torvalds 已提交
1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
1044
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
L
Linus Torvalds 已提交
1045 1046 1047 1048 1049 1050 1051 1052 1053
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
1054
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
L
Linus Torvalds 已提交
1055 1056 1057 1058 1059
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
1060
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
L
Linus Torvalds 已提交
1061 1062 1063 1064 1065 1066
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
1067 1068
	char		    *name;
	struct module	    *owner;
1069 1070
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
1071
					  int minor_version);
1072 1073 1074 1075
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
1076
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
1077
						sector_t num_sectors);
1078 1079
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
L
Linus Torvalds 已提交
1080 1081
};

1082 1083 1084 1085 1086 1087 1088 1089
/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
1090
int md_check_no_bitmap(struct mddev *mddev)
1091
{
1092
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1093
		return 0;
1094
	pr_warn("%s: bitmaps are not supported for %s\n",
1095 1096 1097 1098 1099
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

L
Linus Torvalds 已提交
1100
/*
1101
 * load_super for 0.90.0
L
Linus Torvalds 已提交
1102
 */
1103
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
L
Linus Torvalds 已提交
1104 1105 1106 1107 1108 1109
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
1110
	 * Calculate the position of the superblock (512byte sectors),
L
Linus Torvalds 已提交
1111 1112 1113 1114
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
1115
	rdev->sb_start = calc_dev_sboffset(rdev);
L
Linus Torvalds 已提交
1116

1117
	ret = read_disk_sb(rdev, MD_SB_BYTES);
1118 1119
	if (ret)
		return ret;
L
Linus Torvalds 已提交
1120 1121 1122 1123

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
1124
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1125 1126

	if (sb->md_magic != MD_SB_MAGIC) {
1127
		pr_warn("md: invalid raid superblock magic on %s\n", b);
L
Linus Torvalds 已提交
1128 1129 1130 1131
		goto abort;
	}

	if (sb->major_version != 0 ||
1132 1133
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
1134 1135
		pr_warn("Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version, b);
L
Linus Torvalds 已提交
1136 1137 1138 1139 1140 1141
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

1142
	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1143
		pr_warn("md: invalid superblock checksum on %s\n", b);
L
Linus Torvalds 已提交
1144 1145 1146 1147 1148
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
1149
	rdev->new_data_offset = 0;
1150
	rdev->sb_size = MD_SB_BYTES;
1151
	rdev->badblocks.shift = -1;
L
Linus Torvalds 已提交
1152 1153 1154 1155 1156 1157

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

1158
	if (!refdev) {
L
Linus Torvalds 已提交
1159
		ret = 1;
1160
	} else {
L
Linus Torvalds 已提交
1161
		__u64 ev1, ev2;
1162
		mdp_super_t *refsb = page_address(refdev->sb_page);
1163
		if (!md_uuid_equal(refsb, sb)) {
1164
			pr_warn("md: %s has different UUID to %s\n",
L
Linus Torvalds 已提交
1165 1166 1167
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
1168
		if (!md_sb_equal(refsb, sb)) {
1169 1170
			pr_warn("md: %s has same UUID but different superblock to %s\n",
				b, bdevname(refdev->bdev, b2));
L
Linus Torvalds 已提交
1171 1172 1173 1174 1175 1176
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
1177
		else
L
Linus Torvalds 已提交
1178 1179
			ret = 0;
	}
1180
	rdev->sectors = rdev->sb_start;
1181 1182 1183 1184
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
C
Christoph Hellwig 已提交
1185
	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1186
		rdev->sectors = (sector_t)(2ULL << 32) - 2;
L
Linus Torvalds 已提交
1187

1188
	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1189 1190 1191
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

L
Linus Torvalds 已提交
1192 1193 1194 1195 1196 1197 1198
 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
1199
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
1200 1201
{
	mdp_disk_t *desc;
1202
	mdp_super_t *sb = page_address(rdev->sb_page);
1203
	__u64 ev1 = md_event(sb);
L
Linus Torvalds 已提交
1204

1205
	rdev->raid_disk = -1;
1206 1207
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
1208
	clear_bit(Bitmap_sync, &rdev->flags);
1209 1210
	clear_bit(WriteMostly, &rdev->flags);

L
Linus Torvalds 已提交
1211 1212 1213 1214
	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
1215
		mddev->external = 0;
1216
		mddev->chunk_sectors = sb->chunk_size >> 9;
L
Linus Torvalds 已提交
1217 1218 1219
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
1220
		mddev->clevel[0] = 0;
L
Linus Torvalds 已提交
1221 1222
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
1223
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1224
		mddev->events = ev1;
1225
		mddev->bitmap_info.offset = 0;
1226 1227
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
1228
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1229
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1230
		mddev->reshape_backwards = 0;
L
Linus Torvalds 已提交
1231

1232 1233 1234 1235 1236
		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
1237
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1238 1239
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
1240 1241 1242 1243 1244
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
1245
			mddev->new_chunk_sectors = mddev->chunk_sectors;
1246
		}
1247 1248
		if (mddev->level == 0)
			mddev->layout = -1;
1249

L
Linus Torvalds 已提交
1250 1251 1252
		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
1253
			if (sb->events_hi == sb->cp_events_hi &&
L
Linus Torvalds 已提交
1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265
				sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;
1266 1267

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1268
		    mddev->bitmap_info.file == NULL) {
1269 1270
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
1271
			mddev->bitmap_info.space =
1272
				mddev->bitmap_info.default_space;
1273
		}
1274

1275
	} else if (mddev->pers == NULL) {
1276 1277
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
L
Linus Torvalds 已提交
1278
		++ev1;
1279 1280
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1281
			if (ev1 < mddev->events)
1282
				return -EINVAL;
1283 1284 1285 1286 1287 1288
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
1289 1290
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
1291 1292 1293 1294 1295
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
1296

L
Linus Torvalds 已提交
1297 1298 1299 1300
	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
1301
			set_bit(Faulty, &rdev->flags);
1302 1303
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
1304
			set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
1305
			rdev->raid_disk = desc->raid_disk;
1306
			rdev->saved_raid_disk = desc->raid_disk;
1307 1308 1309 1310 1311 1312 1313 1314
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
L
Linus Torvalds 已提交
1315
		}
1316 1317
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
1318 1319
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
1320
	} else /* MULTIPATH are always insync */
1321
		set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
1322 1323 1324 1325 1326 1327
	return 0;
}

/*
 * sync_super for 0.90.0
 */
1328
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
1329 1330
{
	mdp_super_t *sb;
1331
	struct md_rdev *rdev2;
L
Linus Torvalds 已提交
1332
	int next_spare = mddev->raid_disks;
1333

L
Linus Torvalds 已提交
1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346
	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

1347 1348
	rdev->sb_size = MD_SB_BYTES;

1349
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

1362
	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
L
Linus Torvalds 已提交
1363
	sb->level = mddev->level;
A
Andre Noll 已提交
1364
	sb->size = mddev->dev_sectors / 2;
L
Linus Torvalds 已提交
1365 1366
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
1367
	sb->not_persistent = 0;
1368
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
L
Linus Torvalds 已提交
1369 1370 1371 1372
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

1373 1374 1375 1376 1377 1378 1379 1380
	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
1381
		sb->new_chunk = mddev->new_chunk_sectors << 9;
1382 1383
	}
	mddev->minor_version = sb->minor_version;
L
Linus Torvalds 已提交
1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
1395
	sb->chunk_size = mddev->chunk_sectors << 9;
L
Linus Torvalds 已提交
1396

1397
	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1398 1399
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

L
Linus Torvalds 已提交
1400
	sb->disks[0].state = (1<<MD_DISK_REMOVED);
N
NeilBrown 已提交
1401
	rdev_for_each(rdev2, mddev) {
L
Linus Torvalds 已提交
1402
		mdp_disk_t *d;
1403
		int desc_nr;
1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
1417
			desc_nr = rdev2->raid_disk;
L
Linus Torvalds 已提交
1418
		else
1419
			desc_nr = next_spare++;
1420
		rdev2->desc_nr = desc_nr;
L
Linus Torvalds 已提交
1421 1422 1423 1424 1425
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
1426
		if (is_active)
L
Linus Torvalds 已提交
1427 1428 1429
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
1430
		if (test_bit(Faulty, &rdev2->flags))
L
Linus Torvalds 已提交
1431
			d->state = (1<<MD_DISK_FAULTY);
1432
		else if (is_active) {
L
Linus Torvalds 已提交
1433
			d->state = (1<<MD_DISK_ACTIVE);
1434 1435
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
L
Linus Torvalds 已提交
1436 1437 1438 1439 1440 1441 1442
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
1443 1444
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1445 1446
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
L
Linus Torvalds 已提交
1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

1469 1470 1471 1472
/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
1473
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1474
{
A
Andre Noll 已提交
1475
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1476
		return 0; /* component must fit device */
1477
	if (rdev->mddev->bitmap_info.offset)
1478
		return 0; /* can't move bitmap */
1479
	rdev->sb_start = calc_dev_sboffset(rdev);
1480 1481
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
1482 1483 1484
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
C
Christoph Hellwig 已提交
1485
	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1486
		num_sectors = (sector_t)(2ULL << 32) - 2;
1487 1488
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1489
		       rdev->sb_page);
1490
	} while (md_super_wait(rdev->mddev) < 0);
1491
	return num_sectors;
1492 1493
}

1494 1495 1496 1497 1498 1499
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}
1500

L
Linus Torvalds 已提交
1501 1502 1503 1504
/*
 * version 1 superblock
 */

1505
static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
L
Linus Torvalds 已提交
1506
{
1507 1508
	__le32 disk_csum;
	u32 csum;
L
Linus Torvalds 已提交
1509 1510
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1511
	__le32 *isuper = (__le32*)sb;
L
Linus Torvalds 已提交
1512 1513 1514 1515

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
1516
	for (; size >= 4; size -= 4)
L
Linus Torvalds 已提交
1517 1518 1519
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
1520
		newcsum += le16_to_cpu(*(__le16*) isuper);
L
Linus Torvalds 已提交
1521 1522 1523 1524 1525 1526

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

1527
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
L
Linus Torvalds 已提交
1528 1529 1530
{
	struct mdp_superblock_1 *sb;
	int ret;
1531
	sector_t sb_start;
1532
	sector_t sectors;
L
Linus Torvalds 已提交
1533
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1534
	int bmask;
L
Linus Torvalds 已提交
1535 1536

	/*
1537
	 * Calculate the position of the superblock in 512byte sectors.
L
Linus Torvalds 已提交
1538 1539 1540 1541 1542 1543 1544 1545
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
1546
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1547 1548
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
L
Linus Torvalds 已提交
1549 1550
		break;
	case 1:
1551
		sb_start = 0;
L
Linus Torvalds 已提交
1552 1553
		break;
	case 2:
1554
		sb_start = 8;
L
Linus Torvalds 已提交
1555 1556 1557 1558
		break;
	default:
		return -EINVAL;
	}
1559
	rdev->sb_start = sb_start;
L
Linus Torvalds 已提交
1560

1561 1562 1563 1564
	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
L
Linus Torvalds 已提交
1565 1566
	if (ret) return ret;

1567
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1568 1569 1570 1571

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1572
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1573
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
L
Linus Torvalds 已提交
1574 1575 1576
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1577
		pr_warn("md: invalid superblock checksum on %s\n",
L
Linus Torvalds 已提交
1578 1579 1580 1581
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
1582 1583
		pr_warn("md: data_size too small on %s\n",
			bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
1584 1585
		return -EINVAL;
	}
1586 1587 1588 1589 1590
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;
1591

L
Linus Torvalds 已提交
1592 1593
	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
1594 1595 1596 1597
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1598
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
L
Linus Torvalds 已提交
1599

1600
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1601
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1602
	if (rdev->sb_size & bmask)
1603 1604 1605
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
1606
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1607
		return -EINVAL;
1608 1609 1610
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
1611

1612 1613 1614 1615 1616
	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628
	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
1629
		__le64 *bbp;
1630 1631 1632 1633 1634 1635 1636 1637 1638
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
M
Mike Christie 已提交
1639
				  rdev->bb_page, REQ_OP_READ, 0, true))
1640
			return -EIO;
1641
		bbp = (__le64 *)page_address(rdev->bb_page);
1642 1643 1644 1645 1646 1647 1648 1649 1650
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
1651
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
1652 1653
				return -EINVAL;
		}
1654 1655
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;
1656

1657 1658
	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1659 1660 1661 1662 1663
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

1664 1665 1666 1667
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
	    sb->level != 0)
		return -EINVAL;

1668
	if (!refdev) {
1669
		ret = 1;
1670
	} else {
L
Linus Torvalds 已提交
1671
		__u64 ev1, ev2;
1672
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
L
Linus Torvalds 已提交
1673 1674 1675 1676 1677

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %s has strangely different superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

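/*
 * super_1_validate: copy state from a freshly loaded 1.x superblock into
 * the mddev (when assembling) or into the rdev (role, offsets, flags).
 */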
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;
1739

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}
1758

1759 1760 1761 1762 1763
		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
1764
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1765 1766 1767 1768 1769
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
1770 1771 1772 1773 1774
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
1775
			mddev->new_chunk_sectors = mddev->chunk_sectors;
1776 1777
		}

1778 1779 1780 1781
		if (mddev->level == 0 &&
		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
			mddev->layout = -1;

1782
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1783
			set_bit(MD_HAS_JOURNAL, &mddev->flags);
1784

1785 1786
		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1787 1788 1789
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
1790 1791 1792 1793
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
					    MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
1794 1795
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
1796
	} else if (mddev->pers == NULL) {
1797 1798
		/* Insist of good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
1800 1801
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1802 1803
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1804 1805
			if (ev1 < mddev->events)
				return -EINVAL;
1806 1807 1808 1809 1810 1811
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
1812 1813
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
1814 1815 1816 1817 1818
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
1821 1822
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1823
			role = MD_DISK_ROLE_SPARE;
1824 1825 1826
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case MD_DISK_ROLE_SPARE: /* spare */
			break;
		case MD_DISK_ROLE_FAULTY: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
1836
				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1837 1838 1839
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
			rdev->raid_disk = 0;
			break;
		default:
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else {
				/*
				 * If the array is FROZEN, then the device can't
				 * be in_sync with rest of array.
				 */
				if (!test_bit(MD_RECOVERY_FROZEN,
					      &mddev->recovery))
					set_bit(In_sync, &rdev->flags);
			}
			rdev->raid_disk = role;
			break;
		}
1863 1864
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
1865 1866
		if (sb->devflags & FailFast1)
			set_bit(FailFast, &rdev->flags);
1867 1868
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

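/*
 * super_1_sync: refresh the in-memory 1.x superblock image on @rdev so it
 * matches the current state of @mddev before it is written out.
 */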
static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1893 1894
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
1905 1906 1907 1908
	if (test_bit(FailFast, &rdev->flags))
		sb->devflags |= FailFast1;
	else
		sb->devflags &= ~FailFast1;
1909

1910 1911 1912 1913
	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
1914 1915
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);
1916

1917 1918
	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1923
	    !test_bit(In_sync, &rdev->flags)) {
1924 1925 1926 1927
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
1928 1929 1930
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	/* Note: recovery_offset and journal_tail share space  */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);
1938

1939 1940 1941 1942 1943 1944
	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
1945
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1946 1947 1948 1949
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1950 1951 1952 1953 1954 1955
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
1956
	}
1957

1958 1959 1960
	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

1961 1962 1963 1964 1965 1966 1967
	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
1968
		__le64 *bbp = (__le64 *)page_address(rdev->bb_page);
1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

retry:
			seq = read_seqbegin(&bb->lock);

			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
				u64 internal_bb = p[i];
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}

	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;
1999

2000 2001
	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
2002
		sb->max_dev = cpu_to_le32(max_dev);
2003 2004 2005 2006
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
2007 2008 2009
	} else
		max_dev = le32_to_cpu(sb->max_dev);

	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2015

2016
	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2017 2018 2019 2020 2021
		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
			sb->feature_map |=
			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
		else
			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2022 2023 2024 2025
		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
	}

	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (test_bit(Journal, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

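/*
 * super_1_rdev_size_change: compute the largest usable component size (in
 * sectors) for @rdev, update its superblock on disk and return the size,
 * or return 0 if the device cannot be resized.
 */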
static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = cpu_to_le64(rdev->sb_start);
	sb->sb_csum = calc_sb_1_csum(sb);
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;

}

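/*
 * Check whether data_offset may be moved to @new_offset without stepping
 * on 1.x metadata (superblock, bad block log, write-intent bitmap).
 */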
static int
super_1_allow_new_offset(struct md_rdev *rdev,
			 unsigned long long new_offset)
{
	/* All necessary checks on new >= old have been done */
	struct bitmap *bitmap;
	if (new_offset >= rdev->data_offset)
		return 1;

	/* with 1.0 metadata, there is no metadata to tread on
	 * so we can always move back */
	if (rdev->mddev->minor_version == 0)
		return 1;

	/* otherwise we must be sure not to step on
	 * any metadata, so stay:
	 * 36K beyond start of superblock
	 * beyond end of badblocks
	 * beyond write-intent bitmap
	 */
	if (rdev->sb_start + (32+4)*2 > new_offset)
		return 0;
	bitmap = rdev->mddev->bitmap;
	if (bitmap && !rdev->mddev->bitmap_info.file &&
	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
		return 0;
	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
		return 0;

	return 1;
}

static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
2120 2121 2122 2123
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
2124
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
2129 2130 2131 2132
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
2133
		.allow_new_offset   = super_1_allow_new_offset,
	},
};

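/*
 * Dispatch a superblock update either to a metadata handler supplied by
 * the mddev (mddev->sync_super) or to the matching super_types method.
 */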
static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
{
	if (mddev->sync_super) {
		mddev->sync_super(mddev, rdev);
		return;
	}

	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));

	super_types[mddev->major_version].sync_super(mddev, rdev);
}

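/*
 * Return 1 if the two arrays share any underlying whole device, ignoring
 * faulty, journal and unassigned members.
 */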
static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
{
	struct md_rdev *rdev, *rdev2;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev1) {
		if (test_bit(Faulty, &rdev->flags) ||
		    test_bit(Journal, &rdev->flags) ||
		    rdev->raid_disk == -1)
			continue;
		rdev_for_each_rcu(rdev2, mddev2) {
			if (test_bit(Faulty, &rdev2->flags) ||
			    test_bit(Journal, &rdev2->flags) ||
			    rdev2->raid_disk == -1)
				continue;
			if (rdev->bdev->bd_contains ==
			    rdev2->bdev->bd_contains) {
				rcu_read_unlock();
				return 1;
			}
		}
	}
	rcu_read_unlock();
	return 0;
}

static LIST_HEAD(pending_raid_disks);

/*
 * Try to register data integrity profile for an mddev
 *
 * This is called when an array is started and after a disk has been kicked
 * from the array. It only succeeds if all working and active component devices
 * are integrity capable with matching profiles.
 */
2184
int md_integrity_register(struct mddev *mddev)
2185
{
2186
	struct md_rdev *rdev, *reference = NULL;
2187 2188 2189

	if (list_empty(&mddev->disks))
		return 0; /* nothing to do */
2190 2191
	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
		return 0; /* shouldn't register, or already is */
	rdev_for_each(rdev, mddev) {
		/* skip spares and non-functional disks */
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (rdev->raid_disk < 0)
			continue;
		if (!reference) {
			/* Use the first rdev as the reference */
			reference = rdev;
			continue;
		}
		/* does this rdev's profile match the reference profile? */
		if (blk_integrity_compare(reference->bdev->bd_disk,
				rdev->bdev->bd_disk) < 0)
			return -EINVAL;
	}
2208 2209
	if (!reference || !bdev_get_integrity(reference->bdev))
		return 0;
2210 2211 2212 2213
	/*
	 * All component devices are integrity capable and have matching
	 * profiles, register the common profile for the md device.
	 */
2214 2215 2216
	blk_integrity_register(mddev->gendisk,
			       bdev_get_integrity(reference->bdev));

	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
		pr_err("md: failed to create integrity pool for %s\n",
		       mdname(mddev));
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(md_integrity_register);

2227 2228 2229 2230 2231
/*
 * Attempt to add an rdev, but only if it is consistent with the current
 * integrity profile
 */
int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	struct blk_integrity *bi_mddev;
	char name[BDEVNAME_SIZE];

	if (!mddev->gendisk)
		return 0;

	bi_mddev = blk_get_integrity(mddev->gendisk);

	if (!bi_mddev) /* nothing to do */
		return 0;

	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2245 2246
		pr_err("%s: incompatible integrity profile for %s\n",
		       mdname(mddev), bdevname(rdev->bdev, name));
2247 2248 2249 2250
		return -ENXIO;
	}

	return 0;
}
EXPORT_SYMBOL(md_integrity_add_rdev);

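/*
 * Attach @rdev to @mddev: check size and read-only constraints, assign a
 * unique desc_nr, register its kobject and sysfs links, and add it to the
 * array's device list.
 */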
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
2256
	char b[BDEVNAME_SIZE];
2257
	struct kobject *ko;
2258
	int err;

2260 2261 2262 2263
	/* prevent duplicates */
	if (find_rdev(mddev, rdev->bdev->bd_dev))
		return -EEXIST;

2264 2265 2266 2267
	if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
	    mddev->pers)
		return -EROFS;

2268
	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2269 2270 2271
	if (!test_bit(Journal, &rdev->flags) &&
	    rdev->sectors &&
	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2272 2273 2274 2275 2276 2277 2278 2279
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear)
			 */
			if (mddev->level > 0)
				return -ENOSPC;
		} else
2280
			mddev->dev_sectors = rdev->sectors;
2281
	}

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
2287
	rcu_read_lock();
	if (rdev->desc_nr < 0) {
		int choice = 0;
2290 2291
		if (mddev->pers)
			choice = mddev->raid_disks;
2292
		while (md_find_rdev_nr_rcu(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
			rcu_read_unlock();
			return -EBUSY;
		}
	}
	rcu_read_unlock();
	if (!test_bit(Journal, &rdev->flags) &&
	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2304 2305
		pr_warn("md: %s: array is limited to %d devices\n",
			mdname(mddev), mddev->max_disks);
2306 2307
		return -EBUSY;
	}
	bdevname(rdev->bdev,b);
	strreplace(b, '/', '!');

	rdev->mddev = mddev;
	pr_debug("md: bind<%s>\n", b);

	if (mddev->raid_disks)
		mddev_create_wb_pool(mddev, rdev, false);

2317
	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2318
		goto fail;
2319

	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
	if (sysfs_create_link(&rdev->kobj, ko, "block"))
		/* failure here is OK */;
	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2324

2325
	list_add_rcu(&rdev->same_set, &mddev->disks);
2326
	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2327 2328

	/* May as well allow recovery to be retried once */
2329
	mddev->recovery_disabled++;

	return 0;
2332 2333

 fail:
2334 2335
	pr_warn("md: failed to register dev-%s for %s\n",
		b, mdname(mddev));
2336
	return err;
}

static void md_delayed_delete(struct work_struct *ws)
{
	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
	kobject_del(&rdev->kobj);
	kobject_put(&rdev->kobj);
}

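/*
 * Detach @rdev from its array and defer the final kobject removal to
 * md_misc_wq to avoid deadlocking against sysfs writers (see below).
 */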
static void unbind_rdev_from_array(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
	mddev_destroy_wb_pool(rdev->mddev, rdev);
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	rdev->sysfs_state = NULL;
	rdev->badblocks.count = 0;
	/* We need to delay this, otherwise we can deadlock when
	 * writing to 'remove' to "dev/state".  We also need
	 * to delay it due to rcu usage.
	 */
	synchronize_rcu();
	INIT_WORK(&rdev->del_work, md_delayed_delete);
	kobject_get(&rdev->kobj);
	queue_work(md_misc_wq, &rdev->del_work);
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
{
	int err = 0;
	struct block_device *bdev;
	char b[BDEVNAME_SIZE];

	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				 shared ? (struct md_rdev *)lock_rdev : rdev);
	if (IS_ERR(bdev)) {
		pr_warn("md: could not open %s.\n", __bdevname(dev, b));
		return PTR_ERR(bdev);
	}
	rdev->bdev = bdev;
	return err;
}

static void unlock_rdev(struct md_rdev *rdev)
{
	struct block_device *bdev = rdev->bdev;
	rdev->bdev = NULL;
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
	md_rdev_clear(rdev);
#ifndef MODULE
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	unlock_rdev(rdev);
	kobject_put(&rdev->kobj);
}

void md_kick_rdev_from_array(struct md_rdev *rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}
EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);

static void export_array(struct mddev *mddev)
{
	struct md_rdev *rdev;

	while (!list_empty(&mddev->disks)) {
		rdev = list_first_entry(&mddev->disks, struct md_rdev,
					same_set);
		md_kick_rdev_from_array(rdev);
	}
	mddev->raid_disks = 0;
	mddev->major_version = 0;
}

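/*
 * Mark the array clean if no writes are pending; called with mddev->lock
 * held and returns the resulting in_sync state.
 */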
static bool set_in_sync(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->lock);
	if (!mddev->in_sync) {
		mddev->sync_checkers++;
		spin_unlock(&mddev->lock);
		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
		spin_lock(&mddev->lock);
		if (!mddev->in_sync &&
		    percpu_ref_is_zero(&mddev->writes_pending)) {
			mddev->in_sync = 1;
			/*
			 * Ensure ->in_sync is visible before we clear
			 * ->sync_checkers.
			 */
2448
			smp_mb();
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_state);
		}
		if (--mddev->sync_checkers == 0)
			percpu_ref_switch_to_percpu(&mddev->writes_pending);
	}
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	return mddev->in_sync;
}

2460
static void sync_sbs(struct mddev *mddev, int nospares)
{
	/* Update each superblock (in-memory image), but
	 * if we are allowed to, skip spares which already
	 * have the right event counter, or have one earlier
	 * (which would mean they aren't being marked as dirty
	 * with the rest of the array)
	 */
2468
	struct md_rdev *rdev;
	rdev_for_each(rdev, mddev) {
		if (rdev->sb_events == mddev->events ||
		    (nospares &&
		     rdev->raid_disk < 0 &&
		     rdev->sb_events+1 == mddev->events)) {
			/* Don't update this superblock */
			rdev->sb_loaded = 2;
		} else {
2477
			sync_super(mddev, rdev);
			rdev->sb_loaded = 1;
		}
	}
}

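/*
 * Compare the cached superblock of a healthy member with current mddev
 * state to decide whether an on-disk metadata update is required.
 */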
static bool does_sb_need_changing(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct mdp_superblock_1 *sb;
	int role;

	/* Find a good rdev */
	rdev_for_each(rdev, mddev)
		if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
			break;

	/* No good device found. */
	if (!rdev)
		return false;

	sb = page_address(rdev->sb_page);
	/* Check if a device has become faulty or a spare become active */
	rdev_for_each(rdev, mddev) {
		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		/* Device activated? */
		if (role == 0xffff && rdev->raid_disk >=0 &&
		    !test_bit(Faulty, &rdev->flags))
			return true;
		/* Device turned faulty? */
		if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
			return true;
	}

	/* Check if any mddev parameters have changed */
	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
	    (mddev->layout != le32_to_cpu(sb->layout)) ||
	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
		return true;

	return false;
}

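/*
 * Write out the superblocks (and bad block logs) of all member devices,
 * repeating until the update is stable; also coordinates clustered
 * metadata updates when md-cluster is in use.
 */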
void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;
	int ret = -1;

	if (mddev->ro) {
		if (force_change)
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		return;
	}

repeat:
	if (mddev_is_clustered(mddev)) {
		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
			force_change = 1;
		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
			nospares = 1;
		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else updated the sb? */
		if (!does_sb_need_changing(mddev)) {
			if (ret == 0)
				md_cluster_ops->metadata_update_cancel(mddev);
2547 2548 2549
			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
							 BIT(MD_SB_CHANGE_DEVS) |
							 BIT(MD_SB_CHANGE_CLEAN));
2550 2551 2552
			return;
		}
	}
2553

	/*
	 * First make sure individual recovery_offsets are correct
	 * curr_resync_completed can only be used during recovery.
	 * During reshape/resync it might use array-addresses rather
	 * that device addresses.
	 */
	rdev_for_each(rdev, mddev) {
2561 2562
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
2563 2564 2565
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    !test_bit(Journal, &rdev->flags) &&
2567 2568 2569 2570
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
				rdev->recovery_offset = mddev->curr_resync_completed;

2571
	}
	if (!mddev->persistent) {
		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		if (!mddev->external) {
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			rdev_for_each(rdev, mddev) {
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
2588 2589 2590 2591
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock(&mddev->lock);

	mddev->utime = ktime_get_real_seconds();

	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
		force_change = 1;
	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean<-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have a event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
		 */
2616
		nospares = 0;
2617

2618
	sync_req = mddev->in_sync;
2619 2620 2621

	/* If this is just a dirty<->clean transition, and the array is clean
	 * and 'events' is odd, we can roll back to the previous clean state */
2622
	if (nospares
2623
	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2624 2625
	    && mddev->can_decrease_events
	    && mddev->events != 1) {
2626
		mddev->events--;
2627 2628
		mddev->can_decrease_events = 0;
	} else {
2629 2630
		/* otherwise we have to go forward and ... */
		mddev->events ++;
2631
		mddev->can_decrease_events = nospares;
2632
	}

	/*
	 * This 64-bit counter should never wrap.
	 * Either we are in around ~1 trillion A.C., assuming
	 * 1 reboot per second, or we have a bug...
	 */
	WARN_ON(mddev->events == 0);
2640

	rdev_for_each(rdev, mddev) {
2642 2643
		if (rdev->badblocks.changed)
			any_badblocks_changed++;
2644 2645 2646
		if (test_bit(Faulty, &rdev->flags))
			set_bit(FaultRecorded, &rdev->flags);
	}
2647

2648
	sync_sbs(mddev, nospares);
2649
	spin_unlock(&mddev->lock);

2651 2652
	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
		 mdname(mddev), mddev->in_sync);

2654 2655
	if (mddev->queue)
		blk_add_trace_msg(mddev->queue, "md md_update_sb");
rewrite:
	md_bitmap_update_sb(mddev->bitmap);
	rdev_for_each(rdev, mddev) {
		char b[BDEVNAME_SIZE];

2661 2662
		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */

		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev,rdev,
				       rdev->sb_start, rdev->sb_size,
				       rdev->sb_page);
			pr_debug("md: (write) %s's sb offset: %llu\n",
				 bdevname(rdev->bdev, b),
				 (unsigned long long)rdev->sb_start);
			rdev->sb_events = mddev->events;
2672 2673 2674 2675 2676 2677 2678
			if (rdev->badblocks.size) {
				md_super_write(mddev, rdev,
					       rdev->badblocks.sector,
					       rdev->badblocks.size << 9,
					       rdev->bb_page);
				rdev->badblocks.size = 0;
			}
2679

2680
		} else
2681 2682
			pr_debug("md: %s (skipping faulty)\n",
				 bdevname(rdev->bdev, b));
2683

2684
		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
2688 2689
	if (md_super_wait(mddev) < 0)
		goto rewrite;
2690
	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2691

2692 2693 2694
	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->metadata_update_finish(mddev);

2695
	if (mddev->in_sync != sync_req ||
2696 2697
	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2698 2699
		/* have to write it out again */
		goto repeat;
2700
	wake_up(&mddev->sb_wait);
2701 2702
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2703

	rdev_for_each(rdev, mddev) {
2705 2706 2707 2708
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
2709
			ack_all_badblocks(&rdev->badblocks);
2710 2711 2712
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
	}
}
2714
EXPORT_SYMBOL(md_update_sb);

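/*
 * Hot-add an already bound rdev to a running array and kick off recovery.
 */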
static int add_bound_rdev(struct md_rdev *rdev)
{
	struct mddev *mddev = rdev->mddev;
	int err = 0;
	bool add_journal = test_bit(Journal, &rdev->flags);

	if (!mddev->pers->hot_remove_disk || add_journal) {
		/* If there is hot_add_disk but no hot_remove_disk
		 * then added disks for geometry changes,
		 * and should be added immediately.
		 */
		super_types[mddev->major_version].
			validate_super(mddev, rdev);
		if (add_journal)
			mddev_suspend(mddev);
		err = mddev->pers->hot_add_disk(mddev, rdev);
		if (add_journal)
			mddev_resume(mddev);
		if (err) {
2735
			md_kick_rdev_from_array(rdev);
			return err;
		}
	}
	sysfs_notify_dirent_safe(rdev->sysfs_state);

2741
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_new_event(mddev);
	md_wakeup_thread(mddev->thread);
	return 0;
}

2750
/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept with case. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* See if cmd, written into a sysfs file, matches
	 * str.  They must either be the same, or cmd can
	 * have a trailing newline
	 */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	if (*str || *cmd)
		return 0;
	return 1;
}

2770 2771
struct rdev_sysfs_entry {
	struct attribute attr;
2772 2773
	ssize_t (*show)(struct md_rdev *, char *);
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2774 2775 2776
};

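/* Report the rdev state flags as a comma-separated list for sysfs. */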
static ssize_t
state_show(struct md_rdev *rdev, char *page)
{
	char *sep = ",";
	size_t len = 0;
	unsigned long flags = READ_ONCE(rdev->flags);

	if (test_bit(Faulty, &flags) ||
	    (!test_bit(ExternalBbl, &flags) &&
	    rdev->badblocks.unacked_exist))
2786 2787 2788 2789 2790 2791 2792
		len += sprintf(page+len, "faulty%s", sep);
	if (test_bit(In_sync, &flags))
		len += sprintf(page+len, "in_sync%s", sep);
	if (test_bit(Journal, &flags))
		len += sprintf(page+len, "journal%s", sep);
	if (test_bit(WriteMostly, &flags))
		len += sprintf(page+len, "write_mostly%s", sep);
2793
	if (test_bit(Blocked, &flags) ||
2794
	    (rdev->badblocks.unacked_exist
2795 2796
	     && !test_bit(Faulty, &flags)))
		len += sprintf(page+len, "blocked%s", sep);
2797
	if (!test_bit(Faulty, &flags) &&
	    !test_bit(Journal, &flags) &&
	    !test_bit(In_sync, &flags))
		len += sprintf(page+len, "spare%s", sep);
	if (test_bit(WriteErrorSeen, &flags))
		len += sprintf(page+len, "write_error%s", sep);
	if (test_bit(WantReplacement, &flags))
		len += sprintf(page+len, "want_replacement%s", sep);
	if (test_bit(Replacement, &flags))
		len += sprintf(page+len, "replacement%s", sep);
	if (test_bit(ExternalBbl, &flags))
		len += sprintf(page+len, "external_bbl%s", sep);
2809 2810
	if (test_bit(FailFast, &flags))
		len += sprintf(page+len, "failfast%s", sep);
2811 2812 2813

	if (len)
		len -= strlen(sep);
2814

2815 2816 2817
	return len+sprintf(page+len, "\n");
}

2818
static ssize_t
2819
state_store(struct md_rdev *rdev, const char *buf, size_t len)
2820 2821
{
	/* can write
2822
	 *  faulty  - simulates an error
2823
	 *  remove  - disconnects the device
2824 2825
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
2826 2827
	 *  blocked - sets the Blocked flags
	 *  -blocked - clears the Blocked and possibly simulates an error
2828
	 *  insync - sets Insync providing device isn't active
2829 2830
	 *  -insync - clear Insync for a device with a slot assigned,
	 *            so that it gets rebuilt based on bitmap
2831 2832
	 *  write_error - sets WriteErrorSeen
	 *  -write_error - clears WriteErrorSeen
2833
	 *  {,-}failfast - set/clear FailFast
2834 2835 2836 2837
	 */
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
2838 2839 2840 2841
		if (test_bit(Faulty, &rdev->flags))
			err = 0;
		else
			err = -EBUSY;
2842
	} else if (cmd_match(buf, "remove")) {
		if (rdev->mddev->pers) {
			clear_bit(Blocked, &rdev->flags);
			remove_and_add_spares(rdev->mddev, rdev);
		}
2847 2848 2849
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
2850
			struct mddev *mddev = rdev->mddev;
2851
			err = 0;
			if (mddev_is_clustered(mddev))
				err = md_cluster_ops->remove_disk(mddev, rdev);

			if (err == 0) {
				md_kick_rdev_from_array(rdev);
2857
				if (mddev->pers) {
2858
					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2859 2860
					md_wakeup_thread(mddev->thread);
				}
2861 2862
				md_new_event(mddev);
			}
2863
		}
2864 2865
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
2866
		mddev_create_wb_pool(rdev->mddev, rdev, false);
2867 2868
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
2869
		mddev_destroy_wb_pool(rdev->mddev, rdev);
2870
		clear_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "blocked")) {
		set_bit(Blocked, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-blocked")) {
2876
		if (!test_bit(Faulty, &rdev->flags) &&
2877
		    !test_bit(ExternalBbl, &rdev->flags) &&
2878
		    rdev->badblocks.unacked_exist) {
			/* metadata handler doesn't understand badblocks,
			 * so we need to fail the device
			 */
			md_error(rdev->mddev, rdev);
		}
2884
		clear_bit(Blocked, &rdev->flags);
2885
		clear_bit(BlockedBadBlocks, &rdev->flags);
2886 2887 2888 2889
		wake_up(&rdev->blocked_wait);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);

2890 2891 2892
		err = 0;
	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
		set_bit(In_sync, &rdev->flags);
2893
		err = 0;
	} else if (cmd_match(buf, "failfast")) {
		set_bit(FailFast, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-failfast")) {
		clear_bit(FailFast, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
		   !test_bit(Journal, &rdev->flags)) {
		if (rdev->mddev->pers == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->saved_raid_disk = rdev->raid_disk;
			rdev->raid_disk = -1;
			err = 0;
		}
	} else if (cmd_match(buf, "write_error")) {
		set_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-write_error")) {
		clear_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "want_replacement")) {
		/* Any non-spare device that is not a replacement can
		 * become want_replacement at any time, but we then need to
		 * check if recovery is needed.
		 */
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Replacement, &rdev->flags))
			set_bit(WantReplacement, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
		err = 0;
	} else if (cmd_match(buf, "-want_replacement")) {
		/* Clearing 'want_replacement' is always allowed.
		 * Once replacements starts it is too late though.
		 */
		err = 0;
		clear_bit(WantReplacement, &rdev->flags);
	} else if (cmd_match(buf, "replacement")) {
		/* Can only set a device as a replacement when array has not
		 * yet been started.  Once running, replacement is automatic
		 * from spares, or by assigning 'slot'.
		 */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			set_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "-replacement")) {
		/* Similarly, can only clear Replacement before start */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			clear_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "re-add")) {
2952 2953 2954 2955
		if (!rdev->mddev->pers)
			err = -EINVAL;
		else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
				rdev->saved_raid_disk >= 0) {
			/* clear_bit is performed _after_ all the devices
			 * have their local Faulty bit cleared. If any writes
			 * happen in the meantime in the local node, they
			 * will land in the local bitmap, which will be synced
			 * by this node eventually
			 */
			if (!mddev_is_clustered(rdev->mddev) ||
			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
				clear_bit(Faulty, &rdev->flags);
				err = add_bound_rdev(rdev);
			}
		} else
			err = -EBUSY;
	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
		set_bit(ExternalBbl, &rdev->flags);
		rdev->badblocks.shift = 0;
		err = 0;
	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
		clear_bit(ExternalBbl, &rdev->flags);
		err = 0;
2976
	}
	if (!err)
		sysfs_notify_dirent_safe(rdev->sysfs_state);
2979 2980
	return err ? err : len;
}
2981
static struct rdev_sysfs_entry rdev_state =
2982
__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2983

2984
static ssize_t
2985
errors_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
}

static ssize_t
2991
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2992
{
	unsigned int n;
	int rv;

	rv = kstrtouint(buf, 10, &n);
	if (rv < 0)
		return rv;
	atomic_set(&rdev->corrected_errors, n);
	return len;
3001 3002
}
static struct rdev_sysfs_entry rdev_errors =
3003
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3004

3005
static ssize_t
3006
slot_show(struct md_rdev *rdev, char *page)
3007
{
	if (test_bit(Journal, &rdev->flags))
		return sprintf(page, "journal\n");
	else if (rdev->raid_disk < 0)
		return sprintf(page, "none\n");
	else
		return sprintf(page, "%d\n", rdev->raid_disk);
}

static ssize_t
3017
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3018
{
	int slot;
3020
	int err;

	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
3024 3025
	if (strncmp(buf, "none", 4)==0)
		slot = -1;
	else {
		err = kstrtouint(buf, 10, (unsigned int *)&slot);
		if (err < 0)
			return err;
	}
3031
	if (rdev->mddev->pers && slot == -1) {
		/* Setting 'slot' on an active array requires also
		 * updating the 'rd%d' link, and communicating
		 * with the personality with ->hot_*_disk.
		 * For now we only support removing
		 * failed/spare devices.  This normally happens automatically,
		 * but not when the metadata is externally managed.
		 */
		if (rdev->raid_disk == -1)
			return -EEXIST;
		/* personality does all needed checks */
3042
		if (rdev->mddev->pers->hot_remove_disk == NULL)
3043
			return -EINVAL;
3044 3045 3046 3047
		clear_bit(Blocked, &rdev->flags);
		remove_and_add_spares(rdev->mddev, rdev);
		if (rdev->raid_disk >= 0)
			return -EBUSY;
3048 3049
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
3050 3051
	} else if (rdev->mddev->pers) {
		/* Activating a spare .. or possibly reactivating
3052
		 * if we ever get bitmaps working here.
3053
		 */
3054
		int err;
3055 3056 3057 3058

		if (rdev->raid_disk != -1)
			return -EBUSY;

3059 3060 3061
		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
			return -EBUSY;

3062 3063 3064
		if (rdev->mddev->pers->hot_add_disk == NULL)
			return -EINVAL;

3065 3066 3067 3068
		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
			return -ENOSPC;

		rdev->raid_disk = slot;
		if (test_bit(In_sync, &rdev->flags))
			rdev->saved_raid_disk = slot;
		else
			rdev->saved_raid_disk = -1;
3074
		clear_bit(In_sync, &rdev->flags);
3075
		clear_bit(Bitmap_sync, &rdev->flags);
		err = rdev->mddev->pers->
			hot_add_disk(rdev->mddev, rdev);
		if (err) {
			rdev->raid_disk = -1;
			return err;
		} else
			sysfs_notify_dirent_safe(rdev->sysfs_state);
		if (sysfs_link_rdev(rdev->mddev, rdev))
			/* failure here is OK */;
3085
		/* don't wakeup anyone, leave that to userspace. */
3086
	} else {
3087 3088
		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3089 3090 3091
			return -ENOSPC;
		rdev->raid_disk = slot;
		/* assume it is working */
3092 3093
		clear_bit(Faulty, &rdev->flags);
		clear_bit(WriteMostly, &rdev->flags);
3094
		set_bit(In_sync, &rdev->flags);
N
3096
	}
3097 3098 3099 3100
	return len;
}

static struct rdev_sysfs_entry rdev_slot =
3101
__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3102

3103
static ssize_t
3104
offset_show(struct md_rdev *rdev, char *page)
3105
{
3106
	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3107 3108 3109
}

static ssize_t
3110
offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3111
{
3112
	unsigned long long offset;
3113
	if (kstrtoull(buf, 10, &offset) < 0)
3114
		return -EINVAL;
3115
	if (rdev->mddev->pers && rdev->raid_disk >= 0)
3116
		return -EBUSY;
3117
	if (rdev->sectors && rdev->mddev->external)
3118 3119 3120
		/* Must set offset before size, so overlap checks
		 * can be sane */
		return -EBUSY;
3121
	rdev->data_offset = offset;
3122
	rdev->new_data_offset = offset;
3123 3124 3125 3126
	return len;
}

static struct rdev_sysfs_entry rdev_offset =
3127
__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3128

3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140
static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)rdev->new_data_offset);
}

static ssize_t new_offset_store(struct md_rdev *rdev,
				const char *buf, size_t len)
{
	unsigned long long new_offset;
	struct mddev *mddev = rdev->mddev;

3141
	if (kstrtoull(buf, 10, &new_offset) < 0)
3142 3143
		return -EINVAL;

3144 3145
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
		return -EBUSY;
	if (new_offset == rdev->data_offset)
		/* reset is always permitted */
		;
	else if (new_offset > rdev->data_offset) {
		/* must not push array size beyond rdev_sectors */
		if (new_offset - rdev->data_offset
		    + mddev->dev_sectors > rdev->sectors)
				return -E2BIG;
	}
	/* Metadata worries about other space details. */

	/* decreasing the offset is inconsistent with a backwards
	 * reshape.
	 */
	if (new_offset < rdev->data_offset &&
	    mddev->reshape_backwards)
		return -EINVAL;
	/* Increasing offset is inconsistent with forwards
	 * reshape.  reshape_direction should be set to
	 * 'backwards' first.
	 */
	if (new_offset > rdev->data_offset &&
	    !mddev->reshape_backwards)
		return -EINVAL;

	if (mddev->pers && mddev->persistent &&
	    !super_types[mddev->major_version]
	    .allow_new_offset(rdev, new_offset))
		return -E2BIG;
	rdev->new_data_offset = new_offset;
	if (new_offset > rdev->data_offset)
		mddev->reshape_backwards = 1;
	else if (new_offset < rdev->data_offset)
		mddev->reshape_backwards = 0;

	return len;
}
static struct rdev_sysfs_entry rdev_new_offset =
__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);

3187
static ssize_t
3188
rdev_size_show(struct md_rdev *rdev, char *page)
3189
{
3190
	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3191 3192
}

static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
{
	/* check if two start/length pairs overlap */
	if (s1+l1 <= s2)
		return 0;
	if (s2+l2 <= s1)
		return 0;
	return 1;
}

static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
{
	unsigned long long blocks;
	sector_t new;

3208
	if (kstrtoull(buf, 10, &blocks) < 0)
		return -EINVAL;

	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
		return -EINVAL; /* sector conversion overflow */

	new = blocks * 2;
	if (new != blocks * 2)
		return -EINVAL; /* unsigned long long to sector_t overflow */

	*sectors = new;
	return 0;
}

3222
static ssize_t
3223
rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3224
{
3225
	struct mddev *my_mddev = rdev->mddev;
3226
	sector_t oldsectors = rdev->sectors;
	sector_t sectors;
3228

	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
	if (strict_blocks_to_sectors(buf, &sectors) < 0)
		return -EINVAL;
3233 3234
	if (rdev->data_offset != rdev->new_data_offset)
		return -EINVAL; /* too confusing */
3235
	if (my_mddev->pers && rdev->raid_disk >= 0) {
		if (my_mddev->persistent) {
3237 3238 3239
			sectors = super_types[my_mddev->major_version].
				rdev_size_change(rdev, sectors);
			if (!sectors)
3240
				return -EBUSY;
3241
		} else if (!sectors)
3242
			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3243
				rdev->data_offset;
3244 3245 3246
		if (!my_mddev->pers->resize)
			/* Cannot change size for RAID0 or Linear etc */
			return -EINVAL;
3247
	}
3248
	if (sectors < my_mddev->dev_sectors)
3249
		return -EINVAL; /* component must fit device */
3250

3251 3252
	rdev->sectors = sectors;
	if (sectors > oldsectors && my_mddev->external) {
		/* Need to check that all other rdevs with the same
		 * ->bdev do not overlap.  'rcu' is sufficient to walk
		 * the rdev lists safely.
		 * This check does not provide a hard guarantee, it
		 * just helps avoid dangerous mistakes.
3258
		 */
3259
		struct mddev *mddev;
3260
		int overlap = 0;
3261
		struct list_head *tmp;
3262

3263
		rcu_read_lock();
3264
		for_each_mddev(mddev, tmp) {
3265
			struct md_rdev *rdev2;
3266

N
NeilBrown 已提交
3267
			rdev_for_each(rdev2, mddev)
3268 3269 3270 3271 3272
				if (rdev->bdev == rdev2->bdev &&
				    rdev != rdev2 &&
				    overlaps(rdev->data_offset, rdev->sectors,
					     rdev2->data_offset,
					     rdev2->sectors)) {
3273 3274 3275 3276 3277 3278 3279 3280
					overlap = 1;
					break;
				}
			if (overlap) {
				mddev_put(mddev);
				break;
			}
		}
3281
		rcu_read_unlock();
3282 3283 3284
		if (overlap) {
			/* Someone else could have slipped in a size
			 * change here, but doing so is just silly.
3285
			 * We put oldsectors back because we *know* it is
3286 3287 3288
			 * safe, and trust userspace not to race with
			 * itself
			 */
3289
			rdev->sectors = oldsectors;
3290 3291 3292
			return -EBUSY;
		}
	}
3293 3294 3295 3296
	return len;
}

static struct rdev_sysfs_entry rdev_size =
3297
__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3298

3299
static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3300 3301 3302 3303 3304 3305 3306 3307 3308 3309
{
	unsigned long long recovery_start = rdev->recovery_offset;

	if (test_bit(In_sync, &rdev->flags) ||
	    recovery_start == MaxSector)
		return sprintf(page, "none\n");

	return sprintf(page, "%llu\n", recovery_start);
}

3310
static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3311 3312 3313 3314 3315
{
	unsigned long long recovery_start;

	if (cmd_match(buf, "none"))
		recovery_start = MaxSector;
3316
	else if (kstrtoull(buf, 10, &recovery_start))
3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333
		return -EINVAL;

	if (rdev->mddev->pers &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	rdev->recovery_offset = recovery_start;
	if (recovery_start == MaxSector)
		set_bit(In_sync, &rdev->flags);
	else
		clear_bit(In_sync, &rdev->flags);
	return len;
}

static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);

3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344
/* sysfs access to bad-blocks list.
 * We present two files.
 * 'bad-blocks' lists sector numbers and lengths of ranges that
 *    are recorded as bad.  The list is truncated to fit within
 *    the one-page limit of sysfs.
 *    Writing "sector length" to this file adds an acknowledged
 *    bad block list.
 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
 *    been acknowledged.  Writing to this file adds bad blocks
 *    without acknowledging them.  This is largely for testing.
 */
3345
static ssize_t bb_show(struct md_rdev *rdev, char *page)
3346 3347 3348
{
	return badblocks_show(&rdev->badblocks, page, 0);
}
3349
static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3350
{
3351 3352 3353 3354 3355
	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
	/* Maybe that ack was all we needed */
	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
		wake_up(&rdev->blocked_wait);
	return rv;
3356 3357 3358 3359
}
static struct rdev_sysfs_entry rdev_bad_blocks =
__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);

3360
static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3361 3362 3363
{
	return badblocks_show(&rdev->badblocks, page, 1);
}
3364
static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3365 3366 3367 3368 3369 3370
{
	return badblocks_store(&rdev->badblocks, page, len, 1);
}
static struct rdev_sysfs_entry rdev_unack_bad_blocks =
__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);

3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442
static ssize_t
ppl_sector_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
}

static ssize_t
ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned long long sector;

	if (kstrtoull(buf, 10, &sector) < 0)
		return -EINVAL;
	if (sector != (sector_t)sector)
		return -EINVAL;

	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	if (rdev->mddev->persistent) {
		if (rdev->mddev->major_version == 0)
			return -EINVAL;
		if ((sector > rdev->sb_start &&
		     sector - rdev->sb_start > S16_MAX) ||
		    (sector < rdev->sb_start &&
		     rdev->sb_start - sector > -S16_MIN))
			return -EINVAL;
		rdev->ppl.offset = sector - rdev->sb_start;
	} else if (!rdev->mddev->external) {
		return -EBUSY;
	}
	rdev->ppl.sector = sector;
	return len;
}

static struct rdev_sysfs_entry rdev_ppl_sector =
__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);

static ssize_t
ppl_size_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%u\n", rdev->ppl.size);
}

static ssize_t
ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned int size;

	if (kstrtouint(buf, 10, &size) < 0)
		return -EINVAL;

	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	if (rdev->mddev->persistent) {
		if (rdev->mddev->major_version == 0)
			return -EINVAL;
		if (size > U16_MAX)
			return -EINVAL;
	} else if (!rdev->mddev->external) {
		return -EBUSY;
	}
	rdev->ppl.size = size;
	return len;
}

static struct rdev_sysfs_entry rdev_ppl_size =
__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);

3443 3444
static struct attribute *rdev_default_attrs[] = {
	&rdev_state.attr,
3445
	&rdev_errors.attr,
3446
	&rdev_slot.attr,
3447
	&rdev_offset.attr,
3448
	&rdev_new_offset.attr,
3449
	&rdev_size.attr,
3450
	&rdev_recovery_start.attr,
3451 3452
	&rdev_bad_blocks.attr,
	&rdev_unack_bad_blocks.attr,
3453 3454
	&rdev_ppl_sector.attr,
	&rdev_ppl_size.attr,
3455 3456 3457 3458 3459 3460
	NULL,
};
static ssize_t
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3461
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3462 3463 3464

	if (!entry->show)
		return -EIO;
3465
	if (!rdev->mddev)
3466
		return -ENODEV;
3467
	return entry->show(rdev, page);
3468 3469 3470 3471 3472 3473 3474
}

static ssize_t
rdev_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3475
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3476
	ssize_t rv;
3477
	struct mddev *mddev = rdev->mddev;
3478 3479 3480

	if (!entry->store)
		return -EIO;
3481 3482
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
3483
	rv = mddev ? mddev_lock(mddev) : -ENODEV;
3484
	if (!rv) {
3485
		if (rdev->mddev == NULL)
3486
			rv = -ENODEV;
3487 3488
		else
			rv = entry->store(rdev, page, length);
3489
		mddev_unlock(mddev);
3490 3491
	}
	return rv;
3492 3493 3494 3495
}

static void rdev_free(struct kobject *ko)
{
3496
	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3497 3498
	kfree(rdev);
}
3499
static const struct sysfs_ops rdev_sysfs_ops = {
3500 3501 3502 3503 3504 3505 3506 3507 3508
	.show		= rdev_attr_show,
	.store		= rdev_attr_store,
};
static struct kobj_type rdev_ktype = {
	.release	= rdev_free,
	.sysfs_ops	= &rdev_sysfs_ops,
	.default_attrs	= rdev_default_attrs,
};

3509
int md_rdev_init(struct md_rdev *rdev)
N
NeilBrown 已提交
3510 3511 3512 3513 3514 3515
{
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	rdev->raid_disk = -1;
	rdev->flags = 0;
	rdev->data_offset = 0;
3516
	rdev->new_data_offset = 0;
N
NeilBrown 已提交
3517
	rdev->sb_events = 0;
3518
	rdev->last_read_error = 0;
3519 3520
	rdev->sb_loaded = 0;
	rdev->bb_page = NULL;
N
NeilBrown 已提交
3521 3522 3523 3524 3525 3526
	atomic_set(&rdev->nr_pending, 0);
	atomic_set(&rdev->read_errors, 0);
	atomic_set(&rdev->corrected_errors, 0);

	INIT_LIST_HEAD(&rdev->same_set);
	init_waitqueue_head(&rdev->blocked_wait);
3527 3528 3529 3530 3531

	/* Add space to store bad block list.
	 * This reserves the space even on arrays where it cannot
	 * be used - I wonder if that matters
	 */
3532
	return badblocks_init(&rdev->badblocks, 0);
N
NeilBrown 已提交
3533 3534
}
EXPORT_SYMBOL_GPL(md_rdev_init);
L
Linus Torvalds 已提交
3535 3536 3537 3538 3539 3540 3541 3542 3543 3544
/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
3545
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
L
Linus Torvalds 已提交
3546 3547 3548
{
	char b[BDEVNAME_SIZE];
	int err;
3549
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
3550 3551
	sector_t size;

3552
	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3553
	if (!rdev)
L
Linus Torvalds 已提交
3554 3555
		return ERR_PTR(-ENOMEM);

3556 3557 3558 3559 3560
	err = md_rdev_init(rdev);
	if (err)
		goto abort_free;
	err = alloc_disk_sb(rdev);
	if (err)
L
Linus Torvalds 已提交
3561 3562
		goto abort_free;

3563
	err = lock_rdev(rdev, newdev, super_format == -2);
L
Linus Torvalds 已提交
3564 3565 3566
	if (err)
		goto abort_free;

3567
	kobject_init(&rdev->kobj, &rdev_ktype);
3568

3569
	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
L
Linus Torvalds 已提交
3570
	if (!size) {
3571
		pr_warn("md: %s has zero or unknown size, marking faulty!\n",
L
Linus Torvalds 已提交
3572 3573 3574 3575 3576 3577 3578 3579 3580
			bdevname(rdev->bdev,b));
		err = -EINVAL;
		goto abort_free;
	}

	if (super_format >= 0) {
		err = super_types[super_format].
			load_super(rdev, NULL, super_minor);
		if (err == -EINVAL) {
3581
			pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
3582
				bdevname(rdev->bdev,b),
3583
				super_format, super_minor);
L
Linus Torvalds 已提交
3584 3585 3586
			goto abort_free;
		}
		if (err < 0) {
3587
			pr_warn("md: could not read %s's sb, not importing!\n",
L
Linus Torvalds 已提交
3588 3589 3590 3591
				bdevname(rdev->bdev,b));
			goto abort_free;
		}
	}
3592

L
Linus Torvalds 已提交
3593 3594 3595
	return rdev;

abort_free:
3596 3597
	if (rdev->bdev)
		unlock_rdev(rdev);
3598
	md_rdev_clear(rdev);
L
Linus Torvalds 已提交
3599 3600 3601 3602 3603 3604 3605 3606
	kfree(rdev);
	return ERR_PTR(err);
}

/*
 * Check a full RAID array for plausibility
 */

3607
static void analyze_sbs(struct mddev *mddev)
L
Linus Torvalds 已提交
3608 3609
{
	int i;
3610
	struct md_rdev *rdev, *freshest, *tmp;
L
Linus Torvalds 已提交
3611 3612 3613
	char b[BDEVNAME_SIZE];

	freshest = NULL;
N
NeilBrown 已提交
3614
	rdev_for_each_safe(rdev, tmp, mddev)
L
Linus Torvalds 已提交
3615 3616 3617 3618 3619 3620 3621 3622
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
3623
			pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
L
Linus Torvalds 已提交
3624
				bdevname(rdev->bdev,b));
3625
			md_kick_rdev_from_array(rdev);
L
Linus Torvalds 已提交
3626 3627 3628 3629 3630 3631
		}

	super_types[mddev->major_version].
		validate_super(mddev, freshest);

	i = 0;
N
NeilBrown 已提交
3632
	rdev_for_each_safe(rdev, tmp, mddev) {
3633 3634 3635
		if (mddev->max_disks &&
		    (rdev->desc_nr >= mddev->max_disks ||
		     i > mddev->max_disks)) {
3636 3637 3638
			pr_warn("md: %s: %s: only %d devices permitted\n",
				mdname(mddev), bdevname(rdev->bdev, b),
				mddev->max_disks);
3639
			md_kick_rdev_from_array(rdev);
3640 3641
			continue;
		}
3642
		if (rdev != freshest) {
L
Linus Torvalds 已提交
3643 3644
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
3645
				pr_warn("md: kicking non-fresh %s from array!\n",
L
Linus Torvalds 已提交
3646
					bdevname(rdev->bdev,b));
3647
				md_kick_rdev_from_array(rdev);
L
Linus Torvalds 已提交
3648 3649
				continue;
			}
3650
		}
L
Linus Torvalds 已提交
3651 3652 3653
		if (mddev->level == LEVEL_MULTIPATH) {
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
3654
			set_bit(In_sync, &rdev->flags);
S
Shaohua Li 已提交
3655 3656 3657
		} else if (rdev->raid_disk >=
			    (mddev->raid_disks - min(0, mddev->delta_disks)) &&
			   !test_bit(Journal, &rdev->flags)) {
3658 3659
			rdev->raid_disk = -1;
			clear_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
3660 3661 3662 3663
		}
	}
}

3664 3665 3666
/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
3667
 * However we internally use a a much smaller unit such as
3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale'.
 * all without any floating-point arithmetic.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;
	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {
			unsigned int value;
			value = *cp - '0';
			result = result * 10 + value;
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;
	if (decimals < 0)
		decimals = 0;
A
Andy Shevchenko 已提交
3696
	*res = result * int_pow(10, scale - decimals);
3697 3698 3699
	return 0;
}

3700
static ssize_t
3701
safe_delay_show(struct mddev *mddev, char *page)
3702 3703 3704 3705 3706
{
	int msec = (mddev->safemode_delay*1000)/HZ;
	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
}
static ssize_t
3707
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3708 3709
{
	unsigned long msec;
3710

3711
	if (mddev_is_clustered(mddev)) {
3712
		pr_warn("md: Safemode is disabled for clustered mode\n");
3713 3714 3715
		return -EINVAL;
	}

3716
	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3717 3718 3719 3720
		return -EINVAL;
	if (msec == 0)
		mddev->safemode_delay = 0;
	else {
3721
		unsigned long old_delay = mddev->safemode_delay;
3722 3723 3724 3725 3726 3727 3728
		unsigned long new_delay = (msec*HZ)/1000;

		if (new_delay == 0)
			new_delay = 1;
		mddev->safemode_delay = new_delay;
		if (new_delay < old_delay || old_delay == 0)
			mod_timer(&mddev->safemode_timer, jiffies+1);
3729 3730 3731 3732
	}
	return len;
}
static struct md_sysfs_entry md_safe_delay =
3733
__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3734

3735
static ssize_t
3736
level_show(struct mddev *mddev, char *page)
3737
{
3738 3739 3740 3741
	struct md_personality *p;
	int ret;
	spin_lock(&mddev->lock);
	p = mddev->pers;
3742
	if (p)
3743
		ret = sprintf(page, "%s\n", p->name);
3744
	else if (mddev->clevel[0])
3745
		ret = sprintf(page, "%s\n", mddev->clevel);
3746
	else if (mddev->level != LEVEL_NONE)
3747
		ret = sprintf(page, "%d\n", mddev->level);
3748
	else
3749 3750 3751
		ret = 0;
	spin_unlock(&mddev->lock);
	return ret;
3752 3753
}

3754
static ssize_t
3755
level_store(struct mddev *mddev, const char *buf, size_t len)
3756
{
3757
	char clevel[16];
3758 3759
	ssize_t rv;
	size_t slen = len;
3760
	struct md_personality *pers, *oldpers;
3761
	long level;
3762
	void *priv, *oldpriv;
3763
	struct md_rdev *rdev;
3764

3765 3766 3767 3768 3769 3770 3771
	if (slen == 0 || slen >= sizeof(clevel))
		return -EINVAL;

	rv = mddev_lock(mddev);
	if (rv)
		return rv;

3772
	if (mddev->pers == NULL) {
3773 3774 3775 3776
		strncpy(mddev->clevel, buf, slen);
		if (mddev->clevel[slen-1] == '\n')
			slen--;
		mddev->clevel[slen] = 0;
3777
		mddev->level = LEVEL_NONE;
3778 3779
		rv = len;
		goto out_unlock;
3780
	}
3781
	rv = -EROFS;
3782
	if (mddev->ro)
3783
		goto out_unlock;
3784 3785 3786 3787 3788 3789 3790

	/* request to change the personality.  Need to ensure:
	 *  - array is not engaged in resync/recovery/reshape
	 *  - old personality can be suspended
	 *  - new personality will access other array.
	 */

3791
	rv = -EBUSY;
3792
	if (mddev->sync_thread ||
3793
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3794 3795
	    mddev->reshape_position != MaxSector ||
	    mddev->sysfs_active)
3796
		goto out_unlock;
3797

3798
	rv = -EINVAL;
3799
	if (!mddev->pers->quiesce) {
3800 3801
		pr_warn("md: %s: %s does not support online personality change\n",
			mdname(mddev), mddev->pers->name);
3802
		goto out_unlock;
3803 3804 3805
	}

	/* Now find the new personality */
3806 3807 3808 3809
	strncpy(clevel, buf, slen);
	if (clevel[slen-1] == '\n')
		slen--;
	clevel[slen] = 0;
3810
	if (kstrtol(clevel, 10, &level))
3811
		level = LEVEL_NONE;
3812

3813 3814
	if (request_module("md-%s", clevel) != 0)
		request_module("md-level-%s", clevel);
3815
	spin_lock(&pers_lock);
3816
	pers = find_pers(level, clevel);
3817 3818
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
3819
		pr_warn("md: personality %s not loaded\n", clevel);
3820 3821
		rv = -EINVAL;
		goto out_unlock;
3822 3823 3824 3825 3826 3827
	}
	spin_unlock(&pers_lock);

	if (pers == mddev->pers) {
		/* Nothing to do! */
		module_put(pers->owner);
3828 3829
		rv = len;
		goto out_unlock;
3830 3831 3832
	}
	if (!pers->takeover) {
		module_put(pers->owner);
3833 3834
		pr_warn("md: %s: %s does not support personality takeover\n",
			mdname(mddev), clevel);
3835 3836
		rv = -EINVAL;
		goto out_unlock;
3837 3838
	}

N
NeilBrown 已提交
3839
	rdev_for_each(rdev, mddev)
3840 3841
		rdev->new_raid_disk = rdev->raid_disk;

3842 3843 3844 3845 3846 3847 3848
	/* ->takeover must set new_* and/or delta_disks
	 * if it succeeds, and may set them when it fails.
	 */
	priv = pers->takeover(mddev);
	if (IS_ERR(priv)) {
		mddev->new_level = mddev->level;
		mddev->new_layout = mddev->layout;
3849
		mddev->new_chunk_sectors = mddev->chunk_sectors;
3850 3851
		mddev->raid_disks -= mddev->delta_disks;
		mddev->delta_disks = 0;
3852
		mddev->reshape_backwards = 0;
3853
		module_put(pers->owner);
3854 3855
		pr_warn("md: %s: %s would not accept array\n",
			mdname(mddev), clevel);
3856 3857
		rv = PTR_ERR(priv);
		goto out_unlock;
3858 3859 3860 3861
	}

	/* Looks like we have a winner */
	mddev_suspend(mddev);
3862
	mddev_detach(mddev);
3863 3864

	spin_lock(&mddev->lock);
3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875
	oldpers = mddev->pers;
	oldpriv = mddev->private;
	mddev->pers = pers;
	mddev->private = priv;
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->degraded = 0;
3876
	spin_unlock(&mddev->lock);
3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890

	if (oldpers->sync_request == NULL &&
	    mddev->external) {
		/* We are converting from a no-redundancy array
		 * to a redundancy array and metadata is managed
		 * externally so we need to be sure that writes
		 * won't block due to a need to transition
		 *      clean->dirty
		 * until external management is started.
		 */
		mddev->in_sync = 0;
		mddev->safemode_delay = 0;
		mddev->safemode = 0;
	}
3891

3892 3893 3894
	oldpers->free(mddev, oldpriv);

	if (oldpers->sync_request == NULL &&
3895 3896 3897
	    pers->sync_request != NULL) {
		/* need to add the md_redundancy_group */
		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3898 3899
			pr_warn("md: cannot register extra attributes for %s\n",
				mdname(mddev));
T
Tejun Heo 已提交
3900
		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3901
	}
3902
	if (oldpers->sync_request != NULL &&
3903 3904 3905 3906 3907 3908
	    pers->sync_request == NULL) {
		/* need to remove the md_redundancy_group */
		if (mddev->to_remove == NULL)
			mddev->to_remove = &md_redundancy_group;
	}

3909 3910
	module_put(oldpers->owner);

N
NeilBrown 已提交
3911
	rdev_for_each(rdev, mddev) {
3912 3913
		if (rdev->raid_disk < 0)
			continue;
3914
		if (rdev->new_raid_disk >= mddev->raid_disks)
3915 3916 3917
			rdev->new_raid_disk = -1;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
3918
		sysfs_unlink_rdev(mddev, rdev);
3919
	}
N
NeilBrown 已提交
3920
	rdev_for_each(rdev, mddev) {
3921 3922 3923 3924 3925 3926
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		rdev->raid_disk = rdev->new_raid_disk;
		if (rdev->raid_disk < 0)
3927
			clear_bit(In_sync, &rdev->flags);
3928
		else {
3929
			if (sysfs_link_rdev(mddev, rdev))
3930 3931
				pr_warn("md: cannot register rd%d for %s after level change\n",
					rdev->raid_disk, mdname(mddev));
3932
		}
3933 3934
	}

3935
	if (pers->sync_request == NULL) {
3936 3937 3938 3939 3940 3941
		/* this is now an array without redundancy, so
		 * it must always be in_sync
		 */
		mddev->in_sync = 1;
		del_timer_sync(&mddev->safemode_timer);
	}
3942
	blk_set_stacking_limits(&mddev->queue->limits);
3943
	pers->run(mddev);
3944
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3945
	mddev_resume(mddev);
3946 3947
	if (!mddev->thread)
		md_update_sb(mddev, 1);
3948
	sysfs_notify(&mddev->kobj, NULL, "level");
3949
	md_new_event(mddev);
3950 3951 3952
	rv = len;
out_unlock:
	mddev_unlock(mddev);
3953 3954 3955 3956
	return rv;
}

static struct md_sysfs_entry md_level =
3957
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3958

3959
static ssize_t
3960
layout_show(struct mddev *mddev, char *page)
3961 3962
{
	/* just a number, not meaningful for all levels */
3963 3964 3965 3966
	if (mddev->reshape_position != MaxSector &&
	    mddev->layout != mddev->new_layout)
		return sprintf(page, "%d (%d)\n",
			       mddev->new_layout, mddev->layout);
3967 3968 3969 3970
	return sprintf(page, "%d\n", mddev->layout);
}

static ssize_t
3971
layout_store(struct mddev *mddev, const char *buf, size_t len)
3972
{
A
Alexey Dobriyan 已提交
3973
	unsigned int n;
3974
	int err;
3975

A
Alexey Dobriyan 已提交
3976 3977 3978
	err = kstrtouint(buf, 10, &n);
	if (err < 0)
		return err;
3979 3980 3981
	err = mddev_lock(mddev);
	if (err)
		return err;
3982

3983
	if (mddev->pers) {
3984
		if (mddev->pers->check_reshape == NULL)
3985 3986 3987 3988 3989 3990 3991 3992
			err = -EBUSY;
		else if (mddev->ro)
			err = -EROFS;
		else {
			mddev->new_layout = n;
			err = mddev->pers->check_reshape(mddev);
			if (err)
				mddev->new_layout = mddev->layout;
3993
		}
3994
	} else {
3995
		mddev->new_layout = n;
3996 3997 3998
		if (mddev->reshape_position == MaxSector)
			mddev->layout = n;
	}
3999 4000
	mddev_unlock(mddev);
	return err ?: len;
4001 4002
}
static struct md_sysfs_entry md_layout =
4003
__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4004

4005
static ssize_t
4006
raid_disks_show(struct mddev *mddev, char *page)
4007
{
4008 4009
	if (mddev->raid_disks == 0)
		return 0;
4010 4011 4012 4013
	if (mddev->reshape_position != MaxSector &&
	    mddev->delta_disks != 0)
		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
			       mddev->raid_disks - mddev->delta_disks);
4014 4015 4016
	return sprintf(page, "%d\n", mddev->raid_disks);
}

4017
static int update_raid_disks(struct mddev *mddev, int raid_disks);
4018 4019

static ssize_t
4020
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4021
{
A
Alexey Dobriyan 已提交
4022
	unsigned int n;
4023
	int err;
4024

A
Alexey Dobriyan 已提交
4025 4026 4027
	err = kstrtouint(buf, 10, &n);
	if (err < 0)
		return err;
4028

4029 4030 4031
	err = mddev_lock(mddev);
	if (err)
		return err;
4032
	if (mddev->pers)
4033
		err = update_raid_disks(mddev, n);
4034
	else if (mddev->reshape_position != MaxSector) {
4035
		struct md_rdev *rdev;
4036
		int olddisks = mddev->raid_disks - mddev->delta_disks;
4037

4038
		err = -EINVAL;
4039 4040 4041
		rdev_for_each(rdev, mddev) {
			if (olddisks < n &&
			    rdev->data_offset < rdev->new_data_offset)
4042
				goto out_unlock;
4043 4044
			if (olddisks > n &&
			    rdev->data_offset > rdev->new_data_offset)
4045
				goto out_unlock;
4046
		}
4047
		err = 0;
4048 4049
		mddev->delta_disks = n - olddisks;
		mddev->raid_disks = n;
4050
		mddev->reshape_backwards = (mddev->delta_disks < 0);
4051
	} else
4052
		mddev->raid_disks = n;
4053 4054 4055
out_unlock:
	mddev_unlock(mddev);
	return err ? err : len;
4056 4057
}
static struct md_sysfs_entry md_raid_disks =
4058
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4059

4060
static ssize_t
4061
chunk_size_show(struct mddev *mddev, char *page)
4062
{
4063
	if (mddev->reshape_position != MaxSector &&
4064 4065 4066
	    mddev->chunk_sectors != mddev->new_chunk_sectors)
		return sprintf(page, "%d (%d)\n",
			       mddev->new_chunk_sectors << 9,
4067 4068
			       mddev->chunk_sectors << 9);
	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4069 4070 4071
}

static ssize_t
4072
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4073
{
A
Alexey Dobriyan 已提交
4074
	unsigned long n;
4075
	int err;
4076

A
Alexey Dobriyan 已提交
4077 4078 4079
	err = kstrtoul(buf, 10, &n);
	if (err < 0)
		return err;
4080

4081 4082 4083
	err = mddev_lock(mddev);
	if (err)
		return err;
4084
	if (mddev->pers) {
4085
		if (mddev->pers->check_reshape == NULL)
4086 4087 4088 4089 4090 4091 4092 4093
			err = -EBUSY;
		else if (mddev->ro)
			err = -EROFS;
		else {
			mddev->new_chunk_sectors = n >> 9;
			err = mddev->pers->check_reshape(mddev);
			if (err)
				mddev->new_chunk_sectors = mddev->chunk_sectors;
4094
		}
4095
	} else {
4096
		mddev->new_chunk_sectors = n >> 9;
4097
		if (mddev->reshape_position == MaxSector)
4098
			mddev->chunk_sectors = n >> 9;
4099
	}
4100 4101
	mddev_unlock(mddev);
	return err ?: len;
4102 4103
}
static struct md_sysfs_entry md_chunk_size =
4104
__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4105

4106
static ssize_t
4107
resync_start_show(struct mddev *mddev, char *page)
4108
{
4109 4110
	if (mddev->recovery_cp == MaxSector)
		return sprintf(page, "none\n");
4111 4112 4113 4114
	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
}

static ssize_t
4115
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4116
{
A
Alexey Dobriyan 已提交
4117
	unsigned long long n;
4118
	int err;
A
Alexey Dobriyan 已提交
4119 4120 4121 4122 4123 4124 4125 4126 4127 4128

	if (cmd_match(buf, "none"))
		n = MaxSector;
	else {
		err = kstrtoull(buf, 10, &n);
		if (err < 0)
			return err;
		if (n != (sector_t)n)
			return -EINVAL;
	}
4129

4130 4131 4132
	err = mddev_lock(mddev);
	if (err)
		return err;
4133
	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4134
		err = -EBUSY;
4135

4136 4137 4138
	if (!err) {
		mddev->recovery_cp = n;
		if (mddev->pers)
4139
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4140 4141 4142
	}
	mddev_unlock(mddev);
	return err ?: len;
4143 4144
}
static struct md_sysfs_entry md_resync_start =
4145 4146
__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
		resync_start_show, resync_start_store);
4147

4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159
/*
 * The array state can be:
 *
 * clear
 *     No devices, no size, no level
 *     Equivalent to STOP_ARRAY ioctl
 * inactive
 *     May have some settings, but array is not active
 *        all IO results in error
 *     When written, doesn't tear down array, but just stops it
 * suspended (not supported yet)
 *     All IO requests will block. The array can be reconfigured.
4160
 *     Writing this, if accepted, will block until array is quiescent
4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182
 * readonly
 *     no resync can happen.  no superblocks get written.
 *     write requests fail
 * read-auto
 *     like readonly, but behaves like 'clean' on a write request.
 *
 * clean - no pending writes, but otherwise active.
 *     When written to inactive array, starts without resync
 *     If a write request arrives then
 *       if metadata is known, mark 'dirty' and switch to 'active'.
 *       if not known, block and switch to write-pending
 *     If written to an active array that has pending writes, then fails.
 * active
 *     fully active: IO and resync can be happening.
 *     When written to inactive array, starts with resync
 *
 * write-pending
 *     clean, but writes are blocked waiting for 'active' to be written.
 *
 * active-idle
 *     like active, but no writes have been seen for a while (100msec).
 *
4183 4184 4185 4186 4187
 * broken
 *     RAID0/LINEAR-only: same as clean, but array is missing a member.
 *     It's useful because RAID0/LINEAR mounted-arrays aren't stopped
 *     when a member is gone, so this state will at least alert the
 *     user that something is wrong.
4188 4189
 */
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4190
		   write_pending, active_idle, broken, bad_word};
4191
static char *array_states[] = {
4192
	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4193
	"write-pending", "active-idle", "broken", NULL };
4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204

static int match_word(const char *word, char **list)
{
	int n;
	for (n=0; list[n]; n++)
		if (cmd_match(word, list[n]))
			break;
	return n;
}

static ssize_t
4205
array_state_show(struct mddev *mddev, char *page)
4206 4207 4208
{
	enum array_state st = inactive;

4209
	if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4210 4211 4212 4213 4214 4215 4216 4217
		switch(mddev->ro) {
		case 1:
			st = readonly;
			break;
		case 2:
			st = read_auto;
			break;
		case 0:
4218
			spin_lock(&mddev->lock);
4219
			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4220
				st = write_pending;
4221 4222
			else if (mddev->in_sync)
				st = clean;
4223 4224 4225 4226
			else if (mddev->safemode)
				st = active_idle;
			else
				st = active;
4227
			spin_unlock(&mddev->lock);
4228
		}
4229 4230 4231 4232

		if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
			st = broken;
	} else {
4233 4234
		if (list_empty(&mddev->disks) &&
		    mddev->raid_disks == 0 &&
A
Andre Noll 已提交
4235
		    mddev->dev_sectors == 0)
4236 4237 4238 4239 4240 4241 4242
			st = clear;
		else
			st = inactive;
	}
	return sprintf(page, "%s\n", array_states[st]);
}

4243 4244 4245
static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
static int do_md_run(struct mddev *mddev);
4246
static int restart_array(struct mddev *mddev);
4247 4248

static ssize_t
4249
array_state_store(struct mddev *mddev, const char *buf, size_t len)
4250
{
N
NeilBrown 已提交
4251
	int err = 0;
4252
	enum array_state st = match_word(buf, array_states);
4253 4254 4255 4256 4257 4258 4259 4260

	if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
		/* don't take reconfig_mutex when toggling between
		 * clean and active
		 */
		spin_lock(&mddev->lock);
		if (st == active) {
			restart_array(mddev);
4261
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4262
			md_wakeup_thread(mddev->thread);
4263 4264 4265
			wake_up(&mddev->sb_wait);
		} else /* st == clean */ {
			restart_array(mddev);
N
NeilBrown 已提交
4266
			if (!set_in_sync(mddev))
4267 4268
				err = -EBUSY;
		}
4269 4270
		if (!err)
			sysfs_notify_dirent_safe(mddev->sysfs_state);
4271
		spin_unlock(&mddev->lock);
4272
		return err ?: len;
4273 4274 4275 4276 4277
	}
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
4278 4279 4280 4281 4282
	switch(st) {
	case bad_word:
		break;
	case clear:
		/* stopping an active array */
4283
		err = do_md_stop(mddev, 0, NULL);
4284 4285 4286
		break;
	case inactive:
		/* stopping an active array */
4287
		if (mddev->pers)
4288
			err = do_md_stop(mddev, 2, NULL);
4289
		else
4290
			err = 0; /* already inactive */
4291 4292 4293 4294 4295
		break;
	case suspended:
		break; /* not supported yet */
	case readonly:
		if (mddev->pers)
4296
			err = md_set_readonly(mddev, NULL);
4297 4298
		else {
			mddev->ro = 1;
4299
			set_disk_ro(mddev->gendisk, 1);
4300 4301 4302 4303 4304
			err = do_md_run(mddev);
		}
		break;
	case read_auto:
		if (mddev->pers) {
4305
			if (mddev->ro == 0)
4306
				err = md_set_readonly(mddev, NULL);
4307
			else if (mddev->ro == 1)
4308 4309 4310 4311 4312
				err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
			}
4313 4314 4315 4316 4317 4318 4319
		} else {
			mddev->ro = 2;
			err = do_md_run(mddev);
		}
		break;
	case clean:
		if (mddev->pers) {
4320 4321 4322
			err = restart_array(mddev);
			if (err)
				break;
4323
			spin_lock(&mddev->lock);
N
NeilBrown 已提交
4324
			if (!set_in_sync(mddev))
4325
				err = -EBUSY;
4326
			spin_unlock(&mddev->lock);
4327 4328
		} else
			err = -EINVAL;
4329 4330 4331
		break;
	case active:
		if (mddev->pers) {
4332 4333 4334
			err = restart_array(mddev);
			if (err)
				break;
4335
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4336 4337 4338 4339
			wake_up(&mddev->sb_wait);
			err = 0;
		} else {
			mddev->ro = 0;
4340
			set_disk_ro(mddev->gendisk, 0);
4341 4342 4343 4344 4345
			err = do_md_run(mddev);
		}
		break;
	case write_pending:
	case active_idle:
4346
	case broken:
4347 4348 4349
		/* these cannot be set */
		break;
	}
4350 4351

	if (!err) {
4352 4353
		if (mddev->hold_active == UNTIL_IOCTL)
			mddev->hold_active = 0;
N
NeilBrown 已提交
4354
		sysfs_notify_dirent_safe(mddev->sysfs_state);
4355
	}
4356 4357
	mddev_unlock(mddev);
	return err ?: len;
4358
}
4359
static struct md_sysfs_entry md_array_state =
4360
__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4361

4362
static ssize_t
4363
max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4364 4365 4366 4367 4368
	return sprintf(page, "%d\n",
		       atomic_read(&mddev->max_corr_read_errors));
}

static ssize_t
4369
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4370
{
A
Alexey Dobriyan 已提交
4371 4372
	unsigned int n;
	int rv;
4373

A
Alexey Dobriyan 已提交
4374 4375 4376 4377 4378
	rv = kstrtouint(buf, 10, &n);
	if (rv < 0)
		return rv;
	atomic_set(&mddev->max_corr_read_errors, n);
	return len;
4379 4380 4381 4382 4383 4384
}

static struct md_sysfs_entry max_corr_read_errors =
__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
	max_corrected_read_errors_store);

4385
static ssize_t
4386
null_show(struct mddev *mddev, char *page)
4387 4388 4389 4390 4391
{
	return -EINVAL;
}

static ssize_t
4392
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404
{
	/* buf must be %d:%d\n? giving major and minor numbers */
	/* The new device is added to the array.
	 * If the array has a persistent superblock, we read the
	 * superblock to initialise info and check validity.
	 * Otherwise, only checking done is that in bind_rdev_to_array,
	 * which mainly checks size.
	 */
	char *e;
	int major = simple_strtoul(buf, &e, 10);
	int minor;
	dev_t dev;
4405
	struct md_rdev *rdev;
4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417
	int err;

	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
		return -EINVAL;
	minor = simple_strtoul(e+1, &e, 10);
	if (*e && *e != '\n')
		return -EINVAL;
	dev = MKDEV(major, minor);
	if (major != MAJOR(dev) ||
	    minor != MINOR(dev))
		return -EOVERFLOW;

4418 4419 4420 4421 4422
	flush_workqueue(md_misc_wq);

	err = mddev_lock(mddev);
	if (err)
		return err;
4423 4424 4425 4426
	if (mddev->persistent) {
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4427 4428 4429
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
4430 4431 4432 4433 4434
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0)
				goto out;
		}
4435 4436 4437
	} else if (mddev->external)
		rdev = md_import_device(dev, -2, -1);
	else
4438 4439
		rdev = md_import_device(dev, -1, -1);

4440 4441
	if (IS_ERR(rdev)) {
		mddev_unlock(mddev);
4442
		return PTR_ERR(rdev);
4443
	}
4444 4445 4446 4447
	err = bind_rdev_to_array(rdev, mddev);
 out:
	if (err)
		export_rdev(rdev);
4448
	mddev_unlock(mddev);
4449 4450
	if (!err)
		md_new_event(mddev);
4451 4452 4453 4454
	return err ? err : len;
}

static struct md_sysfs_entry md_new_device =
4455
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4456

4457
static ssize_t
4458
bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4459 4460 4461
{
	char *end;
	unsigned long chunk, end_chunk;
4462
	int err;
4463

4464 4465 4466
	err = mddev_lock(mddev);
	if (err)
		return err;
4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478
	if (!mddev->bitmap)
		goto out;
	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
	while (*buf) {
		chunk = end_chunk = simple_strtoul(buf, &end, 0);
		if (buf == end) break;
		if (*end == '-') { /* range */
			buf = end + 1;
			end_chunk = simple_strtoul(buf, &end, 0);
			if (buf == end) break;
		}
		if (*end && !isspace(*end)) break;
4479
		md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4480
		buf = skip_spaces(end);
4481
	}
4482
	md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4483
out:
4484
	mddev_unlock(mddev);
4485 4486 4487 4488 4489 4490
	return len;
}

static struct md_sysfs_entry md_bitmap =
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);

4491
static ssize_t
4492
size_show(struct mddev *mddev, char *page)
4493
{
A
Andre Noll 已提交
4494 4495
	return sprintf(page, "%llu\n",
		(unsigned long long)mddev->dev_sectors / 2);
4496 4497
}

4498
static int update_size(struct mddev *mddev, sector_t num_sectors);
4499 4500

static ssize_t
4501
size_store(struct mddev *mddev, const char *buf, size_t len)
4502 4503 4504 4505 4506
{
	/* If array is inactive, we can reduce the component size, but
	 * not increase it (except from 0).
	 * If array is active, we can try an on-line resize
	 */
D
Dan Williams 已提交
4507 4508
	sector_t sectors;
	int err = strict_blocks_to_sectors(buf, &sectors);
4509

A
Andre Noll 已提交
4510 4511
	if (err < 0)
		return err;
4512 4513 4514
	err = mddev_lock(mddev);
	if (err)
		return err;
4515
	if (mddev->pers) {
A
Andre Noll 已提交
4516
		err = update_size(mddev, sectors);
4517 4518
		if (err == 0)
			md_update_sb(mddev, 1);
4519
	} else {
A
Andre Noll 已提交
4520 4521 4522
		if (mddev->dev_sectors == 0 ||
		    mddev->dev_sectors > sectors)
			mddev->dev_sectors = sectors;
4523 4524 4525
		else
			err = -ENOSPC;
	}
4526
	mddev_unlock(mddev);
4527 4528 4529 4530
	return err ? err : len;
}

static struct md_sysfs_entry md_size =
4531
__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4532

M
Masanari Iida 已提交
4533
/* Metadata version.
4534 4535 4536
 * This is one of
 *   'none' for arrays with no metadata (good luck...)
 *   'external' for arrays with externally managed metadata,
4537 4538 4539
 * or N.M for internally known formats
 */
static ssize_t
4540
metadata_show(struct mddev *mddev, char *page)
4541 4542 4543 4544
{
	if (mddev->persistent)
		return sprintf(page, "%d.%d\n",
			       mddev->major_version, mddev->minor_version);
4545 4546
	else if (mddev->external)
		return sprintf(page, "external:%s\n", mddev->metadata_type);
4547 4548 4549 4550 4551
	else
		return sprintf(page, "none\n");
}

static ssize_t
4552
metadata_store(struct mddev *mddev, const char *buf, size_t len)
4553 4554 4555
{
	int major, minor;
	char *e;
4556
	int err;
4557 4558 4559 4560
	/* Changing the details of 'external' metadata is
	 * always permitted.  Otherwise there must be
	 * no devices attached to the array.
	 */
4561 4562 4563 4564 4565

	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
4566 4567 4568
	if (mddev->external && strncmp(buf, "external:", 9) == 0)
		;
	else if (!list_empty(&mddev->disks))
4569
		goto out_unlock;
4570

4571
	err = 0;
4572 4573
	if (cmd_match(buf, "none")) {
		mddev->persistent = 0;
4574 4575 4576
		mddev->external = 0;
		mddev->major_version = 0;
		mddev->minor_version = 90;
4577
		goto out_unlock;
4578 4579
	}
	if (strncmp(buf, "external:", 9) == 0) {
4580
		size_t namelen = len-9;
4581 4582 4583 4584 4585 4586 4587 4588
		if (namelen >= sizeof(mddev->metadata_type))
			namelen = sizeof(mddev->metadata_type)-1;
		strncpy(mddev->metadata_type, buf+9, namelen);
		mddev->metadata_type[namelen] = 0;
		if (namelen && mddev->metadata_type[namelen-1] == '\n')
			mddev->metadata_type[--namelen] = 0;
		mddev->persistent = 0;
		mddev->external = 1;
4589 4590
		mddev->major_version = 0;
		mddev->minor_version = 90;
4591
		goto out_unlock;
4592 4593
	}
	major = simple_strtoul(buf, &e, 10);
4594
	err = -EINVAL;
4595
	if (e==buf || *e != '.')
4596
		goto out_unlock;
4597 4598
	buf = e+1;
	minor = simple_strtoul(buf, &e, 10);
4599
	if (e==buf || (*e && *e != '\n') )
4600 4601
		goto out_unlock;
	err = -ENOENT;
4602
	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4603
		goto out_unlock;
4604 4605 4606
	mddev->major_version = major;
	mddev->minor_version = minor;
	mddev->persistent = 1;
4607
	mddev->external = 0;
4608 4609 4610 4611
	err = 0;
out_unlock:
	mddev_unlock(mddev);
	return err ?: len;
4612 4613 4614
}

static struct md_sysfs_entry md_metadata =
4615
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4616

4617
static ssize_t
4618
action_show(struct mddev *mddev, char *page)
4619
{
4620
	char *type = "idle";
4621 4622
	unsigned long recovery = mddev->recovery;
	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4623
		type = "frozen";
4624 4625 4626
	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4627
			type = "reshape";
4628 4629
		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4630
				type = "resync";
4631
			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4632 4633 4634
				type = "check";
			else
				type = "repair";
4635
		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4636
			type = "recover";
4637 4638
		else if (mddev->reshape_position != MaxSector)
			type = "reshape";
4639 4640 4641 4642 4643
	}
	return sprintf(page, "%s\n", type);
}

static ssize_t
4644
action_store(struct mddev *mddev, const char *page, size_t len)
4645
{
4646 4647 4648
	if (!mddev->pers || !mddev->pers->sync_request)
		return -EINVAL;

4649 4650

	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4651 4652 4653 4654
		if (cmd_match(page, "frozen"))
			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		else
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4655 4656 4657 4658 4659
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    mddev_lock(mddev) == 0) {
			flush_workqueue(md_misc_wq);
			if (mddev->sync_thread) {
				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4660 4661
				md_reap_sync_thread(mddev);
			}
4662
			mddev_unlock(mddev);
4663
		}
4664
	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4665
		return -EBUSY;
4666
	else if (cmd_match(page, "resync"))
4667
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4668
	else if (cmd_match(page, "recover")) {
4669
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4670 4671
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	} else if (cmd_match(page, "reshape")) {
4672 4673 4674
		int err;
		if (mddev->pers->start_reshape == NULL)
			return -EINVAL;
4675 4676
		err = mddev_lock(mddev);
		if (!err) {
4677 4678 4679 4680 4681 4682
			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
				err =  -EBUSY;
			else {
				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
				err = mddev->pers->start_reshape(mddev);
			}
4683 4684
			mddev_unlock(mddev);
		}
4685 4686
		if (err)
			return err;
4687
		sysfs_notify(&mddev->kobj, NULL, "degraded");
4688
	} else {
4689
		if (cmd_match(page, "check"))
4690
			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4691
		else if (!cmd_match(page, "repair"))
4692
			return -EINVAL;
4693
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4694 4695 4696
		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	}
4697 4698 4699 4700 4701 4702 4703
	if (mddev->ro == 2) {
		/* A write to sync_action is enough to justify
		 * canceling read-auto mode
		 */
		mddev->ro = 0;
		md_wakeup_thread(mddev->sync_thread);
	}
4704
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4705
	md_wakeup_thread(mddev->thread);
N
NeilBrown 已提交
4706
	sysfs_notify_dirent_safe(mddev->sysfs_action);
4707 4708 4709
	return len;
}

4710
static struct md_sysfs_entry md_scan_mode =
4711
__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4712 4713 4714 4715 4716 4717 4718 4719 4720

static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n", mddev->last_sync_action);
}

static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);

4721
static ssize_t
4722
mismatch_cnt_show(struct mddev *mddev, char *page)
4723 4724
{
	return sprintf(page, "%llu\n",
4725 4726
		       (unsigned long long)
		       atomic64_read(&mddev->resync_mismatches));
4727 4728
}

4729
static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4730

4731
static ssize_t
4732
sync_min_show(struct mddev *mddev, char *page)
4733 4734 4735 4736 4737 4738
{
	return sprintf(page, "%d (%s)\n", speed_min(mddev),
		       mddev->sync_speed_min ? "local": "system");
}

static ssize_t
4739
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4740
{
A
Alexey Dobriyan 已提交
4741 4742 4743
	unsigned int min;
	int rv;

4744
	if (strncmp(buf, "system", 6)==0) {
A
Alexey Dobriyan 已提交
4745 4746 4747 4748 4749 4750 4751
		min = 0;
	} else {
		rv = kstrtouint(buf, 10, &min);
		if (rv < 0)
			return rv;
		if (min == 0)
			return -EINVAL;
4752 4753 4754 4755 4756 4757 4758 4759 4760
	}
	mddev->sync_speed_min = min;
	return len;
}

static struct md_sysfs_entry md_sync_min =
__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);

static ssize_t
4761
sync_max_show(struct mddev *mddev, char *page)
4762 4763 4764 4765 4766 4767
{
	return sprintf(page, "%d (%s)\n", speed_max(mddev),
		       mddev->sync_speed_max ? "local": "system");
}

static ssize_t
4768
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4769
{
A
Alexey Dobriyan 已提交
4770 4771 4772
	unsigned int max;
	int rv;

4773
	if (strncmp(buf, "system", 6)==0) {
A
Alexey Dobriyan 已提交
4774 4775 4776 4777 4778 4779 4780
		max = 0;
	} else {
		rv = kstrtouint(buf, 10, &max);
		if (rv < 0)
			return rv;
		if (max == 0)
			return -EINVAL;
4781 4782 4783 4784 4785 4786 4787 4788
	}
	mddev->sync_speed_max = max;
	return len;
}

static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);

4789
static ssize_t
4790
degraded_show(struct mddev *mddev, char *page)
4791 4792 4793 4794
{
	return sprintf(page, "%d\n", mddev->degraded);
}
static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4795

4796
static ssize_t
4797
sync_force_parallel_show(struct mddev *mddev, char *page)
4798 4799 4800 4801 4802
{
	return sprintf(page, "%d\n", mddev->parallel_resync);
}

static ssize_t
4803
sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4804 4805 4806
{
	long n;

4807
	if (kstrtol(buf, 10, &n))
4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825
		return -EINVAL;

	if (n != 0 && n != 1)
		return -EINVAL;

	mddev->parallel_resync = n;

	if (mddev->sync_thread)
		wake_up(&resync_wait);

	return len;
}

/* force parallel resync, even with shared block devices */
static struct md_sysfs_entry md_sync_force_parallel =
__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
       sync_force_parallel_show, sync_force_parallel_store);

4826
static ssize_t
4827
sync_speed_show(struct mddev *mddev, char *page)
4828 4829
{
	unsigned long resync, dt, db;
4830 4831
	if (mddev->curr_resync == 0)
		return sprintf(page, "none\n");
4832 4833
	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
	dt = (jiffies - mddev->resync_mark) / HZ;
4834
	if (!dt) dt++;
4835 4836
	db = resync - mddev->resync_mark_cnt;
	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4837 4838
}

4839
static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4840 4841

static ssize_t
4842
sync_completed_show(struct mddev *mddev, char *page)
4843
{
4844
	unsigned long long max_sectors, resync;
4845

4846 4847 4848
	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return sprintf(page, "none\n");

4849 4850 4851 4852
	if (mddev->curr_resync == 1 ||
	    mddev->curr_resync == 2)
		return sprintf(page, "delayed\n");

4853 4854
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
A
Andre Noll 已提交
4855
		max_sectors = mddev->resync_max_sectors;
4856
	else
A
Andre Noll 已提交
4857
		max_sectors = mddev->dev_sectors;
4858

4859
	resync = mddev->curr_resync_completed;
4860
	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4861 4862
}

4863 4864
static struct md_sysfs_entry md_sync_completed =
	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
4865

4866
static ssize_t
4867
min_sync_show(struct mddev *mddev, char *page)
4868 4869 4870 4871 4872
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)mddev->resync_min);
}
static ssize_t
4873
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4874 4875
{
	unsigned long long min;
4876 4877
	int err;

4878
	if (kstrtoull(buf, 10, &min))
4879
		return -EINVAL;
4880 4881 4882

	spin_lock(&mddev->lock);
	err = -EINVAL;
4883
	if (min > mddev->resync_max)
4884 4885 4886
		goto out_unlock;

	err = -EBUSY;
4887
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4888
		goto out_unlock;
4889

4890 4891
	/* Round down to multiple of 4K for safety */
	mddev->resync_min = round_down(min, 8);
4892
	err = 0;
4893

4894 4895 4896
out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
4897 4898 4899 4900 4901
}

static struct md_sysfs_entry md_min_sync =
__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);

4902
static ssize_t
4903
max_sync_show(struct mddev *mddev, char *page)
4904 4905 4906 4907 4908 4909 4910 4911
{
	if (mddev->resync_max == MaxSector)
		return sprintf(page, "max\n");
	else
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->resync_max);
}
static ssize_t
4912
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4913
{
4914 4915
	int err;
	spin_lock(&mddev->lock);
4916 4917 4918
	if (strncmp(buf, "max", 3) == 0)
		mddev->resync_max = MaxSector;
	else {
4919
		unsigned long long max;
4920 4921 4922
		int chunk;

		err = -EINVAL;
4923
		if (kstrtoull(buf, 10, &max))
4924
			goto out_unlock;
4925
		if (max < mddev->resync_min)
4926 4927 4928
			goto out_unlock;

		err = -EBUSY;
4929
		if (max < mddev->resync_max &&
4930
		    mddev->ro == 0 &&
4931
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4932
			goto out_unlock;
4933 4934

		/* Must be a multiple of chunk_size */
4935 4936
		chunk = mddev->chunk_sectors;
		if (chunk) {
4937
			sector_t temp = max;
4938 4939 4940 4941

			err = -EINVAL;
			if (sector_div(temp, chunk))
				goto out_unlock;
4942 4943 4944 4945
		}
		mddev->resync_max = max;
	}
	wake_up(&mddev->recovery_wait);
4946 4947 4948 4949
	err = 0;
out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
4950 4951 4952 4953 4954
}

static struct md_sysfs_entry md_max_sync =
__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);

4955
static ssize_t
4956
suspend_lo_show(struct mddev *mddev, char *page)
4957 4958 4959 4960 4961
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
}

static ssize_t
4962
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4963
{
4964
	unsigned long long new;
4965
	int err;
4966

A
Alexey Dobriyan 已提交
4967 4968 4969 4970
	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
4971
		return -EINVAL;
4972

4973 4974 4975 4976 4977 4978 4979
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
	if (mddev->pers == NULL ||
	    mddev->pers->quiesce == NULL)
		goto unlock;
4980
	mddev_suspend(mddev);
4981
	mddev->suspend_lo = new;
4982 4983
	mddev_resume(mddev);

4984 4985 4986 4987
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
4988 4989 4990 4991 4992
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);

static ssize_t
4993
suspend_hi_show(struct mddev *mddev, char *page)
4994 4995 4996 4997 4998
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
}

static ssize_t
4999
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5000
{
5001
	unsigned long long new;
5002
	int err;
5003

A
Alexey Dobriyan 已提交
5004 5005 5006 5007
	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
5008
		return -EINVAL;
5009

5010 5011 5012 5013
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
5014
	if (mddev->pers == NULL)
5015
		goto unlock;
5016 5017

	mddev_suspend(mddev);
5018
	mddev->suspend_hi = new;
5019 5020
	mddev_resume(mddev);

5021 5022 5023 5024
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
5025 5026 5027 5028
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);

5029
static ssize_t
5030
reshape_position_show(struct mddev *mddev, char *page)
5031 5032 5033 5034 5035 5036 5037 5038 5039
{
	if (mddev->reshape_position != MaxSector)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->reshape_position);
	strcpy(page, "none\n");
	return 5;
}

static ssize_t
5040
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5041
{
5042
	struct md_rdev *rdev;
A
Alexey Dobriyan 已提交
5043
	unsigned long long new;
5044 5045
	int err;

A
Alexey Dobriyan 已提交
5046 5047 5048 5049
	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
5050
		return -EINVAL;
5051 5052 5053 5054 5055 5056
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
	if (mddev->pers)
		goto unlock;
5057 5058
	mddev->reshape_position = new;
	mddev->delta_disks = 0;
5059
	mddev->reshape_backwards = 0;
5060 5061
	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
5062
	mddev->new_chunk_sectors = mddev->chunk_sectors;
5063 5064
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
5065 5066 5067 5068
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
5069 5070 5071 5072 5073 5074
}

static struct md_sysfs_entry md_reshape_position =
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
       reshape_position_store);

5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085
static ssize_t
reshape_direction_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n",
		       mddev->reshape_backwards ? "backwards" : "forwards");
}

static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
	int backwards = 0;
5086 5087
	int err;

5088 5089 5090 5091 5092 5093 5094 5095 5096
	if (cmd_match(buf, "forwards"))
		backwards = 0;
	else if (cmd_match(buf, "backwards"))
		backwards = 1;
	else
		return -EINVAL;
	if (mddev->reshape_backwards == backwards)
		return len;

5097 5098 5099
	err = mddev_lock(mddev);
	if (err)
		return err;
5100 5101
	/* check if we are allowed to change */
	if (mddev->delta_disks)
5102 5103
		err = -EBUSY;
	else if (mddev->persistent &&
5104
	    mddev->major_version == 0)
5105 5106 5107 5108 5109
		err =  -EINVAL;
	else
		mddev->reshape_backwards = backwards;
	mddev_unlock(mddev);
	return err ?: len;
5110 5111 5112 5113 5114 5115
}

static struct md_sysfs_entry md_reshape_direction =
__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
       reshape_direction_store);

static ssize_t
array_size_show(struct mddev *mddev, char *page)
{
	if (mddev->external_size)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->array_sectors/2);
	else
		return sprintf(page, "default\n");
}

static ssize_t
array_size_store(struct mddev *mddev, const char *buf, size_t len)
{
	sector_t sectors;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;

	/* cluster raid doesn't support change array_sectors */
	if (mddev_is_clustered(mddev)) {
		mddev_unlock(mddev);
		return -EINVAL;
	}

	if (strncmp(buf, "default", 7) == 0) {
		if (mddev->pers)
			sectors = mddev->pers->size(mddev, 0, 0);
		else
			sectors = mddev->array_sectors;

		mddev->external_size = 0;
	} else {
		if (strict_blocks_to_sectors(buf, &sectors) < 0)
			err = -EINVAL;
		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
			err = -E2BIG;
		else
			mddev->external_size = 1;
	}

	if (!err) {
		mddev->array_sectors = sectors;
		if (mddev->pers) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
       array_size_store);

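/*
 * consistency_policy reports how the array protects against inconsistency
 * after an unclean shutdown: "journal", "ppl", "bitmap", plain "resync",
 * or "none".  While the array is running it can only be changed through
 * the personality's ->change_consistency_policy() hook; for an inactive
 * external-metadata array, writing "ppl" merely sets the MD_HAS_PPL flag.
 */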
static ssize_t
consistency_policy_show(struct mddev *mddev, char *page)
{
	int ret;

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		ret = sprintf(page, "journal\n");
	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
		ret = sprintf(page, "ppl\n");
	} else if (mddev->bitmap) {
		ret = sprintf(page, "bitmap\n");
	} else if (mddev->pers) {
		if (mddev->pers->sync_request)
			ret = sprintf(page, "resync\n");
		else
			ret = sprintf(page, "none\n");
	} else {
		ret = sprintf(page, "unknown\n");
	}

	return ret;
}

static ssize_t
consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err = 0;

	if (mddev->pers) {
		if (mddev->pers->change_consistency_policy)
			err = mddev->pers->change_consistency_policy(mddev, buf);
		else
			err = -EBUSY;
	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
		set_bit(MD_HAS_PPL, &mddev->flags);
	} else {
		err = -EINVAL;
	}

	return err ? err : len;
}

static struct md_sysfs_entry md_consistency_policy =
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
       consistency_policy_store);

static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d\n", mddev->fail_last_dev);
}

/*
 * Setting fail_last_dev to true allows the last device to be forcibly
 * removed from RAID1/RAID10.
 */
static ssize_t
fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
	int ret;
	bool value;

	ret = kstrtobool(buf, &value);
	if (ret)
		return ret;

	if (value != mddev->fail_last_dev)
		mddev->fail_last_dev = value;

	return len;
}
static struct md_sysfs_entry md_fail_last_dev =
__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
       fail_last_dev_store);

static struct attribute *md_default_attrs[] = {
	&md_level.attr,
	&md_layout.attr,
	&md_raid_disks.attr,
	&md_chunk_size.attr,
	&md_size.attr,
	&md_resync_start.attr,
	&md_metadata.attr,
	&md_new_device.attr,
	&md_safe_delay.attr,
	&md_array_state.attr,
	&md_reshape_position.attr,
	&md_reshape_direction.attr,
	&md_array_size.attr,
	&max_corr_read_errors.attr,
	&md_consistency_policy.attr,
	&md_fail_last_dev.attr,
	NULL,
};

static struct attribute *md_redundancy_attrs[] = {
	&md_scan_mode.attr,
	&md_last_scan_mode.attr,
	&md_mismatches.attr,
	&md_sync_min.attr,
	&md_sync_max.attr,
	&md_sync_speed.attr,
	&md_sync_force_parallel.attr,
	&md_sync_completed.attr,
	&md_min_sync.attr,
	&md_max_sync.attr,
	&md_suspend_lo.attr,
	&md_suspend_hi.attr,
	&md_bitmap.attr,
	&md_degraded.attr,
	NULL,
};
static struct attribute_group md_redundancy_group = {
	.name = NULL,
	.attrs = md_redundancy_attrs,
};

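/*
 * md_attr_show()/md_attr_store() below take a temporary reference on the
 * mddev under all_mddevs_lock before calling the per-attribute handler, so
 * the handler cannot race with the array being freed.  An mddev that has
 * already been removed from all_mddevs is reported as -EBUSY.
 */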
static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->show)
		return -EIO;
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);

	rv = entry->show(mddev, page);
	mddev_put(mddev);
	return rv;
}

static ssize_t
md_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->store)
		return -EIO;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);
	rv = entry->store(mddev, page, length);
	mddev_put(mddev);
	return rv;
}

static void md_free(struct kobject *ko)
{
	struct mddev *mddev = container_of(ko, struct mddev, kobj);

	if (mddev->sysfs_state)
		sysfs_put(mddev->sysfs_state);

	if (mddev->gendisk)
		del_gendisk(mddev->gendisk);
	if (mddev->queue)
		blk_cleanup_queue(mddev->queue);
	if (mddev->gendisk)
		put_disk(mddev->gendisk);
	percpu_ref_exit(&mddev->writes_pending);

	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
	kfree(mddev);
}

static const struct sysfs_ops md_sysfs_ops = {
	.show	= md_attr_show,
	.store	= md_attr_store,
};
static struct kobj_type md_ktype = {
	.release	= md_free,
	.sysfs_ops	= &md_sysfs_ops,
	.default_attrs	= md_default_attrs,
};

int mdp_major = 0;

static void mddev_delayed_delete(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);

	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
	kobject_del(&mddev->kobj);
	kobject_put(&mddev->kobj);
}

static void no_op(struct percpu_ref *r) {}

int mddev_init_writes_pending(struct mddev *mddev)
{
	if (mddev->writes_pending.percpu_count_ptr)
		return 0;
	if (percpu_ref_init(&mddev->writes_pending, no_op,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
		return -ENOMEM;
	/* We want to start with the refcount at zero */
	percpu_ref_put(&mddev->writes_pending);
	return 0;
}
EXPORT_SYMBOL_GPL(mddev_init_writes_pending);

static int md_alloc(dev_t dev, char *name)
{
	/*
	 * If dev is zero, name is the name of a device to allocate with
	 * an arbitrary minor number.  It will be "md_???"
	 * If dev is non-zero it must be a device number with a MAJOR of
	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
	 * the device is being created by opening a node in /dev.
	 * If "name" is not NULL, the device is being created by
	 * writing to /sys/module/md_mod/parameters/new_array.
	 */
	static DEFINE_MUTEX(disks_mutex);
	struct mddev *mddev = mddev_find(dev);
	struct gendisk *disk;
	int partitioned;
	int shift;
	int unit;
	int error;

	if (!mddev)
		return -ENODEV;

	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
	shift = partitioned ? MdpMinorShift : 0;
	unit = MINOR(mddev->unit) >> shift;

	/* wait for any previous instance of this device to be
	 * completely removed (mddev_delayed_delete).
	 */
	flush_workqueue(md_misc_wq);

	mutex_lock(&disks_mutex);
	error = -EEXIST;
	if (mddev->gendisk)
		goto abort;

	if (name && !dev) {
		/* Need to ensure that 'name' is not a duplicate.
		 */
		struct mddev *mddev2;
		spin_lock(&all_mddevs_lock);

		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
			if (mddev2->gendisk &&
			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
				spin_unlock(&all_mddevs_lock);
				goto abort;
			}
		spin_unlock(&all_mddevs_lock);
	}
	if (name && dev)
		/*
		 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
		 */
		mddev->hold_active = UNTIL_STOP;

	error = -ENOMEM;
	mddev->queue = blk_alloc_queue(GFP_KERNEL);
	if (!mddev->queue)
		goto abort;
	mddev->queue->queuedata = mddev;

	blk_queue_make_request(mddev->queue, md_make_request);
	blk_set_stacking_limits(&mddev->queue->limits);

	disk = alloc_disk(1 << shift);
	if (!disk) {
		blk_cleanup_queue(mddev->queue);
		mddev->queue = NULL;
		goto abort;
	}
	disk->major = MAJOR(mddev->unit);
	disk->first_minor = unit << shift;
	if (name)
		strcpy(disk->disk_name, name);
	else if (partitioned)
		sprintf(disk->disk_name, "md_d%d", unit);
	else
		sprintf(disk->disk_name, "md%d", unit);
	disk->fops = &md_fops;
	disk->private_data = mddev;
	disk->queue = mddev->queue;
	blk_queue_write_cache(mddev->queue, true, true);
	/* Allow extended partitions.  This makes the
	 * 'mdp' device redundant, but we can't really
	 * remove it now.
	 */
	disk->flags |= GENHD_FL_EXT_DEVT;
	mddev->gendisk = disk;
	/* As soon as we call add_disk(), another thread could get
	 * through to md_open, so make sure it doesn't get too far
	 */
	mutex_lock(&mddev->open_mutex);
	add_disk(disk);

	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
	if (error) {
		/* This isn't possible, but as kobject_init_and_add is marked
		 * __must_check, we must do something with the result
		 */
		pr_debug("md: cannot register %s/md - name in use\n",
			 disk->disk_name);
		error = 0;
	}
	if (mddev->kobj.sd &&
	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
		pr_debug("pointless warning\n");
	mutex_unlock(&mddev->open_mutex);
 abort:
	mutex_unlock(&disks_mutex);
	if (!error && mddev->kobj.sd) {
		kobject_uevent(&mddev->kobj, KOBJ_ADD);
		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
	}
	mddev_put(mddev);
	return error;
}

static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
	if (create_on_open)
		md_alloc(dev, NULL);
	return NULL;
}

static int add_named_array(const char *val, const struct kernel_param *kp)
{
	/*
	 * val must be "md_*" or "mdNNN".
	 * For "md_*" we allocate an array with a large free minor number, and
	 * set the name to val.  val must not already be an active name.
	 * For "mdNNN" we allocate an array with the minor number NNN
	 * which must not already be in use.
	 */
	int len = strlen(val);
	char buf[DISK_NAME_LEN];
	unsigned long devnum;

	while (len && val[len-1] == '\n')
		len--;
	if (len >= DISK_NAME_LEN)
		return -E2BIG;
	strlcpy(buf, val, len+1);
	if (strncmp(buf, "md_", 3) == 0)
		return md_alloc(0, buf);
	if (strncmp(buf, "md", 2) == 0 &&
	    isdigit(buf[2]) &&
	    kstrtoul(buf+2, 10, &devnum) == 0 &&
	    devnum <= MINORMASK)
		return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);

	return -EINVAL;
}
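/*
 * add_named_array() above is the handler behind the "new_array" module
 * parameter mentioned in md_alloc().  Illustrative use (array names are
 * examples only):
 *     echo md_home > /sys/module/md_mod/parameters/new_array
 *     echo md127 > /sys/module/md_mod/parameters/new_array
 */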

static void md_safemode_timeout(struct timer_list *t)
{
	struct mddev *mddev = from_timer(mddev, t, safemode_timer);

	mddev->safemode = 1;
	if (mddev->external)
		sysfs_notify_dirent_safe(mddev->sysfs_state);

	md_wakeup_thread(mddev->thread);
}

static int start_dirty_degraded;

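/*
 * md_run() takes an assembled array and starts it: it validates the member
 * devices, loads and binds the personality (raid level) module, creates the
 * write-intent bitmap if one is configured, and finally publishes ->pers
 * and wakes the recovery thread.  Errors unwind through the
 * bitmap_abort/abort labels at the bottom.
 */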
int md_run(struct mddev *mddev)
{
	int err;
	struct md_rdev *rdev;
	struct md_personality *pers;

	if (list_empty(&mddev->disks))
		/* cannot run an array with no devices.. */
		return -EINVAL;

	if (mddev->pers)
		return -EBUSY;
	/* Cannot run until previous stop completes properly */
	if (mddev->sysfs_active)
		return -EBUSY;

	/*
	 * Analyze all RAID superblock(s)
	 */
	if (!mddev->raid_disks) {
		if (!mddev->persistent)
			return -EINVAL;
		analyze_sbs(mddev);
	}

	if (mddev->level != LEVEL_NONE)
		request_module("md-level-%d", mddev->level);
	else if (mddev->clevel[0])
		request_module("md-%s", mddev->clevel);

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 */
	mddev->has_superblocks = false;
	rdev_for_each(rdev, mddev) {
		if (test_bit(Faulty, &rdev->flags))
			continue;
		sync_blockdev(rdev->bdev);
		invalidate_bdev(rdev->bdev);
		if (mddev->ro != 1 &&
		    (bdev_read_only(rdev->bdev) ||
		     bdev_read_only(rdev->meta_bdev))) {
			mddev->ro = 1;
			if (mddev->gendisk)
				set_disk_ro(mddev->gendisk, 1);
		}

		if (rdev->sb_page)
			mddev->has_superblocks = true;

		/* perform some consistency tests on the device.
		 * We don't want the data to overlap the metadata,
		 * Internal Bitmap issues have been handled elsewhere.
		 */
		if (rdev->meta_bdev) {
			/* Nothing to check */;
		} else if (rdev->data_offset < rdev->sb_start) {
			if (mddev->dev_sectors &&
			    rdev->data_offset + mddev->dev_sectors
			    > rdev->sb_start) {
				pr_warn("md: %s: data overlaps metadata\n",
					mdname(mddev));
				return -EINVAL;
			}
		} else {
			if (rdev->sb_start + rdev->sb_size/512
			    > rdev->data_offset) {
				pr_warn("md: %s: metadata overlaps data\n",
					mdname(mddev));
				return -EINVAL;
			}
		}
		sysfs_notify_dirent_safe(rdev->sysfs_state);
	}

	if (!bioset_initialized(&mddev->bio_set)) {
		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
		if (err)
			return err;
	}
	if (!bioset_initialized(&mddev->sync_set)) {
		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
		if (err)
			return err;
	}

	spin_lock(&pers_lock);
	pers = find_pers(mddev->level, mddev->clevel);
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
		if (mddev->level != LEVEL_NONE)
			pr_warn("md: personality for level %d is not loaded!\n",
				mddev->level);
		else
			pr_warn("md: personality for level %s is not loaded!\n",
				mddev->clevel);
		err = -EINVAL;
		goto abort;
	}
	spin_unlock(&pers_lock);
	if (mddev->level != pers->level) {
		mddev->level = pers->level;
		mddev->new_level = pers->level;
	}
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));

	if (mddev->reshape_position != MaxSector &&
	    pers->start_reshape == NULL) {
		/* This personality cannot handle reshaping... */
		module_put(pers->owner);
		err = -EINVAL;
		goto abort;
	}

	if (pers->sync_request) {
		/* Warn if this is a potentially silly
		 * configuration.
		 */
		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
		struct md_rdev *rdev2;
		int warned = 0;

		rdev_for_each(rdev, mddev)
			rdev_for_each(rdev2, mddev) {
				if (rdev < rdev2 &&
				    rdev->bdev->bd_contains ==
				    rdev2->bdev->bd_contains) {
					pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
						mdname(mddev),
						bdevname(rdev->bdev,b),
						bdevname(rdev2->bdev,b2));
					warned = 1;
				}
			}

		if (warned)
			pr_warn("True protection against single-disk failure might be compromised.\n");
	}

	mddev->recovery = 0;
	/* may be over-ridden by personality */
	mddev->resync_max_sectors = mddev->dev_sectors;

	mddev->ok_start_degraded = start_dirty_degraded;

	if (start_readonly && mddev->ro == 0)
		mddev->ro = 2; /* read-only, but switch on first write */

	err = pers->run(mddev);
	if (err)
		pr_warn("md: pers->run() failed ...\n");
	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
		WARN_ONCE(!mddev->external_size,
			  "%s: default size too small, but 'external_size' not in effect?\n",
			  __func__);
		pr_warn("md: invalid array_size %llu > default size %llu\n",
			(unsigned long long)mddev->array_sectors / 2,
			(unsigned long long)pers->size(mddev, 0, 0) / 2);
		err = -EINVAL;
	}
	if (err == 0 && pers->sync_request &&
	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
		struct bitmap *bitmap;

		bitmap = md_bitmap_create(mddev, -1);
		if (IS_ERR(bitmap)) {
			err = PTR_ERR(bitmap);
			pr_warn("%s: failed to create bitmap (%d)\n",
				mdname(mddev), err);
		} else
			mddev->bitmap = bitmap;

	}
	if (err)
		goto bitmap_abort;

	if (mddev->bitmap_info.max_write_behind > 0) {
		bool creat_pool = false;

		rdev_for_each(rdev, mddev) {
			if (test_bit(WriteMostly, &rdev->flags) &&
			    rdev_init_wb(rdev))
				creat_pool = true;
		}
		if (creat_pool && mddev->wb_info_pool == NULL) {
			mddev->wb_info_pool =
				mempool_create_kmalloc_pool(NR_WB_INFOS,
						    sizeof(struct wb_info));
			if (!mddev->wb_info_pool) {
				err = -ENOMEM;
				goto bitmap_abort;
			}
		}
	}

	if (mddev->queue) {
		bool nonrot = true;

		rdev_for_each(rdev, mddev) {
			if (rdev->raid_disk >= 0 &&
			    !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
				nonrot = false;
				break;
			}
		}
		if (mddev->degraded)
			nonrot = false;
		if (nonrot)
			blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
		else
			blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
		mddev->queue->backing_dev_info->congested_data = mddev;
		mddev->queue->backing_dev_info->congested_fn = md_congested;
	}
	if (pers->sync_request) {
		if (mddev->kobj.sd &&
		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
			pr_warn("md: cannot register extra attributes for %s\n",
				mdname(mddev));
		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
		mddev->ro = 0;

	atomic_set(&mddev->max_corr_read_errors,
		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
	mddev->safemode = 0;
	if (mddev_is_clustered(mddev))
		mddev->safemode_delay = 0;
	else
		mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
	mddev->in_sync = 1;
	smp_wmb();
	spin_lock(&mddev->lock);
	mddev->pers = pers;
	spin_unlock(&mddev->lock);
	rdev_for_each(rdev, mddev)
		if (rdev->raid_disk >= 0)
			sysfs_link_rdev(mddev, rdev); /* failure here is OK */

	if (mddev->degraded && !mddev->ro)
		/* This ensures that recovering status is reported immediately
		 * via sysfs - until a lack of spares is confirmed.
		 */
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

	if (mddev->sb_flags)
		md_update_sb(mddev, 0);

	md_new_event(mddev);
	return 0;

bitmap_abort:
	mddev_detach(mddev);
	if (mddev->private)
		pers->free(mddev, mddev->private);
	mddev->private = NULL;
	module_put(pers->owner);
	md_bitmap_destroy(mddev);
abort:
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
	return err;
}
EXPORT_SYMBOL_GPL(md_run);
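/*
 * do_md_run() wraps md_run() for arrays started via the ioctl/sysfs paths:
 * MD_NOT_READY marks the window in which the array is not yet fully set up,
 * and is cleared only after the bitmap has been loaded and the capacity
 * change and KOBJ_CHANGE uevent below have been announced.
 */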
static int do_md_run(struct mddev *mddev)
{
	int err;

	set_bit(MD_NOT_READY, &mddev->flags);
	err = md_run(mddev);
	if (err)
		goto out;
	err = md_bitmap_load(mddev);
	if (err) {
		md_bitmap_destroy(mddev);
		goto out;
	}

	if (mddev_is_clustered(mddev))
		md_allow_write(mddev);

	/* run start up tasks that require md_thread */
	md_start(mddev);

	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

	set_capacity(mddev->gendisk, mddev->array_sectors);
	revalidate_disk(mddev->gendisk);
	clear_bit(MD_NOT_READY, &mddev->flags);
	mddev->changed = 1;
	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	sysfs_notify(&mddev->kobj, NULL, "degraded");
out:
	clear_bit(MD_NOT_READY, &mddev->flags);
	return err;
}

int md_start(struct mddev *mddev)
{
	int ret = 0;

	if (mddev->pers->start) {
		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		ret = mddev->pers->start(mddev);
		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
		md_wakeup_thread(mddev->sync_thread);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(md_start);

5877
static int restart_array(struct mddev *mddev)
L
Linus Torvalds 已提交
5878 5879
{
	struct gendisk *disk = mddev->gendisk;
5880 5881 5882
	struct md_rdev *rdev;
	bool has_journal = false;
	bool has_readonly = false;
L
Linus Torvalds 已提交
5883

A
Andre Noll 已提交
5884
	/* Complain if it has no devices */
L
Linus Torvalds 已提交
5885
	if (list_empty(&mddev->disks))
A
Andre Noll 已提交
5886 5887 5888 5889 5890
		return -ENXIO;
	if (!mddev->pers)
		return -EINVAL;
	if (!mddev->ro)
		return -EBUSY;
5891

5892 5893 5894 5895 5896 5897 5898 5899 5900 5901
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		if (test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			has_journal = true;
		if (bdev_read_only(rdev->bdev))
			has_readonly = true;
	}
	rcu_read_unlock();
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
5902 5903
		/* Don't restart rw with journal missing/faulty */
			return -EINVAL;
5904 5905
	if (has_readonly)
		return -EROFS;
5906

A
Andre Noll 已提交
5907 5908 5909
	mddev->safemode = 0;
	mddev->ro = 0;
	set_disk_ro(disk, 0);
5910
	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
A
Andre Noll 已提交
5911 5912 5913 5914
	/* Kick recovery or resync if necessary */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread);
N
NeilBrown 已提交
5915
	sysfs_notify_dirent_safe(mddev->sysfs_state);
A
Andre Noll 已提交
5916
	return 0;
L
Linus Torvalds 已提交
5917 5918
}

5919
static void md_clean(struct mddev *mddev)
N
NeilBrown 已提交
5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933
{
	mddev->array_sectors = 0;
	mddev->external_size = 0;
	mddev->dev_sectors = 0;
	mddev->raid_disks = 0;
	mddev->recovery_cp = 0;
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->reshape_position = MaxSector;
	mddev->external = 0;
	mddev->persistent = 0;
	mddev->level = LEVEL_NONE;
	mddev->clevel[0] = 0;
	mddev->flags = 0;
5934
	mddev->sb_flags = 0;
N
NeilBrown 已提交
5935 5936 5937 5938 5939 5940 5941
	mddev->ro = 0;
	mddev->metadata_type[0] = 0;
	mddev->chunk_sectors = 0;
	mddev->ctime = mddev->utime = 0;
	mddev->layout = 0;
	mddev->max_disks = 0;
	mddev->events = 0;
5942
	mddev->can_decrease_events = 0;
N
NeilBrown 已提交
5943
	mddev->delta_disks = 0;
5944
	mddev->reshape_backwards = 0;
N
NeilBrown 已提交
5945 5946 5947 5948
	mddev->new_level = LEVEL_NONE;
	mddev->new_layout = 0;
	mddev->new_chunk_sectors = 0;
	mddev->curr_resync = 0;
5949
	atomic64_set(&mddev->resync_mismatches, 0);
N
NeilBrown 已提交
5950 5951 5952 5953
	mddev->suspend_lo = mddev->suspend_hi = 0;
	mddev->sync_speed_min = mddev->sync_speed_max = 0;
	mddev->recovery = 0;
	mddev->in_sync = 0;
5954
	mddev->changed = 0;
N
NeilBrown 已提交
5955 5956
	mddev->degraded = 0;
	mddev->safemode = 0;
5957
	mddev->private = NULL;
5958
	mddev->cluster_info = NULL;
N
NeilBrown 已提交
5959 5960
	mddev->bitmap_info.offset = 0;
	mddev->bitmap_info.default_offset = 0;
5961
	mddev->bitmap_info.default_space = 0;
N
NeilBrown 已提交
5962 5963 5964
	mddev->bitmap_info.chunksize = 0;
	mddev->bitmap_info.daemon_sleep = 0;
	mddev->bitmap_info.max_write_behind = 0;
5965
	mddev->bitmap_info.nodes = 0;
N
NeilBrown 已提交
5966 5967
}

5968
static void __md_stop_writes(struct mddev *mddev)
5969
{
5970
	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5971
	flush_workqueue(md_misc_wq);
5972 5973
	if (mddev->sync_thread) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5974
		md_reap_sync_thread(mddev);
5975 5976 5977 5978
	}

	del_timer_sync(&mddev->safemode_timer);

5979 5980 5981 5982
	if (mddev->pers && mddev->pers->quiesce) {
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
5983
	md_bitmap_flush(mddev);
5984

5985
	if (mddev->ro == 0 &&
5986
	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
5987
	     mddev->sb_flags)) {
5988
		/* mark array as shutdown cleanly */
5989 5990
		if (!mddev_is_clustered(mddev))
			mddev->in_sync = 1;
5991 5992
		md_update_sb(mddev, 1);
	}
5993 5994
	mempool_destroy(mddev->wb_info_pool);
	mddev->wb_info_pool = NULL;
5995
}
5996

5997
void md_stop_writes(struct mddev *mddev)
5998
{
5999
	mddev_lock_nointr(mddev);
6000 6001 6002
	__md_stop_writes(mddev);
	mddev_unlock(mddev);
}
6003
EXPORT_SYMBOL_GPL(md_stop_writes);
6004

6005 6006
static void mddev_detach(struct mddev *mddev)
{
6007
	md_bitmap_wait_behind_writes(mddev);
6008
	if (mddev->pers && mddev->pers->quiesce) {
6009 6010 6011 6012 6013 6014 6015 6016
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	md_unregister_thread(&mddev->thread);
	if (mddev->queue)
		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
}

6017
static void __md_stop(struct mddev *mddev)
N
NeilBrown 已提交
6018
{
6019
	struct md_personality *pers = mddev->pers;
6020
	md_bitmap_destroy(mddev);
6021
	mddev_detach(mddev);
6022 6023
	/* Ensure ->event_work is done */
	flush_workqueue(md_misc_wq);
6024
	spin_lock(&mddev->lock);
N
NeilBrown 已提交
6025
	mddev->pers = NULL;
6026 6027
	spin_unlock(&mddev->lock);
	pers->free(mddev, mddev->private);
6028
	mddev->private = NULL;
6029 6030 6031
	if (pers->sync_request && mddev->to_remove == NULL)
		mddev->to_remove = &md_redundancy_group;
	module_put(pers->owner);
N
NeilBrown 已提交
6032
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
J
Jack Wang 已提交
6033 6034 6035 6036 6037 6038 6039 6040
}

void md_stop(struct mddev *mddev)
{
	/* stop the array and free an attached data structures.
	 * This is called from dm-raid
	 */
	__md_stop(mddev);
6041 6042
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
6043 6044
}

6045
EXPORT_SYMBOL_GPL(md_stop);
N
NeilBrown 已提交
6046

6047
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
6048 6049
{
	int err = 0;
6050 6051 6052 6053 6054 6055 6056
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
6057
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6058
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6059
	if (mddev->sync_thread)
6060 6061 6062
		/* Thread might be blocked waiting for metadata update
		 * which will now never happen */
		wake_up_process(mddev->sync_thread->tsk);
6063

6064
	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6065
		return -EBUSY;
6066
	mddev_unlock(mddev);
6067 6068
	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
					  &mddev->recovery));
6069
	wait_event(mddev->sb_wait,
6070
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6071 6072
	mddev_lock_nointr(mddev);

6073
	mutex_lock(&mddev->open_mutex);
6074
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6075
	    mddev->sync_thread ||
6076
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6077
		pr_warn("md: %s still in use.\n",mdname(mddev));
6078 6079
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6080
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6081 6082
			md_wakeup_thread(mddev->thread);
		}
6083 6084 6085 6086
		err = -EBUSY;
		goto out;
	}
	if (mddev->pers) {
6087
		__md_stop_writes(mddev);
6088 6089 6090 6091 6092 6093 6094

		err  = -ENXIO;
		if (mddev->ro==1)
			goto out;
		mddev->ro = 1;
		set_disk_ro(mddev->gendisk, 1);
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6095 6096
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
N
NeilBrown 已提交
6097
		sysfs_notify_dirent_safe(mddev->sysfs_state);
6098
		err = 0;
6099 6100 6101 6102 6103 6104
	}
out:
	mutex_unlock(&mddev->open_mutex);
	return err;
}

6105 6106 6107 6108
/* mode:
 *   0 - completely stop and dis-assemble array
 *   2 - stop but do not disassemble array
 */
6109
static int do_md_stop(struct mddev *mddev, int mode,
6110
		      struct block_device *bdev)
L
Linus Torvalds 已提交
6111 6112
{
	struct gendisk *disk = mddev->gendisk;
6113
	struct md_rdev *rdev;
6114 6115 6116 6117 6118 6119 6120
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
6121
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6122
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6123
	if (mddev->sync_thread)
6124 6125 6126
		/* Thread might be blocked waiting for metadata update
		 * which will now never happen */
		wake_up_process(mddev->sync_thread->tsk);
6127

6128
	mddev_unlock(mddev);
6129 6130 6131
	wait_event(resync_wait, (mddev->sync_thread == NULL &&
				 !test_bit(MD_RECOVERY_RUNNING,
					   &mddev->recovery)));
6132
	mddev_lock_nointr(mddev);
L
Linus Torvalds 已提交
6133

N
NeilBrown 已提交
6134
	mutex_lock(&mddev->open_mutex);
6135
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6136 6137
	    mddev->sysfs_active ||
	    mddev->sync_thread ||
6138
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6139
		pr_warn("md: %s still in use.\n",mdname(mddev));
N
NeilBrown 已提交
6140
		mutex_unlock(&mddev->open_mutex);
6141 6142
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6143
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6144 6145
			md_wakeup_thread(mddev->thread);
		}
6146 6147
		return -EBUSY;
	}
N
NeilBrown 已提交
6148
	if (mddev->pers) {
6149 6150
		if (mddev->ro)
			set_disk_ro(disk, 0);
6151

6152
		__md_stop_writes(mddev);
6153
		__md_stop(mddev);
6154
		mddev->queue->backing_dev_info->congested_fn = NULL;
N
NeilBrown 已提交
6155

6156
		/* tell userspace to handle 'inactive' */
N
NeilBrown 已提交
6157
		sysfs_notify_dirent_safe(mddev->sysfs_state);
6158

N
NeilBrown 已提交
6159
		rdev_for_each(rdev, mddev)
6160 6161
			if (rdev->raid_disk >= 0)
				sysfs_unlink_rdev(mddev, rdev);
6162

6163
		set_capacity(disk, 0);
N
NeilBrown 已提交
6164
		mutex_unlock(&mddev->open_mutex);
6165
		mddev->changed = 1;
6166
		revalidate_disk(disk);
6167

6168 6169
		if (mddev->ro)
			mddev->ro = 0;
N
NeilBrown 已提交
6170 6171
	} else
		mutex_unlock(&mddev->open_mutex);
L
Linus Torvalds 已提交
6172 6173 6174
	/*
	 * Free resources if final stop
	 */
6175
	if (mode == 0) {
6176
		pr_info("md: %s stopped.\n", mdname(mddev));
L
Linus Torvalds 已提交
6177

6178
		if (mddev->bitmap_info.file) {
6179 6180
			struct file *f = mddev->bitmap_info.file;
			spin_lock(&mddev->lock);
6181
			mddev->bitmap_info.file = NULL;
6182 6183
			spin_unlock(&mddev->lock);
			fput(f);
6184
		}
6185
		mddev->bitmap_info.offset = 0;
6186

L
Linus Torvalds 已提交
6187 6188
		export_array(mddev);

N
NeilBrown 已提交
6189
		md_clean(mddev);
6190 6191
		if (mddev->hold_active == UNTIL_STOP)
			mddev->hold_active = 0;
6192
	}
6193
	md_new_event(mddev);
N
NeilBrown 已提交
6194
	sysfs_notify_dirent_safe(mddev->sysfs_state);
N
NeilBrown 已提交
6195
	return 0;
L
Linus Torvalds 已提交
6196 6197
}

J
Jeff Garzik 已提交
6198
#ifndef MODULE
6199
static void autorun_array(struct mddev *mddev)
L
Linus Torvalds 已提交
6200
{
6201
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6202 6203
	int err;

6204
	if (list_empty(&mddev->disks))
L
Linus Torvalds 已提交
6205 6206
		return;

6207
	pr_info("md: running: ");
L
Linus Torvalds 已提交
6208

N
NeilBrown 已提交
6209
	rdev_for_each(rdev, mddev) {
L
Linus Torvalds 已提交
6210
		char b[BDEVNAME_SIZE];
6211
		pr_cont("<%s>", bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
6212
	}
6213
	pr_cont("\n");
L
Linus Torvalds 已提交
6214

6215
	err = do_md_run(mddev);
L
Linus Torvalds 已提交
6216
	if (err) {
6217
		pr_warn("md: do_md_run() returned %d\n", err);
6218
		do_md_stop(mddev, 0, NULL);
L
Linus Torvalds 已提交
6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235
	}
}

/*
 * lets try to run arrays based on all disks that have arrived
 * until now. (those are in pending_raid_disks)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(int part)
{
6236
	struct md_rdev *rdev0, *rdev, *tmp;
6237
	struct mddev *mddev;
L
Linus Torvalds 已提交
6238 6239
	char b[BDEVNAME_SIZE];

6240
	pr_info("md: autorun ...\n");
L
Linus Torvalds 已提交
6241
	while (!list_empty(&pending_raid_disks)) {
6242
		int unit;
L
Linus Torvalds 已提交
6243
		dev_t dev;
6244
		LIST_HEAD(candidates);
L
Linus Torvalds 已提交
6245
		rdev0 = list_entry(pending_raid_disks.next,
6246
					 struct md_rdev, same_set);
L
Linus Torvalds 已提交
6247

6248
		pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
L
Linus Torvalds 已提交
6249
		INIT_LIST_HEAD(&candidates);
6250
		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
L
Linus Torvalds 已提交
6251
			if (super_90_load(rdev, rdev0, 0) >= 0) {
6252 6253
				pr_debug("md:  adding %s ...\n",
					 bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
6254 6255 6256 6257 6258 6259 6260
				list_move(&rdev->same_set, &candidates);
			}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
6261 6262 6263 6264 6265 6266 6267 6268 6269
		if (part) {
			dev = MKDEV(mdp_major,
				    rdev0->preferred_minor << MdpMinorShift);
			unit = MINOR(dev) >> MdpMinorShift;
		} else {
			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
			unit = MINOR(dev);
		}
		if (rdev0->preferred_minor != unit) {
6270 6271
			pr_warn("md: unit number in %s is bad: %d\n",
				bdevname(rdev0->bdev, b), rdev0->preferred_minor);
L
Linus Torvalds 已提交
6272 6273 6274 6275 6276
			break;
		}

		md_probe(dev, NULL, NULL);
		mddev = mddev_find(dev);
N
Neil Brown 已提交
6277 6278 6279
		if (!mddev || !mddev->gendisk) {
			if (mddev)
				mddev_put(mddev);
L
Linus Torvalds 已提交
6280 6281
			break;
		}
6282
		if (mddev_lock(mddev))
6283
			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
L
Linus Torvalds 已提交
6284 6285
		else if (mddev->raid_disks || mddev->major_version
			 || !list_empty(&mddev->disks)) {
6286
			pr_warn("md: %s already running, cannot run %s\n",
L
Linus Torvalds 已提交
6287 6288 6289
				mdname(mddev), bdevname(rdev0->bdev,b));
			mddev_unlock(mddev);
		} else {
6290
			pr_debug("md: created %s\n", mdname(mddev));
6291
			mddev->persistent = 1;
6292
			rdev_for_each_list(rdev, tmp, &candidates) {
L
Linus Torvalds 已提交
6293 6294 6295 6296 6297 6298 6299 6300 6301 6302
				list_del_init(&rdev->same_set);
				if (bind_rdev_to_array(rdev, mddev))
					export_rdev(rdev);
			}
			autorun_array(mddev);
			mddev_unlock(mddev);
		}
		/* on success, candidates will be empty, on error
		 * it won't...
		 */
6303
		rdev_for_each_list(rdev, tmp, &candidates) {
6304
			list_del_init(&rdev->same_set);
L
Linus Torvalds 已提交
6305
			export_rdev(rdev);
6306
		}
L
Linus Torvalds 已提交
6307 6308
		mddev_put(mddev);
	}
6309
	pr_info("md: ... autorun DONE.\n");
L
Linus Torvalds 已提交
6310
}
J
Jeff Garzik 已提交
6311
#endif /* !MODULE */
L
Linus Torvalds 已提交
6312

6313
static int get_version(void __user *arg)
L
Linus Torvalds 已提交
6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}

6327
static int get_array_info(struct mddev *mddev, void __user *arg)
L
Linus Torvalds 已提交
6328 6329
{
	mdu_array_info_t info;
6330
	int nr,working,insync,failed,spare;
6331
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6332

6333 6334 6335
	nr = working = insync = failed = spare = 0;
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
L
Linus Torvalds 已提交
6336
		nr++;
6337
		if (test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
6338 6339 6340
			failed++;
		else {
			working++;
6341
			if (test_bit(In_sync, &rdev->flags))
6342
				insync++;
6343 6344 6345
			else if (test_bit(Journal, &rdev->flags))
				/* TODO: add journal count to md_u.h */
				;
L
Linus Torvalds 已提交
6346 6347 6348 6349
			else
				spare++;
		}
	}
6350
	rcu_read_unlock();
L
Linus Torvalds 已提交
6351 6352 6353 6354

	info.major_version = mddev->major_version;
	info.minor_version = mddev->minor_version;
	info.patch_version = MD_PATCHLEVEL_VERSION;
6355
	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
L
Linus Torvalds 已提交
6356
	info.level         = mddev->level;
A
Andre Noll 已提交
6357 6358
	info.size          = mddev->dev_sectors / 2;
	if (info.size != mddev->dev_sectors / 2) /* overflow */
6359
		info.size = -1;
L
Linus Torvalds 已提交
6360 6361 6362 6363 6364
	info.nr_disks      = nr;
	info.raid_disks    = mddev->raid_disks;
	info.md_minor      = mddev->md_minor;
	info.not_persistent= !mddev->persistent;

6365
	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
L
Linus Torvalds 已提交
6366 6367 6368
	info.state         = 0;
	if (mddev->in_sync)
		info.state = (1<<MD_SB_CLEAN);
6369
	if (mddev->bitmap && mddev->bitmap_info.offset)
6370
		info.state |= (1<<MD_SB_BITMAP_PRESENT);
6371 6372
	if (mddev_is_clustered(mddev))
		info.state |= (1<<MD_SB_CLUSTERED);
6373
	info.active_disks  = insync;
L
Linus Torvalds 已提交
6374 6375 6376 6377 6378
	info.working_disks = working;
	info.failed_disks  = failed;
	info.spare_disks   = spare;

	info.layout        = mddev->layout;
6379
	info.chunk_size    = mddev->chunk_sectors << 9;
L
Linus Torvalds 已提交
6380 6381 6382 6383 6384 6385 6386

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

6387
static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6388 6389
{
	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6390
	char *ptr;
6391
	int err;
6392

6393
	file = kzalloc(sizeof(*file), GFP_NOIO);
6394
	if (!file)
6395
		return -ENOMEM;
6396

6397 6398
	err = 0;
	spin_lock(&mddev->lock);
6399 6400 6401 6402 6403 6404 6405 6406 6407 6408
	/* bitmap enabled */
	if (mddev->bitmap_info.file) {
		ptr = file_path(mddev->bitmap_info.file, file->pathname,
				sizeof(file->pathname));
		if (IS_ERR(ptr))
			err = PTR_ERR(ptr);
		else
			memmove(file->pathname, ptr,
				sizeof(file->pathname)-(ptr-file->pathname));
	}
6409
	spin_unlock(&mddev->lock);
6410

6411 6412
	if (err == 0 &&
	    copy_to_user(arg, file, sizeof(*file)))
6413
		err = -EFAULT;
6414

6415 6416 6417 6418
	kfree(file);
	return err;
}

6419
static int get_disk_info(struct mddev *mddev, void __user * arg)
L
Linus Torvalds 已提交
6420 6421
{
	mdu_disk_info_t info;
6422
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6423 6424 6425 6426

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

6427
	rcu_read_lock();
6428
	rdev = md_find_rdev_nr_rcu(mddev, info.number);
L
Linus Torvalds 已提交
6429 6430 6431 6432 6433
	if (rdev) {
		info.major = MAJOR(rdev->bdev->bd_dev);
		info.minor = MINOR(rdev->bdev->bd_dev);
		info.raid_disk = rdev->raid_disk;
		info.state = 0;
6434
		if (test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
6435
			info.state |= (1<<MD_DISK_FAULTY);
6436
		else if (test_bit(In_sync, &rdev->flags)) {
L
Linus Torvalds 已提交
6437 6438 6439
			info.state |= (1<<MD_DISK_ACTIVE);
			info.state |= (1<<MD_DISK_SYNC);
		}
S
Shaohua Li 已提交
6440
		if (test_bit(Journal, &rdev->flags))
6441
			info.state |= (1<<MD_DISK_JOURNAL);
6442 6443
		if (test_bit(WriteMostly, &rdev->flags))
			info.state |= (1<<MD_DISK_WRITEMOSTLY);
6444 6445
		if (test_bit(FailFast, &rdev->flags))
			info.state |= (1<<MD_DISK_FAILFAST);
L
Linus Torvalds 已提交
6446 6447 6448 6449 6450
	} else {
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	}
6451
	rcu_read_unlock();
L
Linus Torvalds 已提交
6452 6453 6454 6455 6456 6457 6458

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

6459
static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
L
Linus Torvalds 已提交
6460 6461
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
6462
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6463 6464
	dev_t dev = MKDEV(info->major,info->minor);

6465 6466
	if (mddev_is_clustered(mddev) &&
		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6467 6468
		pr_warn("%s: Cannot add to clustered mddev.\n",
			mdname(mddev));
6469 6470 6471
		return -EINVAL;
	}

L
Linus Torvalds 已提交
6472 6473 6474 6475 6476 6477 6478 6479
	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
6480
			pr_warn("md: md_import_device returned %ld\n",
L
Linus Torvalds 已提交
6481 6482 6483 6484
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
6485 6486 6487
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
6488
			err = super_types[mddev->major_version]
L
Linus Torvalds 已提交
6489 6490
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
6491
				pr_warn("md: %s has different UUID to %s\n",
6492
					bdevname(rdev->bdev,b),
L
Linus Torvalds 已提交
6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares".  They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
6512 6513
			pr_warn("%s: personality does not support diskops!\n",
				mdname(mddev));
L
Linus Torvalds 已提交
6514 6515
			return -EINVAL;
		}
6516 6517 6518 6519 6520
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
L
Linus Torvalds 已提交
6521
		if (IS_ERR(rdev)) {
6522
			pr_warn("md: md_import_device returned %ld\n",
L
Linus Torvalds 已提交
6523 6524 6525
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
6526
		/* set saved_raid_disk if appropriate */
6527 6528
		if (!mddev->persistent) {
			if (info->state & (1<<MD_DISK_SYNC)  &&
6529
			    info->raid_disk < mddev->raid_disks) {
6530
				rdev->raid_disk = info->raid_disk;
6531
				set_bit(In_sync, &rdev->flags);
6532
				clear_bit(Bitmap_sync, &rdev->flags);
6533
			} else
6534
				rdev->raid_disk = -1;
6535
			rdev->saved_raid_disk = rdev->raid_disk;
6536 6537 6538
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
6539
		if ((info->state & (1<<MD_DISK_SYNC)) &&
6540
		     rdev->raid_disk != info->raid_disk) {
6541 6542 6543 6544 6545 6546 6547
			/* This was a hot-add request, but events doesn't
			 * match, so reject it.
			 */
			export_rdev(rdev);
			return -EINVAL;
		}

6548
		clear_bit(In_sync, &rdev->flags); /* just to be sure */
6549 6550
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
6551 6552
		else
			clear_bit(WriteMostly, &rdev->flags);
6553 6554 6555 6556
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
		else
			clear_bit(FailFast, &rdev->flags);
6557

6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568
		if (info->state & (1<<MD_DISK_JOURNAL)) {
			struct md_rdev *rdev2;
			bool has_journal = false;

			/* make sure no existing journal disk */
			rdev_for_each(rdev2, mddev) {
				if (test_bit(Journal, &rdev2->flags)) {
					has_journal = true;
					break;
				}
			}
6569
			if (has_journal || mddev->bitmap) {
6570 6571 6572
				export_rdev(rdev);
				return -EBUSY;
			}
6573
			set_bit(Journal, &rdev->flags);
6574
		}
6575 6576 6577 6578
		/*
		 * check whether the device shows up in other nodes
		 */
		if (mddev_is_clustered(mddev)) {
6579
			if (info->state & (1 << MD_DISK_CANDIDATE))
6580
				set_bit(Candidate, &rdev->flags);
6581
			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6582
				/* --add initiated by this node */
6583
				err = md_cluster_ops->add_new_disk(mddev, rdev);
6584 6585 6586 6587 6588 6589 6590
				if (err) {
					export_rdev(rdev);
					return err;
				}
			}
		}

L
Linus Torvalds 已提交
6591 6592
		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
6593

L
Linus Torvalds 已提交
6594 6595
		if (err)
			export_rdev(rdev);
6596 6597

		if (mddev_is_clustered(mddev)) {
6598 6599 6600 6601 6602 6603 6604 6605
			if (info->state & (1 << MD_DISK_CANDIDATE)) {
				if (!err) {
					err = md_cluster_ops->new_disk_ack(mddev,
						err == 0);
					if (err)
						md_kick_rdev_from_array(rdev);
				}
			} else {
6606 6607 6608 6609 6610 6611 6612
				if (err)
					md_cluster_ops->add_new_disk_cancel(mddev);
				else
					err = add_bound_rdev(rdev);
			}

		} else if (!err)
G
Goldwyn Rodrigues 已提交
6613
			err = add_bound_rdev(rdev);
6614

L
Linus Torvalds 已提交
6615 6616 6617 6618 6619 6620 6621
		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
6622
		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
L
Linus Torvalds 已提交
6623 6624 6625 6626 6627
		return -EINVAL;
	}

	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
6628
		rdev = md_import_device(dev, -1, 0);
L
Linus Torvalds 已提交
6629
		if (IS_ERR(rdev)) {
6630
			pr_warn("md: error, md_import_device() returned %ld\n",
L
Linus Torvalds 已提交
6631 6632 6633 6634 6635 6636 6637 6638 6639 6640
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
6641 6642
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
6643

6644 6645
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
6646 6647
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
6648

L
Linus Torvalds 已提交
6649
		if (!mddev->persistent) {
6650
			pr_debug("md: nonpersistent superblock ...\n");
6651 6652
			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
		} else
6653
			rdev->sb_start = calc_dev_sboffset(rdev);
6654
		rdev->sectors = rdev->sb_start;
L
Linus Torvalds 已提交
6655

6656 6657 6658 6659 6660
		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}
L
Linus Torvalds 已提交
6661 6662 6663 6664 6665
	}

	return 0;
}

6666
static int hot_remove_disk(struct mddev *mddev, dev_t dev)
L
Linus Torvalds 已提交
6667 6668
{
	char b[BDEVNAME_SIZE];
6669
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6670

6671 6672 6673
	if (!mddev->pers)
		return -ENODEV;

L
Linus Torvalds 已提交
6674 6675 6676 6677
	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

6678 6679
	if (rdev->raid_disk < 0)
		goto kick_rdev;
6680

6681 6682 6683
	clear_bit(Blocked, &rdev->flags);
	remove_and_add_spares(mddev, rdev);

L
Linus Torvalds 已提交
6684 6685 6686
	if (rdev->raid_disk >= 0)
		goto busy;

6687
kick_rdev:
6688
	if (mddev_is_clustered(mddev))
6689 6690
		md_cluster_ops->remove_disk(mddev, rdev);

6691
	md_kick_rdev_from_array(rdev);
6692
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6693 6694 6695 6696
	if (mddev->thread)
		md_wakeup_thread(mddev->thread);
	else
		md_update_sb(mddev, 1);
6697
	md_new_event(mddev);
L
Linus Torvalds 已提交
6698 6699 6700

	return 0;
busy:
6701 6702
	pr_debug("md: cannot remove active disk %s from %s ...\n",
		 bdevname(rdev->bdev,b), mdname(mddev));
L
Linus Torvalds 已提交
6703 6704 6705
	return -EBUSY;
}

6706
static int hot_add_disk(struct mddev *mddev, dev_t dev)
L
Linus Torvalds 已提交
6707 6708 6709
{
	char b[BDEVNAME_SIZE];
	int err;
6710
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6711 6712 6713 6714 6715

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
6716
		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
L
Linus Torvalds 已提交
6717 6718 6719 6720
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
6721
		pr_warn("%s: personality does not support diskops!\n",
L
Linus Torvalds 已提交
6722 6723 6724 6725
			mdname(mddev));
		return -EINVAL;
	}

6726
	rdev = md_import_device(dev, -1, 0);
L
Linus Torvalds 已提交
6727
	if (IS_ERR(rdev)) {
6728
		pr_warn("md: error, md_import_device() returned %ld\n",
L
Linus Torvalds 已提交
6729 6730 6731 6732 6733
			PTR_ERR(rdev));
		return -EINVAL;
	}

	if (mddev->persistent)
6734
		rdev->sb_start = calc_dev_sboffset(rdev);
L
Linus Torvalds 已提交
6735
	else
6736
		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
L
Linus Torvalds 已提交
6737

6738
	rdev->sectors = rdev->sb_start;
L
Linus Torvalds 已提交
6739

6740
	if (test_bit(Faulty, &rdev->flags)) {
6741
		pr_warn("md: can not hot-add faulty %s disk to %s!\n",
L
Linus Torvalds 已提交
6742 6743 6744 6745
			bdevname(rdev->bdev,b), mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}
6746

6747
	clear_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
6748
	rdev->desc_nr = -1;
6749
	rdev->saved_raid_disk = -1;
6750 6751
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
6752
		goto abort_export;
L
Linus Torvalds 已提交
6753 6754 6755 6756 6757 6758 6759 6760

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */

	rdev->raid_disk = -1;

6761
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6762 6763
	if (!mddev->thread)
		md_update_sb(mddev, 1);
L
Linus Torvalds 已提交
6764 6765 6766 6767 6768 6769
	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
6770
	md_new_event(mddev);
L
Linus Torvalds 已提交
6771 6772 6773 6774 6775 6776 6777
	return 0;

abort_export:
	export_rdev(rdev);
	return err;
}

6778
static int set_bitmap_file(struct mddev *mddev, int fd)
6779
{
6780
	int err = 0;
6781

6782
	if (mddev->pers) {
6783
		if (!mddev->pers->quiesce || !mddev->thread)
6784 6785 6786 6787 6788
			return -EBUSY;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		/* we should be able to change the bitmap.. */
	}
6789

6790
	if (fd >= 0) {
6791
		struct inode *inode;
N
NeilBrown 已提交
6792 6793 6794
		struct file *f;

		if (mddev->bitmap || mddev->bitmap_info.file)
6795
			return -EEXIST; /* cannot add when bitmap is present */
N
NeilBrown 已提交
6796
		f = fget(fd);
6797

N
NeilBrown 已提交
6798
		if (f == NULL) {
6799 6800
			pr_warn("%s: error: failed to get bitmap file\n",
				mdname(mddev));
6801 6802 6803
			return -EBADF;
		}

N
NeilBrown 已提交
6804
		inode = f->f_mapping->host;
6805
		if (!S_ISREG(inode->i_mode)) {
6806 6807
			pr_warn("%s: error: bitmap file must be a regular file\n",
				mdname(mddev));
6808
			err = -EBADF;
N
NeilBrown 已提交
6809
		} else if (!(f->f_mode & FMODE_WRITE)) {
6810 6811
			pr_warn("%s: error: bitmap file must open for write\n",
				mdname(mddev));
6812 6813
			err = -EBADF;
		} else if (atomic_read(&inode->i_writecount) != 1) {
6814 6815
			pr_warn("%s: error: bitmap file is already in use\n",
				mdname(mddev));
6816 6817 6818
			err = -EBUSY;
		}
		if (err) {
N
NeilBrown 已提交
6819
			fput(f);
6820 6821
			return err;
		}
N
NeilBrown 已提交
6822
		mddev->bitmap_info.file = f;
6823
		mddev->bitmap_info.offset = 0; /* file overrides offset */
6824 6825 6826 6827
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
6828
		if (fd >= 0) {
6829 6830
			struct bitmap *bitmap;

6831
			bitmap = md_bitmap_create(mddev, -1);
6832
			mddev_suspend(mddev);
6833 6834
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
6835
				err = md_bitmap_load(mddev);
6836 6837
			} else
				err = PTR_ERR(bitmap);
6838
			if (err) {
6839
				md_bitmap_destroy(mddev);
6840 6841
				fd = -1;
			}
6842
			mddev_resume(mddev);
6843
		} else if (fd < 0) {
6844
			mddev_suspend(mddev);
6845
			md_bitmap_destroy(mddev);
6846
			mddev_resume(mddev);
6847 6848 6849
		}
	}
	if (fd < 0) {
6850 6851 6852 6853 6854 6855 6856
		struct file *f = mddev->bitmap_info.file;
		if (f) {
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
6857 6858
	}

6859 6860 6861
	return err;
}

/*
 * set_array_info is used two different ways
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent, layout, chunksize determine the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= ARRAY_SIZE(super_types) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			pr_warn("md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		mddev->persistent = !info->not_persistent;
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
		 */
		mddev->ctime         = ktime_get_real_seconds();
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime         = ktime_get_real_seconds();

	mddev->level         = info->level;
	mddev->clevel[0]     = 0;
	mddev->dev_sectors   = 2 * (sector_t)info->size;
	mddev->raid_disks    = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent    = !info->not_persistent;
	mddev->external	     = 0;

	mddev->layout        = info->layout;
	if (mddev->level == 0)
		/* Cannot trust RAID0 layout info here */
		mddev->layout = -1;
	mddev->chunk_sectors = info->chunk_size >> 9;

	if (mddev->persistent) {
		mddev->max_disks = MD_SB_DISKS;
		mddev->flags = 0;
		mddev->sb_flags = 0;
	}
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
	mddev->bitmap_info.offset = 0;

	mddev->reshape_position = MaxSector;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	mddev->new_level = mddev->level;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;

	return 0;
}

void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	if (mddev->external_size)
		return;

	mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);

static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	struct md_rdev *rdev;
	int rv;
	int fit = (num_sectors == 0);
	sector_t old_dev_sectors = mddev->dev_sectors;

	if (mddev->pers->resize == NULL)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used.  This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device.  If num_sectors is zero, we find the largest size
	 * that fits.
	 */
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->sync_thread)
		return -EBUSY;
	if (mddev->ro)
		return -EROFS;

	rdev_for_each(rdev, mddev) {
		sector_t avail = rdev->sectors;

		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;
		if (avail < num_sectors)
			return -ENOSPC;
	}
	rv = mddev->pers->resize(mddev, num_sectors);
	if (!rv) {
		if (mddev_is_clustered(mddev))
			md_cluster_ops->update_size(mddev, old_dev_sectors);
		else if (mddev->queue) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}
	return rv;
}

static int update_raid_disks(struct mddev *mddev, int raid_disks)
{
	int rv;
	struct md_rdev *rdev;
	/* change the number of raid disks */
	if (mddev->pers->check_reshape == NULL)
		return -EINVAL;
	if (mddev->ro)
		return -EROFS;
	if (raid_disks <= 0 ||
	    (mddev->max_disks && raid_disks >= mddev->max_disks))
		return -EINVAL;
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector)
		return -EBUSY;

	rdev_for_each(rdev, mddev) {
		if (mddev->raid_disks < raid_disks &&
		    rdev->data_offset < rdev->new_data_offset)
			return -EINVAL;
		if (mddev->raid_disks > raid_disks &&
		    rdev->data_offset > rdev->new_data_offset)
			return -EINVAL;
	}

	mddev->delta_disks = raid_disks - mddev->raid_disks;
	if (mddev->delta_disks < 0)
		mddev->reshape_backwards = 1;
	else if (mddev->delta_disks > 0)
		mddev->reshape_backwards = 0;

	rv = mddev->pers->check_reshape(mddev);
	if (rv < 0) {
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
	return rv;
}

/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime, level, size, raid_disks, not_persistent, layout and
 * chunk_size fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 */
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;
	int state = 0;

	/* calculate expected state, ignoring low bits */
	if (mddev->bitmap && mddev->bitmap_info.offset)
		state |= (1 << MD_SB_BITMAP_PRESENT);

	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
/*	    mddev->patch_version != info->patch_version || */
	    mddev->ctime         != info->ctime         ||
	    mddev->level         != info->level         ||
/*	    mddev->layout        != info->layout        || */
	    mddev->persistent	 != !info->not_persistent ||
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    ((state^info->state) & 0xfffffe00)
		)
		return -EINVAL;
	/* Check there is only one change */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		else {
			mddev->new_layout = info->layout;
			rv = mddev->pers->check_reshape(mddev);
			if (rv)
				mddev->new_layout = mddev->layout;
			return rv;
		}
	}
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		rv = update_size(mddev, (sector_t)info->size * 2);

	if (mddev->raid_disks    != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
			rv = -EINVAL;
			goto err;
		}
		if (mddev->recovery || mddev->sync_thread) {
			rv = -EBUSY;
			goto err;
		}
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
			struct bitmap *bitmap;
			/* add the bitmap */
			if (mddev->bitmap) {
				rv = -EEXIST;
				goto err;
			}
			if (mddev->bitmap_info.default_offset == 0) {
				rv = -EINVAL;
				goto err;
			}
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
			bitmap = md_bitmap_create(mddev, -1);
			mddev_suspend(mddev);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				rv = md_bitmap_load(mddev);
			} else
				rv = PTR_ERR(bitmap);
			if (rv)
				md_bitmap_destroy(mddev);
			mddev_resume(mddev);
		} else {
			/* remove the bitmap */
			if (!mddev->bitmap) {
				rv = -ENOENT;
				goto err;
			}
			if (mddev->bitmap->storage.file) {
				rv = -EINVAL;
				goto err;
			}
			if (mddev->bitmap_info.nodes) {
				/* hold PW on all the bitmap lock */
				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
					rv = -EPERM;
					md_cluster_ops->unlock_all_bitmaps(mddev);
					goto err;
				}

				mddev->bitmap_info.nodes = 0;
				md_cluster_ops->leave(mddev);
			}
			mddev_suspend(mddev);
			md_bitmap_destroy(mddev);
			mddev_resume(mddev);
			mddev->bitmap_info.offset = 0;
		}
	}
	md_update_sb(mddev, 1);
	return rv;
err:
	return rv;
}

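/*
 * Mark the member device identified by 'dev' as Faulty via the
 * personality's error handler.
 */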
static int set_disk_faulty(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;
	int err = 0;

	if (mddev->pers == NULL)
		return -ENODEV;

	rcu_read_lock();
	rdev = md_find_rdev_rcu(mddev, dev);
	if (!rdev)
		err = -ENODEV;
	else {
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags))
			err = -EBUSY;
	}
	rcu_read_unlock();
	return err;
}

/*
 * We have a problem here : there is no easy way to give a CHS
 * virtual geometry. We currently pretend that we have 2 heads and
 * 4 sectors (with a BIG number of cylinders...). This drives
 * dosfs just mad... ;-)
 */
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mddev *mddev = bdev->bd_disk->private_data;

	geo->heads = 2;
	geo->sectors = 4;
	geo->cylinders = mddev->array_sectors / 8;
	return 0;
}

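/* Only the ioctl commands listed here are accepted by md_ioctl(). */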
static inline bool md_ioctl_valid(unsigned int cmd)
{
	switch (cmd) {
	case ADD_NEW_DISK:
	case BLKROSET:
	case GET_ARRAY_INFO:
	case GET_BITMAP_FILE:
	case GET_DISK_INFO:
	case HOT_ADD_DISK:
	case HOT_REMOVE_DISK:
	case RAID_AUTORUN:
	case RAID_VERSION:
	case RESTART_ARRAY_RW:
	case RUN_ARRAY:
	case SET_ARRAY_INFO:
	case SET_BITMAP_FILE:
	case SET_DISK_FAULTY:
	case STOP_ARRAY:
	case STOP_ARRAY_RO:
	case CLUSTERED_DISK_NACK:
		return true;
	default:
		return false;
	}
}

static int md_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
	struct mddev *mddev = NULL;
	int ro;
	bool did_set_md_closing = false;

	if (!md_ioctl_valid(cmd))
		return -ENOTTY;

	switch (cmd) {
	case RAID_VERSION:
	case GET_ARRAY_INFO:
	case GET_DISK_INFO:
		break;
	default:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd) {
	case RAID_VERSION:
		err = get_version(argp);
		goto out;

#ifndef MODULE
	case RAID_AUTORUN:
		err = 0;
		autostart_arrays(arg);
		goto out;
#endif
	default:;
	}

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = bdev->bd_disk->private_data;

	if (!mddev) {
		BUG();
		goto out;
	}

	/* Some actions do not require the mutex */
	switch (cmd) {
	case GET_ARRAY_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_array_info(mddev, argp);
		goto out;

	case GET_DISK_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_disk_info(mddev, argp);
		goto out;

	case SET_DISK_FAULTY:
		err = set_disk_faulty(mddev, new_decode_dev(arg));
		goto out;

	case GET_BITMAP_FILE:
		err = get_bitmap_file(mddev, argp);
		goto out;

	}

	if (cmd == ADD_NEW_DISK)
		/* need to ensure md_delayed_delete() has completed */
		flush_workqueue(md_misc_wq);

	if (cmd == HOT_REMOVE_DISK)
		/* need to ensure recovery thread has run */
		wait_event_interruptible_timeout(mddev->sb_wait,
						 !test_bit(MD_RECOVERY_NEEDED,
							   &mddev->recovery),
						 msecs_to_jiffies(5000));
	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
		/* Need to flush page cache, and ensure no-one else opens
		 * and writes
		 */
		mutex_lock(&mddev->open_mutex);
		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
			mutex_unlock(&mddev->open_mutex);
			err = -EBUSY;
			goto out;
		}
		WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
		set_bit(MD_CLOSING, &mddev->flags);
		did_set_md_closing = true;
		mutex_unlock(&mddev->open_mutex);
		sync_blockdev(bdev);
	}
	err = mddev_lock(mddev);
	if (err) {
		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
			 err, cmd);
		goto out;
	}

	if (cmd == SET_ARRAY_INFO) {
		mdu_array_info_t info;
		if (!arg)
			memset(&info, 0, sizeof(info));
		else if (copy_from_user(&info, argp, sizeof(info))) {
			err = -EFAULT;
			goto unlock;
		}
		if (mddev->pers) {
			err = update_array_info(mddev, &info);
			if (err) {
				pr_warn("md: couldn't update array info. %d\n", err);
				goto unlock;
			}
			goto unlock;
		}
		if (!list_empty(&mddev->disks)) {
			pr_warn("md: array %s already has disks!\n", mdname(mddev));
			err = -EBUSY;
			goto unlock;
		}
		if (mddev->raid_disks) {
			pr_warn("md: array %s already initialised!\n", mdname(mddev));
			err = -EBUSY;
			goto unlock;
		}
		err = set_array_info(mddev, &info);
		if (err) {
			pr_warn("md: couldn't set array info. %d\n", err);
			goto unlock;
		}
		goto unlock;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
	if ((!mddev->raid_disks && !mddev->external)
	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
	    && cmd != GET_BITMAP_FILE) {
		err = -ENODEV;
		goto unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd) {
	case RESTART_ARRAY_RW:
		err = restart_array(mddev);
		goto unlock;

	case STOP_ARRAY:
		err = do_md_stop(mddev, 0, bdev);
		goto unlock;

	case STOP_ARRAY_RO:
		err = md_set_readonly(mddev, bdev);
		goto unlock;

	case HOT_REMOVE_DISK:
		err = hot_remove_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case ADD_NEW_DISK:
		/* We can support ADD_NEW_DISK on read-only arrays
		 * only if we are re-adding a preexisting device.
		 * So require mddev->pers and MD_DISK_SYNC.
		 */
		if (mddev->pers) {
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else if (!(info.state & (1<<MD_DISK_SYNC)))
				/* Need to clear read-only for this */
				break;
			else
				err = add_new_disk(mddev, &info);
			goto unlock;
		}
		break;

	case BLKROSET:
		if (get_user(ro, (int __user *)(arg))) {
			err = -EFAULT;
			goto unlock;
		}
		err = -EINVAL;

		/* if the bdev is going readonly the value of mddev->ro
		 * does not matter, no writes are coming
		 */
		if (ro)
			goto unlock;

		/* are we already prepared for writes? */
		if (mddev->ro != 1)
			goto unlock;

		/* transitioning to readauto need only happen for
		 * arrays that call md_write_start
		 */
		if (mddev->pers) {
			err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
			}
		}
		goto unlock;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow them on read-only arrays.
	 */
	if (mddev->ro && mddev->pers) {
		if (mddev->ro == 2) {
			mddev->ro = 0;
			sysfs_notify_dirent_safe(mddev->sysfs_state);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			/* mddev_unlock will wake thread */
			/* If a device failed while we were read-only, we
			 * need to make sure the metadata is updated now.
			 */
			if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
				mddev_unlock(mddev);
				wait_event(mddev->sb_wait,
					   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
					   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
				mddev_lock_nointr(mddev);
			}
		} else {
			err = -EROFS;
			goto unlock;
		}
	}

	switch (cmd) {
	case ADD_NEW_DISK:
	{
		mdu_disk_info_t info;
		if (copy_from_user(&info, argp, sizeof(info)))
			err = -EFAULT;
		else
			err = add_new_disk(mddev, &info);
		goto unlock;
	}

	case CLUSTERED_DISK_NACK:
		if (mddev_is_clustered(mddev))
			md_cluster_ops->new_disk_ack(mddev, false);
		else
			err = -EINVAL;
		goto unlock;

	case HOT_ADD_DISK:
		err = hot_add_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case RUN_ARRAY:
		err = do_md_run(mddev);
		goto unlock;

	case SET_BITMAP_FILE:
		err = set_bitmap_file(mddev, (int)arg);
		goto unlock;

	default:
		err = -EINVAL;
		goto unlock;
	}

unlock:
	if (mddev->hold_active == UNTIL_IOCTL &&
	    err != -EINVAL)
		mddev->hold_active = 0;
	mddev_unlock(mddev);
out:
	if (did_set_md_closing)
		clear_bit(MD_CLOSING, &mddev->flags);
	return err;
}
#ifdef CONFIG_COMPAT
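/*
 * 32-bit compat entry point: commands that take an integer argument are
 * passed through unchanged; pointer arguments go through compat_ptr().
 */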
static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
		    unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case HOT_REMOVE_DISK:
	case HOT_ADD_DISK:
	case SET_DISK_FAULTY:
	case SET_BITMAP_FILE:
		/* These take in integer arg, do not convert */
		break;
	default:
		arg = (unsigned long)compat_ptr(arg);
		break;
	}

	return md_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static int md_open(struct block_device *bdev, fmode_t mode)
{
	/*
	 * Succeed if we can lock the mddev, which confirms that
	 * it isn't being stopped right now.
	 */
	struct mddev *mddev = mddev_find(bdev->bd_dev);
	int err;

	if (!mddev)
		return -ENODEV;

	if (mddev->gendisk != bdev->bd_disk) {
		/* we are racing with mddev_put which is discarding this
		 * bd_disk.
		 */
		mddev_put(mddev);
		/* Wait until bdev->bd_disk is definitely gone */
		flush_workqueue(md_misc_wq);
		/* Then retry the open from the top */
		return -ERESTARTSYS;
	}
	BUG_ON(mddev != bdev->bd_disk->private_data);

	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
		goto out;

	if (test_bit(MD_CLOSING, &mddev->flags)) {
		mutex_unlock(&mddev->open_mutex);
		err = -ENODEV;
		goto out;
	}

	err = 0;
	atomic_inc(&mddev->openers);
	mutex_unlock(&mddev->open_mutex);

	check_disk_change(bdev);
 out:
	if (err)
		mddev_put(mddev);
	return err;
}

static void md_release(struct gendisk *disk, fmode_t mode)
{
	struct mddev *mddev = disk->private_data;

	BUG_ON(!mddev);
	atomic_dec(&mddev->openers);
	mddev_put(mddev);
}

static int md_media_changed(struct gendisk *disk)
{
	struct mddev *mddev = disk->private_data;

	return mddev->changed;
}

static int md_revalidate(struct gendisk *disk)
{
	struct mddev *mddev = disk->private_data;

	mddev->changed = 0;
	return 0;
}
static const struct block_device_operations md_fops =
{
	.owner		= THIS_MODULE,
	.open		= md_open,
	.release	= md_release,
	.ioctl		= md_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= md_compat_ioctl,
#endif
	.getgeo		= md_getgeo,
	.media_changed  = md_media_changed,
	.revalidate_disk = md_revalidate,
};

static int md_thread(void *arg)
{
	struct md_thread *thread = arg;

	/*
	 * md_thread is a 'system-thread', its priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */

	allow_signal(SIGKILL);
	while (!kthread_should_stop()) {

		/* We need to wait INTERRUPTIBLE so that
		 * we don't add to the load-average.
		 * That means we need to be sure no signals are
		 * pending
		 */
		if (signal_pending(current))
			flush_signals(current);

		wait_event_interruptible_timeout
			(thread->wqueue,
			 test_bit(THREAD_WAKEUP, &thread->flags)
			 || kthread_should_stop() || kthread_should_park(),
			 thread->timeout);

		clear_bit(THREAD_WAKEUP, &thread->flags);
		if (kthread_should_park())
			kthread_parkme();
		if (!kthread_should_stop())
			thread->run(thread);
	}

	return 0;
}

void md_wakeup_thread(struct md_thread *thread)
{
	if (thread) {
		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
		set_bit(THREAD_WAKEUP, &thread->flags);
		wake_up(&thread->wqueue);
	}
}
EXPORT_SYMBOL(md_wakeup_thread);

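/*
 * Create and start a per-array kernel thread; 'run' is invoked each time
 * the thread is woken via md_wakeup_thread().
 */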
struct md_thread *md_register_thread(void (*run) (struct md_thread *),
		struct mddev *mddev, const char *name)
{
	struct md_thread *thread;

	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
	if (!thread)
		return NULL;

	init_waitqueue_head(&thread->wqueue);

	thread->run = run;
	thread->mddev = mddev;
	thread->timeout = MAX_SCHEDULE_TIMEOUT;
	thread->tsk = kthread_run(md_thread, thread,
				  "%s_%s",
				  mdname(thread->mddev),
				  name);
	if (IS_ERR(thread->tsk)) {
		kfree(thread);
		return NULL;
	}
	return thread;
}
EXPORT_SYMBOL(md_register_thread);

void md_unregister_thread(struct md_thread **threadp)
{
	struct md_thread *thread = *threadp;
	if (!thread)
		return;
	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
	/* Locking ensures that mddev_unlock does not wake_up a
	 * non-existent thread
	 */
	spin_lock(&pers_lock);
	*threadp = NULL;
	spin_unlock(&pers_lock);

	kthread_stop(thread->tsk);
	kfree(thread);
}
EXPORT_SYMBOL(md_unregister_thread);

void md_error(struct mddev *mddev, struct md_rdev *rdev)
{
	if (!rdev || test_bit(Faulty, &rdev->flags))
		return;

	if (!mddev->pers || !mddev->pers->error_handler)
		return;
	mddev->pers->error_handler(mddev, rdev);
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
	md_new_event(mddev);
}
EXPORT_SYMBOL(md_error);
L
Linus Torvalds 已提交
7754 7755 7756 7757 7758 7759

/* seq_file implementation /proc/mdstat */

static void status_unused(struct seq_file *seq)
{
	int i = 0;
7760
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
7761 7762 7763

	seq_printf(seq, "unused devices: ");

7764
	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
L
Linus Torvalds 已提交
7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775
		char b[BDEVNAME_SIZE];
		i++;
		seq_printf(seq, "%s ",
			      bdevname(rdev->bdev,b));
	}
	if (!i)
		seq_printf(seq, "<none>");

	seq_printf(seq, "\n");
}

7776
static int status_resync(struct seq_file *seq, struct mddev *mddev)
L
Linus Torvalds 已提交
7777
{
7778
	sector_t max_sectors, resync, res;
7779 7780 7781
	unsigned long dt, db = 0;
	sector_t rt, curr_mark_cnt, resync_mark_cnt;
	int scale, recovery_active;
7782
	unsigned int per_milli;
L
Linus Torvalds 已提交
7783

7784 7785
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7786
		max_sectors = mddev->resync_max_sectors;
L
Linus Torvalds 已提交
7787
	else
7788
		max_sectors = mddev->dev_sectors;
L
Linus Torvalds 已提交
7789

7790 7791 7792 7793 7794
	resync = mddev->curr_resync;
	if (resync <= 3) {
		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
			/* Still cleaning up */
			resync = max_sectors;
7795 7796 7797
	} else if (resync > max_sectors)
		resync = max_sectors;
	else
7798 7799 7800
		resync -= atomic_read(&mddev->recovery_active);

	if (resync == 0) {
7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817
		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
			struct md_rdev *rdev;

			rdev_for_each(rdev, mddev)
				if (rdev->raid_disk >= 0 &&
				    !test_bit(Faulty, &rdev->flags) &&
				    rdev->recovery_offset != MaxSector &&
				    rdev->recovery_offset) {
					seq_printf(seq, "\trecover=REMOTE");
					return 1;
				}
			if (mddev->reshape_position != MaxSector)
				seq_printf(seq, "\treshape=REMOTE");
			else
				seq_printf(seq, "\tresync=REMOTE");
			return 1;
		}
7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828
		if (mddev->recovery_cp < MaxSector) {
			seq_printf(seq, "\tresync=PENDING");
			return 1;
		}
		return 0;
	}
	if (resync < 3) {
		seq_printf(seq, "\tresync=DELAYED");
		return 1;
	}

N
NeilBrown 已提交
7829
	WARN_ON(max_sectors == 0);
7830
	/* Pick 'scale' such that (resync>>scale)*1000 will fit
7831
	 * in a sector_t, and (max_sectors>>scale) will fit in a
7832 7833 7834 7835 7836
	 * u32, as those are the requirements for sector_div.
	 * Thus 'scale' must be at least 10
	 */
	scale = 10;
	if (sizeof(sector_t) > sizeof(unsigned long)) {
7837
		while ( max_sectors/2 > (1ULL<<(scale+32)))
7838 7839 7840
			scale++;
	}
	res = (resync>>scale)*1000;
7841
	sector_div(res, (u32)((max_sectors>>scale)+1));
7842 7843

	per_milli = res;
L
Linus Torvalds 已提交
7844
	{
7845
		int i, x = per_milli/50, y = 20-x;
L
Linus Torvalds 已提交
7846 7847 7848 7849 7850 7851 7852 7853
		seq_printf(seq, "[");
		for (i = 0; i < x; i++)
			seq_printf(seq, "=");
		seq_printf(seq, ">");
		for (i = 0; i < y; i++)
			seq_printf(seq, ".");
		seq_printf(seq, "] ");
	}
7854
	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
7855 7856
		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
		    "reshape" :
7857 7858 7859 7860 7861
		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
		     "check" :
		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
		      "resync" : "recovery"))),
		   per_milli/10, per_milli % 10,
7862 7863
		   (unsigned long long) resync/2,
		   (unsigned long long) max_sectors/2);
L
Linus Torvalds 已提交
7864 7865 7866 7867 7868

	/*
	 * dt: time from mark until now
	 * db: blocks written from mark until now
	 * rt: remaining time
7869
	 *
7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880
	 * rt is a sector_t, which is always 64bit now. We are keeping
	 * the original algorithm, but it is not really necessary.
	 *
	 * Original algorithm:
	 *   So we divide before multiply in case it is 32bit and close
	 *   to the limit.
	 *   We scale the divisor (db) by 32 to avoid losing precision
	 *   near the end of resync when the number of remaining sectors
	 *   is close to 'db'.
	 *   We then divide rt by 32 after multiplying by db to compensate.
	 *   The '+1' avoids division by zero if db is very small.
L
Linus Torvalds 已提交
7881 7882 7883
	 */
	dt = ((jiffies - mddev->resync_mark) / HZ);
	if (!dt) dt++;
7884 7885 7886 7887 7888 7889 7890

	curr_mark_cnt = mddev->curr_mark_cnt;
	recovery_active = atomic_read(&mddev->recovery_active);
	resync_mark_cnt = mddev->resync_mark_cnt;

	if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
		db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
L
Linus Torvalds 已提交
7891

7892
	rt = max_sectors - resync;    /* number of remaining sectors */
7893
	rt = div64_u64(rt, db/32+1);
7894 7895 7896 7897 7898
	rt *= dt;
	rt >>= 5;

	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
		   ((unsigned long)rt % 60)/6);
L
Linus Torvalds 已提交
7899

7900
	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
7901
	return 1;
L
Linus Torvalds 已提交
7902 7903 7904 7905 7906 7907
}

static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct list_head *tmp;
	loff_t l = *pos;
7908
	struct mddev *mddev;
L
Linus Torvalds 已提交
7909 7910 7911 7912 7913 7914 7915 7916 7917 7918

	if (l >= 0x10000)
		return NULL;
	if (!l--)
		/* header */
		return (void*)1;

	spin_lock(&all_mddevs_lock);
	list_for_each(tmp,&all_mddevs)
		if (!l--) {
7919
			mddev = list_entry(tmp, struct mddev, all_mddevs);
L
Linus Torvalds 已提交
7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			return mddev;
		}
	spin_unlock(&all_mddevs_lock);
	if (!l--)
		return (void*)2;/* tail */
	return NULL;
}

static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct list_head *tmp;
7933
	struct mddev *next_mddev, *mddev = v;
7934

L
Linus Torvalds 已提交
7935 7936 7937 7938 7939 7940 7941 7942 7943 7944
	++*pos;
	if (v == (void*)2)
		return NULL;

	spin_lock(&all_mddevs_lock);
	if (v == (void*)1)
		tmp = all_mddevs.next;
	else
		tmp = mddev->all_mddevs.next;
	if (tmp != &all_mddevs)
7945
		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
L
Linus Torvalds 已提交
7946 7947 7948
	else {
		next_mddev = (void*)2;
		*pos = 0x10000;
7949
	}
L
Linus Torvalds 已提交
7950 7951 7952 7953 7954 7955 7956 7957 7958 7959
	spin_unlock(&all_mddevs_lock);

	if (v != (void*)1)
		mddev_put(mddev);
	return next_mddev;

}

static void md_seq_stop(struct seq_file *seq, void *v)
{
7960
	struct mddev *mddev = v;
L
Linus Torvalds 已提交
7961 7962 7963 7964 7965 7966 7967

	if (mddev && v != (void*)1 && v != (void*)2)
		mddev_put(mddev);
}

static int md_seq_show(struct seq_file *seq, void *v)
{
7968
	struct mddev *mddev = v;
7969
	sector_t sectors;
7970
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
7971 7972

	if (v == (void*)1) {
7973
		struct md_personality *pers;
L
Linus Torvalds 已提交
7974 7975
		seq_printf(seq, "Personalities : ");
		spin_lock(&pers_lock);
7976 7977
		list_for_each_entry(pers, &pers_list, list)
			seq_printf(seq, "[%s] ", pers->name);
L
Linus Torvalds 已提交
7978 7979 7980

		spin_unlock(&pers_lock);
		seq_printf(seq, "\n");
7981
		seq->poll_event = atomic_read(&md_event_count);
L
Linus Torvalds 已提交
7982 7983 7984 7985 7986 7987 7988
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}

7989
	spin_lock(&mddev->lock);
L
Linus Torvalds 已提交
7990 7991 7992 7993
	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : %sactive", mdname(mddev),
						mddev->pers ? "" : "in");
		if (mddev->pers) {
7994
			if (mddev->ro==1)
L
Linus Torvalds 已提交
7995
				seq_printf(seq, " (read-only)");
7996
			if (mddev->ro==2)
7997
				seq_printf(seq, " (auto-read-only)");
L
Linus Torvalds 已提交
7998 7999 8000
			seq_printf(seq, " %s", mddev->pers->name);
		}

8001
		sectors = 0;
8002 8003
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev) {
L
Linus Torvalds 已提交
8004 8005 8006
			char b[BDEVNAME_SIZE];
			seq_printf(seq, " %s[%d]",
				bdevname(rdev->bdev,b), rdev->desc_nr);
8007 8008
			if (test_bit(WriteMostly, &rdev->flags))
				seq_printf(seq, "(W)");
S
Shaohua Li 已提交
8009 8010
			if (test_bit(Journal, &rdev->flags))
				seq_printf(seq, "(J)");
8011
			if (test_bit(Faulty, &rdev->flags)) {
L
Linus Torvalds 已提交
8012 8013
				seq_printf(seq, "(F)");
				continue;
8014 8015
			}
			if (rdev->raid_disk < 0)
8016
				seq_printf(seq, "(S)"); /* spare */
8017 8018
			if (test_bit(Replacement, &rdev->flags))
				seq_printf(seq, "(R)");
8019
			sectors += rdev->sectors;
L
Linus Torvalds 已提交
8020
		}
8021
		rcu_read_unlock();
L
Linus Torvalds 已提交
8022 8023 8024 8025

		if (!list_empty(&mddev->disks)) {
			if (mddev->pers)
				seq_printf(seq, "\n      %llu blocks",
8026 8027
					   (unsigned long long)
					   mddev->array_sectors / 2);
L
Linus Torvalds 已提交
8028 8029
			else
				seq_printf(seq, "\n      %llu blocks",
8030
					   (unsigned long long)sectors / 2);
L
Linus Torvalds 已提交
8031
		}
8032 8033 8034 8035 8036 8037 8038
		if (mddev->persistent) {
			if (mddev->major_version != 0 ||
			    mddev->minor_version != 90) {
				seq_printf(seq," super %d.%d",
					   mddev->major_version,
					   mddev->minor_version);
			}
8039 8040 8041 8042
		} else if (mddev->external)
			seq_printf(seq, " super external:%s",
				   mddev->metadata_type);
		else
8043
			seq_printf(seq, " super non-persistent");
L
Linus Torvalds 已提交
8044 8045

		if (mddev->pers) {
8046
			mddev->pers->status(seq, mddev);
8047
			seq_printf(seq, "\n      ");
8048
			if (mddev->pers->sync_request) {
8049
				if (status_resync(seq, mddev))
8050 8051
					seq_printf(seq, "\n      ");
			}
8052 8053 8054
		} else
			seq_printf(seq, "\n       ");

8055
		md_bitmap_status(seq, mddev->bitmap);
L
Linus Torvalds 已提交
8056 8057 8058

		seq_printf(seq, "\n");
	}
8059
	spin_unlock(&mddev->lock);
8060

L
Linus Torvalds 已提交
8061 8062 8063
	return 0;
}

J
Jan Engelhardt 已提交
8064
static const struct seq_operations md_seq_ops = {
L
Linus Torvalds 已提交
8065 8066 8067 8068 8069 8070 8071 8072
	.start  = md_seq_start,
	.next   = md_seq_next,
	.stop   = md_seq_stop,
	.show   = md_seq_show,
};

static int md_seq_open(struct inode *inode, struct file *file)
{
8073
	struct seq_file *seq;
L
Linus Torvalds 已提交
8074 8075 8076
	int error;

	error = seq_open(file, &md_seq_ops);
8077
	if (error)
8078 8079 8080 8081
		return error;

	seq = file->private_data;
	seq->poll_event = atomic_read(&md_event_count);
L
Linus Torvalds 已提交
8082 8083 8084
	return error;
}

8085
static int md_unloading;
8086
static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8087
{
8088
	struct seq_file *seq = filp->private_data;
8089
	__poll_t mask;
8090

8091
	if (md_unloading)
8092
		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8093 8094 8095
	poll_wait(filp, &md_event_waiters, wait);

	/* always allow read */
8096
	mask = EPOLLIN | EPOLLRDNORM;
8097

8098
	if (seq->poll_event != atomic_read(&md_event_count))
8099
		mask |= EPOLLERR | EPOLLPRI;
8100 8101 8102
	return mask;
}

8103
static const struct file_operations md_seq_fops = {
8104
	.owner		= THIS_MODULE,
L
Linus Torvalds 已提交
8105 8106 8107
	.open           = md_seq_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
8108
	.release	= seq_release,
8109
	.poll		= mdstat_poll,
L
Linus Torvalds 已提交
8110 8111
};

8112
int register_md_personality(struct md_personality *p)
L
Linus Torvalds 已提交
8113
{
8114 8115
	pr_debug("md: %s personality registered for level %d\n",
		 p->name, p->level);
L
Linus Torvalds 已提交
8116
	spin_lock(&pers_lock);
8117
	list_add_tail(&p->list, &pers_list);
L
Linus Torvalds 已提交
8118 8119 8120
	spin_unlock(&pers_lock);
	return 0;
}
8121
EXPORT_SYMBOL(register_md_personality);
L
Linus Torvalds 已提交
8122

8123
int unregister_md_personality(struct md_personality *p)
L
Linus Torvalds 已提交
8124
{
8125
	pr_debug("md: %s personality unregistered\n", p->name);
L
Linus Torvalds 已提交
8126
	spin_lock(&pers_lock);
8127
	list_del_init(&p->list);
L
Linus Torvalds 已提交
8128 8129 8130
	spin_unlock(&pers_lock);
	return 0;
}
8131
EXPORT_SYMBOL(unregister_md_personality);
L
Linus Torvalds 已提交
8132

8133 8134
int register_md_cluster_operations(struct md_cluster_operations *ops,
				   struct module *module)
8135
{
8136
	int ret = 0;
8137
	spin_lock(&pers_lock);
8138 8139 8140 8141 8142 8143
	if (md_cluster_ops != NULL)
		ret = -EALREADY;
	else {
		md_cluster_ops = ops;
		md_cluster_mod = module;
	}
8144
	spin_unlock(&pers_lock);
8145
	return ret;
8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159
}
EXPORT_SYMBOL(register_md_cluster_operations);

int unregister_md_cluster_operations(void)
{
	spin_lock(&pers_lock);
	md_cluster_ops = NULL;
	spin_unlock(&pers_lock);
	return 0;
}
EXPORT_SYMBOL(unregister_md_cluster_operations);

int md_setup_cluster(struct mddev *mddev, int nodes)
{
8160 8161
	if (!md_cluster_ops)
		request_module("md-cluster");
8162
	spin_lock(&pers_lock);
8163
	/* ensure module won't be unloaded */
8164
	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
8165
		pr_warn("can't find md-cluster module or get it's reference.\n");
8166 8167 8168 8169 8170
		spin_unlock(&pers_lock);
		return -ENOENT;
	}
	spin_unlock(&pers_lock);

G
Goldwyn Rodrigues 已提交
8171
	return md_cluster_ops->join(mddev, nodes);
8172 8173 8174 8175
}

void md_cluster_stop(struct mddev *mddev)
{
G
Goldwyn Rodrigues 已提交
8176 8177
	if (!md_cluster_ops)
		return;
8178 8179 8180 8181
	md_cluster_ops->leave(mddev);
	module_put(md_cluster_mod);
}

8182
static int is_mddev_idle(struct mddev *mddev, int init)
L
Linus Torvalds 已提交
8183
{
8184
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
8185
	int idle;
N
NeilBrown 已提交
8186
	int curr_events;
L
Linus Torvalds 已提交
8187 8188

	idle = 1;
8189 8190
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
L
Linus Torvalds 已提交
8191
		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
8192
		curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
N
NeilBrown 已提交
8193
			      atomic_read(&disk->sync_io);
8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213
		/* sync IO will cause sync_io to increase before the disk_stats
		 * as sync_io is counted when a request starts, and
		 * disk_stats is counted when it completes.
		 * So resync activity will cause curr_events to be smaller than
		 * when there was no such activity.
		 * non-sync IO will cause disk_stat to increase without
		 * increasing sync_io so curr_events will (eventually)
		 * be larger than it was before.  Once it becomes
		 * substantially larger, the test below will cause
		 * the array to appear non-idle, and resync will slow
		 * down.
		 * If there is a lot of outstanding resync activity when
		 * we set last_event to curr_events, then all that activity
		 * completing might cause the array to appear non-idle
		 * and resync will be slowed down even though there might
		 * not have been non-resync activity.  This will only
		 * happen once though.  'last_events' will soon reflect
		 * the state where there is little or no outstanding
		 * resync requests, and further resync activity will
		 * always make curr_events less than last_events.
8214
		 *
L
Linus Torvalds 已提交
8215
		 */
N
NeilBrown 已提交
8216
		if (init || curr_events - rdev->last_events > 64) {
L
Linus Torvalds 已提交
8217 8218 8219 8220
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
8221
	rcu_read_unlock();
L
Linus Torvalds 已提交
8222 8223 8224
	return idle;
}

8225
void md_done_sync(struct mddev *mddev, int blocks, int ok)
L
Linus Torvalds 已提交
8226 8227 8228 8229 8230
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
8231
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8232
		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
L
Linus Torvalds 已提交
8233 8234 8235 8236
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}
8237
EXPORT_SYMBOL(md_done_sync);
L
Linus Torvalds 已提交
8238

8239 8240
/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
8241 8242
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
8243 8244
 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed as the array is being suspended.
8245
 */
8246
bool md_write_start(struct mddev *mddev, struct bio *bi)
L
Linus Torvalds 已提交
8247
{
8248
	int did_change = 0;
8249

8250
	if (bio_data_dir(bi) != WRITE)
8251
		return true;
8252

8253 8254 8255 8256 8257 8258
	BUG_ON(mddev->ro == 1);
	if (mddev->ro == 2) {
		/* need to switch to read/write */
		mddev->ro = 0;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
8259
		md_wakeup_thread(mddev->sync_thread);
8260
		did_change = 1;
8261
	}
8262 8263
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
8264
	smp_mb(); /* Match smp_mb in set_in_sync() */
8265 8266
	if (mddev->safemode == 1)
		mddev->safemode = 0;
8267
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
N
NeilBrown 已提交
8268
	if (mddev->in_sync || mddev->sync_checkers) {
8269
		spin_lock(&mddev->lock);
8270 8271
		if (mddev->in_sync) {
			mddev->in_sync = 0;
8272 8273
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8274
			md_wakeup_thread(mddev->thread);
8275
			did_change = 1;
8276
		}
8277
		spin_unlock(&mddev->lock);
8278
	}
8279
	rcu_read_unlock();
8280
	if (did_change)
N
NeilBrown 已提交
8281
		sysfs_notify_dirent_safe(mddev->sysfs_state);
8282 8283
	if (!mddev->has_superblocks)
		return true;
8284
	wait_event(mddev->sb_wait,
8285 8286
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
		   mddev->suspended);
8287 8288 8289 8290 8291
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		percpu_ref_put(&mddev->writes_pending);
		return false;
	}
	return true;
L
Linus Torvalds 已提交
8292
}
8293
EXPORT_SYMBOL(md_write_start);
L
Linus Torvalds 已提交
8294

8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307
/* md_write_inc can only be called when md_write_start() has
 * already been called at least once for the current request.
 * It increments the counter and is useful when a single request
 * is split into several parts.  Each part causes an increment and
 * so needs a matching md_write_end().
 * Unlike md_write_start(), it is safe to call md_write_inc() inside
 * a spinlocked region.
 */
void md_write_inc(struct mddev *mddev, struct bio *bi)
{
	if (bio_data_dir(bi) != WRITE)
		return;
	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8308
	percpu_ref_get(&mddev->writes_pending);
8309 8310 8311
}
EXPORT_SYMBOL(md_write_inc);

8312
void md_write_end(struct mddev *mddev)
L
Linus Torvalds 已提交
8313
{
8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324
	percpu_ref_put(&mddev->writes_pending);

	if (mddev->safemode == 2)
		md_wakeup_thread(mddev->thread);
	else if (mddev->safemode_delay)
		/* The roundup() ensures this only performs locking once
		 * every ->safemode_delay jiffies
		 */
		mod_timer(&mddev->safemode_timer,
			  roundup(jiffies, mddev->safemode_delay) +
			  mddev->safemode_delay);
L
Linus Torvalds 已提交
8325
}
8326

8327
EXPORT_SYMBOL(md_write_end);
L
Linus Torvalds 已提交
8328

8329 8330 8331 8332 8333 8334
/* md_allow_write(mddev)
 * Calling this ensures that the array is marked 'active' so that writes
 * may proceed without blocking.  It is important to call this before
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 * Must be called with mddev_lock held.
 */
8335
void md_allow_write(struct mddev *mddev)
8336 8337
{
	if (!mddev->pers)
8338
		return;
8339
	if (mddev->ro)
8340
		return;
8341
	if (!mddev->pers->sync_request)
8342
		return;
8343

8344
	spin_lock(&mddev->lock);
8345 8346
	if (mddev->in_sync) {
		mddev->in_sync = 0;
8347 8348
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8349 8350 8351
		if (mddev->safemode_delay &&
		    mddev->safemode == 0)
			mddev->safemode = 1;
8352
		spin_unlock(&mddev->lock);
8353
		md_update_sb(mddev, 0);
N
NeilBrown 已提交
8354
		sysfs_notify_dirent_safe(mddev->sysfs_state);
8355 8356 8357
		/* wait for the dirty state to be recorded in the metadata */
		wait_event(mddev->sb_wait,
			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8358
	} else
8359
		spin_unlock(&mddev->lock);
8360 8361 8362
}
EXPORT_SYMBOL_GPL(md_allow_write);

L
Linus Torvalds 已提交
8363 8364
#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
8365
#define UPDATE_FREQUENCY (5*60*HZ)
S
Shaohua Li 已提交
8366
void md_do_sync(struct md_thread *thread)
L
Linus Torvalds 已提交
8367
{
S
Shaohua Li 已提交
8368
	struct mddev *mddev = thread->mddev;
8369
	struct mddev *mddev2;
8370
	unsigned int currspeed = 0, window;
X
Xiao Ni 已提交
8371
	sector_t max_sectors,j, io_sectors, recovery_done;
L
Linus Torvalds 已提交
8372
	unsigned long mark[SYNC_MARKS];
8373
	unsigned long update_time;
L
Linus Torvalds 已提交
8374 8375 8376 8377
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
8378
	int skipped = 0;
8379
	struct md_rdev *rdev;
8380
	char *desc, *action = NULL;
M
majianpeng 已提交
8381
	struct blk_plug plug;
8382
	int ret;
L
Linus Torvalds 已提交
8383 8384

	/* just in case thread restarts... */
8385 8386
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
L
Linus Torvalds 已提交
8387
		return;
8388 8389
	if (mddev->ro) {/* never try to sync a read-only array */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8390
		return;
8391
	}
L
Linus Torvalds 已提交
8392

8393 8394 8395 8396 8397
	if (mddev_is_clustered(mddev)) {
		ret = md_cluster_ops->resync_start(mddev);
		if (ret)
			goto skip;

8398
		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8399 8400 8401 8402 8403 8404 8405 8406
		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
		     && ((unsigned long long)mddev->curr_resync_completed
			 < (unsigned long long)mddev->resync_max_sectors))
			goto skip;
	}

8407
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8408
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8409
			desc = "data-check";
8410 8411
			action = "check";
		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8412
			desc = "requested-resync";
8413 8414
			action = "repair";
		} else
8415 8416 8417 8418 8419 8420
			desc = "resync";
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		desc = "reshape";
	else
		desc = "recovery";

8421 8422
	mddev->last_sync_action = action ?: desc;

L
Linus Torvalds 已提交
8423 8424 8425 8426
	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
8427
	 *		commence
L
Linus Torvalds 已提交
8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */

	do {
8440
		int mddev2_minor = -1;
L
Linus Torvalds 已提交
8441 8442 8443
		mddev->curr_resync = 2;

	try_again:
8444
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
L
Linus Torvalds 已提交
8445
			goto skip;
8446
		for_each_mddev(mddev2, tmp) {
L
Linus Torvalds 已提交
8447 8448
			if (mddev2 == mddev)
				continue;
8449 8450 8451
			if (!mddev->parallel_resync
			&&  mddev2->curr_resync
			&&  match_mddev_units(mddev, mddev2)) {
L
Linus Torvalds 已提交
8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
8463 8464 8465 8466 8467
				/* We need to wait 'interruptible' so as not to
				 * contribute to the load average, and not to
				 * be caught by 'softlockup'
				 */
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8468
				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8469
				    mddev2->curr_resync >= mddev->curr_resync) {
8470 8471
					if (mddev2_minor != mddev2->md_minor) {
						mddev2_minor = mddev2->md_minor;
8472 8473 8474
						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
							desc, mdname(mddev),
							mdname(mddev2));
8475
					}
L
Linus Torvalds 已提交
8476
					mddev_put(mddev2);
8477 8478
					if (signal_pending(current))
						flush_signals(current);
L
Linus Torvalds 已提交
8479 8480 8481 8482 8483 8484 8485 8486 8487
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

8488
	j = 0;
8489
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
L
Linus Torvalds 已提交
8490
		/* resync follows the size requested by the personality,
8491
		 * which defaults to physical size, but can be virtual size
L
Linus Torvalds 已提交
8492 8493
		 */
		max_sectors = mddev->resync_max_sectors;
8494
		atomic64_set(&mddev->resync_mismatches, 0);
8495
		/* we don't use the checkpoint if there's a bitmap */
8496 8497 8498
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			j = mddev->resync_min;
		else if (!mddev->bitmap)
8499
			j = mddev->recovery_cp;
8500

8501
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
8502
		max_sectors = mddev->resync_max_sectors;
8503 8504 8505 8506 8507 8508 8509 8510 8511
		/*
		 * If the original node aborts reshaping then we continue the
		 * reshaping, so set j again to avoid restarting the reshape
		 * from the very beginning.
		 */
		if (mddev_is_clustered(mddev) &&
		    mddev->reshape_position != MaxSector)
			j = mddev->reshape_position;
	} else {
L
Linus Torvalds 已提交
8512
		/* recovery follows the physical size of devices */
A
Andre Noll 已提交
8513
		max_sectors = mddev->dev_sectors;
8514
		j = MaxSector;
8515
		rcu_read_lock();
N
NeilBrown 已提交
8516
		rdev_for_each_rcu(rdev, mddev)
8517
			if (rdev->raid_disk >= 0 &&
S
Shaohua Li 已提交
8518
			    !test_bit(Journal, &rdev->flags) &&
8519 8520 8521 8522
			    !test_bit(Faulty, &rdev->flags) &&
			    !test_bit(In_sync, &rdev->flags) &&
			    rdev->recovery_offset < j)
				j = rdev->recovery_offset;
8523
		rcu_read_unlock();
8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536

		/* If there is a bitmap, we need to make sure all
		 * writes that started before we added a spare
		 * complete before we start doing a recovery.
		 * Otherwise the write might complete and (via
		 * bitmap_endwrite) set a bit in the bitmap after the
		 * recovery has checked that bit and skipped that
		 * region.
		 */
		if (mddev->bitmap) {
			mddev->pers->quiesce(mddev, 1);
			mddev->pers->quiesce(mddev, 0);
		}
8537
	}
L
Linus Torvalds 已提交
8538

	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
		 speed_max(mddev), desc);

	is_mddev_idle(mddev, 1); /* this initializes IO event counters */

	io_sectors = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32 * (PAGE_SIZE / 512);
	pr_debug("md: using %dk window, over a total of %lluk.\n",
		 window/2, (unsigned long long)max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	last_check = 0;

	if (j>2) {
		pr_debug("md: resuming %s of %s from checkpoint.\n",
			 desc, mdname(mddev));
		mddev->curr_resync = j;
	} else
		mddev->curr_resync = 3; /* no longer delayed */
	mddev->curr_resync_completed = j;
	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	md_new_event(mddev);
	update_time = jiffies;

	blk_start_plug(&plug);
	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;

		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed ||
		     mddev->curr_resync_completed > mddev->resync_max
			    )) {
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
			mddev->curr_resync_completed = j;
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
			    j > mddev->recovery_cp)
				mddev->recovery_cp = j;
			update_time = jiffies;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
		}

		while (j >= mddev->resync_max &&
		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
						 || test_bit(MD_RECOVERY_INTR,
							     &mddev->recovery));
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		sectors = mddev->pers->sync_request(mddev, j, &skipped);
		if (sectors == 0) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			break;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		j += sectors;
		if (j > max_sectors)
			/* when skipping, extra large numbers can be returned. */
			j = max_sectors;
		if (j > 2)
			mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;
	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		/*
		 * this loop exits only if we are slower than the 'hard'
		 * speed limit, or the system was IO-idle for a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();

		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > speed_min(mddev)) {
			if (currspeed > speed_max(mddev)) {
				msleep(500);
				goto repeat;
			}
			if (!is_mddev_idle(mddev, 0)) {
				/*
				 * Give other IO more of a chance.
				 * The faster the devices, the less we wait.
				 */
				wait_event(mddev->recovery_wait,
					   !atomic_read(&mddev->recovery_active));
			}
		}
	}
	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
		? "interrupted" : "done");
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
	blk_finish_plug(&plug);
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	}
	mddev->pers->sync_request(mddev, max_sectors, &skipped);

	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->recovery_cp) {
					pr_debug("md: checkpointing %s of %s.\n",
						 desc, mdname(mddev));
					if (test_bit(MD_RECOVERY_ERROR,
						&mddev->recovery))
						mddev->recovery_cp =
							mddev->curr_resync_completed;
					else
						mddev->recovery_cp =
							mddev->curr_resync;
				}
			} else
				mddev->recovery_cp = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
				rcu_read_lock();
				rdev_for_each_rcu(rdev, mddev)
					if (rdev->raid_disk >= 0 &&
					    mddev->delta_disks >= 0 &&
					    !test_bit(Journal, &rdev->flags) &&
					    !test_bit(Faulty, &rdev->flags) &&
					    !test_bit(In_sync, &rdev->flags) &&
					    rdev->recovery_offset < mddev->curr_resync)
						rdev->recovery_offset = mddev->curr_resync;
				rcu_read_unlock();
			}
		}
	}
 skip:
	/* set CHANGE_PENDING here since maybe another update is needed,
	 * so other nodes are informed. It should be harmless for normal
	 * raid */
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
			mddev->delta_disks > 0 &&
			mddev->pers->finish_reshape &&
			mddev->pers->size &&
			mddev->queue) {
		mddev_lock_nointr(mddev);
		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
		mddev_unlock(mddev);
		if (!mddev_is_clustered(mddev)) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}

	spin_lock(&mddev->lock);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		mddev->resync_min = mddev->curr_resync_completed;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	mddev->curr_resync = 0;
	spin_unlock(&mddev->lock);

	wake_up(&resync_wait);
	md_wakeup_thread(mddev->thread);
	return;
}
EXPORT_SYMBOL_GPL(md_do_sync);

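/*
 * Remove any failed (or otherwise removable) devices that have no
 * pending IO, then try to hot-add available spares.  If 'this' is
 * non-NULL only that particular rdev is considered.  Returns the
 * number of spares that recovery can make use of.
 */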
static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this)
{
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;
	bool remove_some = false;

	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		/* Mustn't remove devices when resync thread is running */
		return 0;

	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    test_bit(Faulty, &rdev->flags) &&
		    atomic_read(&rdev->nr_pending)==0) {
			/* Faulty non-Blocked devices with nr_pending == 0
			 * never get nr_pending incremented,
			 * never get Faulty cleared, and never get Blocked set.
			 * So we can synchronize_rcu now rather than once per device
			 */
			remove_some = true;
			set_bit(RemoveSynchronized, &rdev->flags);
		}
	}

	if (remove_some)
		synchronize_rcu();
	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
		     (!test_bit(In_sync, &rdev->flags) &&
		      !test_bit(Journal, &rdev->flags))) &&
		    atomic_read(&rdev->nr_pending)==0)) {
			if (mddev->pers->hot_remove_disk(
				    mddev, rdev) == 0) {
				sysfs_unlink_rdev(mddev, rdev);
				rdev->saved_raid_disk = rdev->raid_disk;
				rdev->raid_disk = -1;
				removed++;
			}
		}
		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
			clear_bit(RemoveSynchronized, &rdev->flags);
	}

	if (removed && mddev->kobj.sd)
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	if (this && removed)
		goto no_add;

	rdev_for_each(rdev, mddev) {
		if (this && this != rdev)
			continue;
		if (test_bit(Candidate, &rdev->flags))
			continue;
		if (rdev->raid_disk >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0)
			continue;
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (!test_bit(Journal, &rdev->flags)) {
			if (mddev->ro &&
			    ! (rdev->saved_raid_disk >= 0 &&
			       !test_bit(Bitmap_sync, &rdev->flags)))
				continue;

			rdev->recovery_offset = 0;
		}
		if (mddev->pers->
		    hot_add_disk(mddev, rdev) == 0) {
			if (sysfs_link_rdev(mddev, rdev))
				/* failure here is OK */;
			if (!test_bit(Journal, &rdev->flags))
				spares++;
			md_new_event(mddev);
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
no_add:
	if (removed)
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	return spares;
}

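/*
 * Work-queue callback, queued from md_check_recovery(), that actually
 * starts the "resync" thread.  If the thread cannot be registered the
 * pending recovery flags are cleared again so the array is left alone.
 */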
static void md_start_sync(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);

	mddev->sync_thread = md_register_thread(md_do_sync,
						mddev,
						"resync");
	if (!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		wake_up(&resync_wait);
		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
				       &mddev->recovery))
			if (mddev->sysfs_action)
				sysfs_notify_dirent_safe(mddev->sysfs_action);
	} else
		md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
		/* Write superblock - thread that called mddev_suspend()
		 * holds reconfig_mutex for us.
		 */
		set_bit(MD_UPDATING_SB, &mddev->flags);
		smp_mb__after_atomic();
		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
			md_update_sb(mddev, 0);
		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
		wake_up(&mddev->sb_wait);
	}

	if (mddev->suspended)
		return;

	if (mddev->bitmap)
		md_bitmap_daemon_work(mddev);

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	if ( ! (
		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		int spares = 0;
		bool try_set_sync = mddev->safemode != 0;

		if (!mddev->external && mddev->safemode == 1)
			mddev->safemode = 0;

		if (mddev->ro) {
			struct md_rdev *rdev;
			if (!mddev->external && mddev->in_sync)
				/* 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);
			/* On a read-only array we can:
			 * - remove failed devices
			 * - add already-in_sync devices if the array itself
			 *   is in-sync.
			 * As we only add devices that are already in-sync,
			 * we can activate the spares immediately.
			 */
			remove_and_add_spares(mddev, NULL);
			/* There is no thread, but we need to call
			 * ->spare_active and clear saved_raid_disk
			 */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			goto unlock;
		}

		if (mddev_is_clustered(mddev)) {
			struct md_rdev *rdev;
			/* kick the device if another node issued a
			 * remove disk.
			 */
			rdev_for_each(rdev, mddev) {
				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
						rdev->raid_disk < 0)
					md_kick_rdev_from_array(rdev);
			}
		}

		if (try_set_sync && !mddev->external && !mddev->in_sync) {
			spin_lock(&mddev->lock);
			set_in_sync(mddev);
			spin_unlock(&mddev->lock);
		}

		if (mddev->sb_flags)
			md_update_sb(mddev, 0);

		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			md_reap_sync_thread(mddev);
			goto unlock;
		}
		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
		mddev->curr_resync_completed = 0;
		spin_lock(&mddev->lock);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		spin_unlock(&mddev->lock);
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
			goto not_running;
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */

		if (mddev->reshape_position != MaxSector) {
			if (mddev->pers->check_reshape == NULL ||
			    mddev->pers->check_reshape(mddev) != 0)
				/* Cannot proceed */
				goto not_running;
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto not_running;

		if (mddev->pers->sync_request) {
			if (spares) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				md_bitmap_write_all(mddev->bitmap);
			}
			INIT_WORK(&mddev->del_work, md_start_sync);
			queue_work(md_misc_wq, &mddev->del_work);
			goto unlock;
		}
	not_running:
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			wake_up(&resync_wait);
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
				if (mddev->sysfs_action)
					sysfs_notify_dirent_safe(mddev->sysfs_action);
		}
	unlock:
		wake_up(&mddev->sb_wait);
		mddev_unlock(mddev);
	}
}
EXPORT_SYMBOL(md_check_recovery);

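/*
 * Reap a sync thread that has finished (or was interrupted): unregister
 * it, activate any spares on success, write the superblock(s) out and
 * clear the MD_RECOVERY_* state before asking md_check_recovery() to
 * take another look.
 */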
void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;
	sector_t old_dev_sectors = mddev->dev_sectors;
	bool is_reshaped = false;

	/* resync has finished, collect result */
	md_unregister_thread(&mddev->sync_thread);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    mddev->degraded != mddev->raid_disks) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify(&mddev->kobj, NULL,
				     "degraded");
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape) {
		mddev->pers->finish_reshape(mddev);
		if (mddev_is_clustered(mddev))
			is_reshaped = true;
	}

	/* If array is no-longer degraded, then any saved_raid_disk
	 * information must be scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/*
	 * We call md_cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		md_cluster_ops->update_size(mddev, old_dev_sectors);
	wake_up(&resync_wait);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
}
EXPORT_SYMBOL(md_reap_sync_thread);

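/*
 * Wait (for at most five seconds) for a Blocked rdev to become
 * unblocked, then drop the pending-IO reference the caller took on it.
 */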
void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

void md_finish_reshape(struct mddev *mddev)
{
	/* called by personality module when reshape completes. */
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);

/* Bad block management */

/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
		       int is_new)
{
	struct mddev *mddev = rdev->mddev;
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
	if (rv == 0) {
		/* Make sure they get written out promptly */
		if (test_bit(ExternalBbl, &rdev->flags))
			sysfs_notify(&rdev->kobj, NULL,
				     "unacknowledged_bad_blocks");
		sysfs_notify_dirent_safe(rdev->sysfs_state);
		set_mask_bits(&mddev->sb_flags, 0,
			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
		md_wakeup_thread(rdev->mddev->thread);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			 int is_new)
{
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_clear(&rdev->badblocks, s, sectors);
	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
	return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);

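/*
 * Reboot notifier: stop writes and switch arrays to safe-mode, and give
 * the devices a moment to settle, so an imminent reboot does not leave
 * arrays dirty and in need of a full resync.
 */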
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	struct mddev *mddev;
	int need_delay = 0;

	for_each_mddev(mddev, tmp) {
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
	}
	/*
	 * certain more exotic SCSI devices are known to be
	 * volatile wrt too early system reboots. While the
	 * right place to handle this issue is the given
	 * driver, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		mdelay(1000*1);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
}

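/*
 * Module initialisation: create the md workqueues, register the "md"
 * and "mdp" block majors and their probe regions, hook up the reboot
 * notifier and sysctl table, then create /proc/mdstat via md_geninit().
 */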
static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
		goto err_md;

	if ((ret = register_blkdev(0, "mdp")) < 0)
		goto err_mdp;
	mdp_major = ret;

	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

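/*
 * Clustered-MD helper, used by md_reload_sb() below: apply superblock
 * changes made by another node (array resize, device role changes,
 * reshape progress) to this node's view of the array.
 */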
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2;
	int role, ret;
	char b[BDEVNAME_SIZE];

	/*
	 * If size is changed in another node then we need to
	 * do resize as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			md_bitmap_update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each(rdev2, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == 0xfffe) {
				pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
				md_kick_rdev_from_array(rdev2);
				continue;
			}
			else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * got activated except reshape is happening.
			 */
			if (rdev2->raid_disk == -1 && role != 0xffff &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE)) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %s\n",
					bdevname(rdev2->bdev,b));
				/* wakeup mddev->thread here, so array could
				 * perform resync with the new activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* device faulty
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * one who initiated the error.
			 */
			if ((role == 0xfffe) || (role == 0xfffd)) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
		update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));

	/*
	 * mddev->delta_disks has already been updated in update_raid_disks,
	 * so it is time to check reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * reshape is happening in the remote node, we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* reshape is just done in another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
				__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set
	 */

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * device In_sync and mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(rdev, mddev) {
		if (rdev->desc_nr == nr)
			break;
	}

	if (!rdev || rdev->desc_nr != nr) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdev's to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

static void autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev,0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev;
	struct list_head *tmp;
	int delay = 1;

	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);

	unregister_blkdev(MD_MAJOR,"md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	for_each_mddev(mddev, tmp) {
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * for_each_mddev() will call mddev_put() at the end of each
		 * iteration.  As the mddev is now fully clear, this will
		 * schedule the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
	}
	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
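/*
 * Note: these module parameters can also be given on the kernel command
 * line, e.g. "md_mod.start_ro=1" to keep newly assembled arrays in
 * auto-read-only mode until the first write arrives.
 */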

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);