md.c 244.5 KB
Newer Older
L
Linus Torvalds 已提交
1 2
/*
   md.c : Multiple Devices driver for Linux
3
     Copyright (C) 1998, 1999, 2000 Ingo Molnar
L
Linus Torvalds 已提交
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

22 23 24
   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

L
Linus Torvalds 已提交
25 26 27 28 29 30 31 32
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 34 35 36 37 38 39 40 41 42 43 44

   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicated, like
         adding a device to an array when it has incompatible metadata
     pr_info() for every interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

L
Linus Torvalds 已提交
45 46
*/

47
#include <linux/sched/signal.h>
48
#include <linux/kthread.h>
49
#include <linux/blkdev.h>
50
#include <linux/badblocks.h>
L
Linus Torvalds 已提交
51
#include <linux/sysctl.h>
52
#include <linux/seq_file.h>
A
Al Viro 已提交
53
#include <linux/fs.h>
54
#include <linux/poll.h>
55
#include <linux/ctype.h>
56
#include <linux/string.h>
57 58 59
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
60
#include <linux/module.h>
61
#include <linux/reboot.h>
62
#include <linux/file.h>
63
#include <linux/compat.h>
64
#include <linux/delay.h>
65 66
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
67
#include <linux/slab.h>
68 69
#include <linux/percpu-refcount.h>

70
#include <trace/events/block.h>
71
#include "md.h"
72
#include "md-bitmap.h"
73
#include "md-cluster.h"
L
Linus Torvalds 已提交
74 75

#ifndef MODULE
76
static void autostart_arrays(int part);
L
Linus Torvalds 已提交
77 78
#endif

79 80 81 82 83
/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
84
static LIST_HEAD(pers_list);
L
Linus Torvalds 已提交
85 86
static DEFINE_SPINLOCK(pers_lock);

87
struct md_cluster_operations *md_cluster_ops;
88
EXPORT_SYMBOL(md_cluster_ops);
89 90 91
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

92
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
T
Tejun Heo 已提交
93 94
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
95

96 97
static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
98
static void mddev_detach(struct mddev *mddev);
99

100 101 102 103 104 105
/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
L
Linus Torvalds 已提交
106 107 108 109
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
110
 * the RAID driver will use the maximum available bandwidth if the IO
L
Linus Torvalds 已提交
111 112 113 114 115
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
116
 * or /sys/block/mdX/md/sync_speed_{min,max}
L
Linus Torvalds 已提交
117 118 119 120
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
121
static inline int speed_min(struct mddev *mddev)
122 123 124 125 126
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

127
static inline int speed_max(struct mddev *mddev)
128 129 130 131
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}
L
Linus Torvalds 已提交
132 133 134

static struct ctl_table_header *raid_table_header;

135
static struct ctl_table raid_table[] = {
L
Linus Torvalds 已提交
136 137 138 139
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
140
		.mode		= S_IRUGO|S_IWUSR,
141
		.proc_handler	= proc_dointvec,
L
Linus Torvalds 已提交
142 143 144 145 146
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
147
		.mode		= S_IRUGO|S_IWUSR,
148
		.proc_handler	= proc_dointvec,
L
Linus Torvalds 已提交
149
	},
150
	{ }
L
Linus Torvalds 已提交
151 152
};

153
static struct ctl_table raid_dir_table[] = {
L
Linus Torvalds 已提交
154 155 156
	{
		.procname	= "raid",
		.maxlen		= 0,
157
		.mode		= S_IRUGO|S_IXUGO,
L
Linus Torvalds 已提交
158 159
		.child		= raid_table,
	},
160
	{ }
L
Linus Torvalds 已提交
161 162
};

163
static struct ctl_table raid_root_table[] = {
L
Linus Torvalds 已提交
164 165 166 167 168 169
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
170
	{  }
L
Linus Torvalds 已提交
171 172
};

173
static const struct block_device_operations md_fops;
L
Linus Torvalds 已提交
174

175 176
static int start_readonly;

177 178 179 180 181 182 183 184 185 186
/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it.  This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

187
/* bio_clone_mddev
188
 * like bio_clone_bioset, but with a local bio set
189 190 191
 */

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
192
			    struct mddev *mddev)
193 194 195
{
	struct bio *b;

196
	if (!mddev || !bioset_initialized(&mddev->bio_set))
197 198
		return bio_alloc(gfp_mask, nr_iovecs);

199
	b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
200 201 202 203 204 205
	if (!b)
		return NULL;
	return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

206 207
static struct bio *md_bio_alloc_sync(struct mddev *mddev)
{
208
	if (!mddev || !bioset_initialized(&mddev->sync_set))
209 210
		return bio_alloc(GFP_NOIO, 1);

211
	return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
212 213
}

214 215 216 217 218 219 220 221 222 223
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
224
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
225
static atomic_t md_event_count;
226
void md_new_event(struct mddev *mddev)
227 228 229 230
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
231
EXPORT_SYMBOL_GPL(md_new_event);
232

L
Linus Torvalds 已提交
233 234 235 236 237 238 239 240 241 242 243 244 245 246
/*
 * Enables to iterate over all existing md arrays
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while own
 * a reference to the current mddev and must mddev_put it.
 */
247
#define for_each_mddev(_mddev,_tmp)					\
L
Linus Torvalds 已提交
248
									\
249
	for (({ spin_lock(&all_mddevs_lock);				\
250 251 252 253
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
L
Linus Torvalds 已提交
254
		spin_unlock(&all_mddevs_lock);				\
255 256 257
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
L
Linus Torvalds 已提交
258
	     ({ spin_lock(&all_mddevs_lock);				\
259
		_tmp = _tmp->next;})					\
L
Linus Torvalds 已提交
260 261
		)

262 263 264 265 266 267 268
/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (mddev->suspended)
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (mddev->suspend_lo >= mddev->suspend_hi)
		return false;
	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
		return false;
	if (bio_end_sector(bio) < mddev->suspend_lo)
		return false;
	return true;
}

S
Shaohua Li 已提交
284 285 286 287
void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	rcu_read_lock();
288
	if (is_suspended(mddev, bio)) {
S
Shaohua Li 已提交
289 290 291 292
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
293
			if (!is_suspended(mddev, bio))
S
Shaohua Li 已提交
294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	if (!mddev->pers->make_request(mddev, bio)) {
		atomic_dec(&mddev->active_io);
		wake_up(&mddev->sb_wait);
		goto check_suspended;
	}

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}
EXPORT_SYMBOL(md_handle_request);

315
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
L
Linus Torvalds 已提交
316
{
317
	const int rw = bio_data_dir(bio);
318
	struct mddev *mddev = q->queuedata;
319
	unsigned int sectors;
G
Gu Zheng 已提交
320
	int cpu;
321

322
	blk_queue_split(q, &bio);
323

N
NeilBrown 已提交
324
	if (mddev == NULL || mddev->pers == NULL) {
325
		bio_io_error(bio);
326
		return BLK_QC_T_NONE;
327
	}
328
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
329
		if (bio_sectors(bio) != 0)
330
			bio->bi_status = BLK_STS_IOERR;
331
		bio_endio(bio);
332
		return BLK_QC_T_NONE;
333
	}
334

335 336 337 338 339
	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
S
Shaohua Li 已提交
340
	/* bio could be mergeable after passing to underlayer */
J
Jens Axboe 已提交
341
	bio->bi_opf &= ~REQ_NOMERGE;
S
Shaohua Li 已提交
342 343

	md_handle_request(mddev, bio);
344

G
Gu Zheng 已提交
345 346 347 348
	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
	part_stat_unlock();
349

350
	return BLK_QC_T_NONE;
351 352
}

353 354 355
/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
N
NeilBrown 已提交
356 357
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
358
 */
359
void mddev_suspend(struct mddev *mddev)
360
{
361
	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
362
	lockdep_assert_held(&mddev->reconfig_mutex);
363 364
	if (mddev->suspended++)
		return;
365
	synchronize_rcu();
366
	wake_up(&mddev->sb_wait);
367 368
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	smp_mb__after_atomic();
369 370
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);
371 372
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
373 374

	del_timer_sync(&mddev->safemode_timer);
375
}
376
EXPORT_SYMBOL_GPL(mddev_suspend);
377

378
void mddev_resume(struct mddev *mddev)
379
{
380
	lockdep_assert_held(&mddev->reconfig_mutex);
381 382
	if (--mddev->suspended)
		return;
383 384
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);
385

386
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
387 388
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
L
Linus Torvalds 已提交
389
}
390
EXPORT_SYMBOL_GPL(mddev_resume);
L
Linus Torvalds 已提交
391

392
int mddev_congested(struct mddev *mddev, int bits)
393
{
394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409
	struct md_personality *pers = mddev->pers;
	int ret = 0;

	rcu_read_lock();
	if (mddev->suspended)
		ret = 1;
	else if (pers && pers->congested)
		ret = pers->congested(mddev, bits);
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	return mddev_congested(mddev, bits);
410 411
}

412
/*
T
Tejun Heo 已提交
413
 * Generic flush handling for md
414 415
 */

416
static void md_end_flush(struct bio *bio)
417
{
418
	struct md_rdev *rdev = bio->bi_private;
419
	struct mddev *mddev = rdev->mddev;
420 421 422 423

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
T
Tejun Heo 已提交
424
		/* The pre-request flush has finished */
T
Tejun Heo 已提交
425
		queue_work(md_wq, &mddev->flush_work);
426 427 428 429
	}
	bio_put(bio);
}

N
NeilBrown 已提交
430 431
static void md_submit_flush_data(struct work_struct *ws);

432
static void submit_flushes(struct work_struct *ws)
433
{
434
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
435
	struct md_rdev *rdev;
436

N
NeilBrown 已提交
437 438
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
439
	rcu_read_lock();
N
NeilBrown 已提交
440
	rdev_for_each_rcu(rdev, mddev)
441 442 443 444 445 446 447 448 449 450
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
451
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
T
Tejun Heo 已提交
452
			bi->bi_end_io = md_end_flush;
453
			bi->bi_private = rdev;
454
			bio_set_dev(bi, rdev->bdev);
455
			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
456
			atomic_inc(&mddev->flush_pending);
457
			submit_bio(bi);
458 459 460 461
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
N
NeilBrown 已提交
462 463
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
464 465
}

T
Tejun Heo 已提交
466
static void md_submit_flush_data(struct work_struct *ws)
467
{
468
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
T
Tejun Heo 已提交
469
	struct bio *bio = mddev->flush_bio;
470

471 472 473 474 475 476 477 478 479
	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock, because other bios passed md_handle_request suspend check
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);

480
	if (bio->bi_iter.bi_size == 0)
481
		/* an empty barrier - all done */
482
		bio_endio(bio);
483
	else {
J
Jens Axboe 已提交
484
		bio->bi_opf &= ~REQ_PREFLUSH;
485
		md_handle_request(mddev, bio);
486 487 488
	}
}

489
void md_flush_request(struct mddev *mddev, struct bio *bio)
490
{
491
	spin_lock_irq(&mddev->lock);
492
	wait_event_lock_irq(mddev->sb_wait,
T
Tejun Heo 已提交
493
			    !mddev->flush_bio,
494
			    mddev->lock);
T
Tejun Heo 已提交
495
	mddev->flush_bio = bio;
496
	spin_unlock_irq(&mddev->lock);
497

498 499
	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
500
}
T
Tejun Heo 已提交
501
EXPORT_SYMBOL(md_flush_request);
502

503
static inline struct mddev *mddev_get(struct mddev *mddev)
L
Linus Torvalds 已提交
504 505 506 507 508
{
	atomic_inc(&mddev->active);
	return mddev;
}

509
static void mddev_delayed_delete(struct work_struct *ws);
510

511
static void mddev_put(struct mddev *mddev)
L
Linus Torvalds 已提交
512
{
513 514 515 516
	struct bio_set bs, sync_bs;

	memset(&bs, 0, sizeof(bs));
	memset(&sync_bs, 0, sizeof(sync_bs));
517

L
Linus Torvalds 已提交
518 519
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
520
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
521 522 523
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
524
		list_del_init(&mddev->all_mddevs);
525
		bs = mddev->bio_set;
S
Shaohua Li 已提交
526
		sync_bs = mddev->sync_set;
527 528
		memset(&mddev->bio_set, 0, sizeof(mddev->bio_set));
		memset(&mddev->sync_set, 0, sizeof(mddev->sync_set));
529
		if (mddev->gendisk) {
T
Tejun Heo 已提交
530 531 532 533
			/* We did a probe so need to clean up.  Call
			 * queue_work inside the spinlock so that
			 * flush_workqueue() after mddev_find will
			 * succeed in waiting for the work to be done.
534 535
			 */
			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
T
Tejun Heo 已提交
536
			queue_work(md_misc_wq, &mddev->del_work);
537 538 539 540
		} else
			kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
541 542
	bioset_exit(&bs);
	bioset_exit(&sync_bs);
L
Linus Torvalds 已提交
543 544
}

545
static void md_safemode_timeout(struct timer_list *t);
546

547
void mddev_init(struct mddev *mddev)
548 549 550 551 552 553
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
554
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
555 556 557
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
558
	spin_lock_init(&mddev->lock);
559 560 561 562
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
563
	mddev->reshape_backwards = 0;
564
	mddev->last_sync_action = "none";
565 566 567 568
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
569
EXPORT_SYMBOL_GPL(mddev_init);
570

571
static struct mddev *mddev_find(dev_t unit)
L
Linus Torvalds 已提交
572
{
573
	struct mddev *mddev, *new = NULL;
L
Linus Torvalds 已提交
574

575 576 577
	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

L
Linus Torvalds 已提交
578 579
 retry:
	spin_lock(&all_mddevs_lock);
580 581 582 583 584 585 586 587 588 589 590 591

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
L
Linus Torvalds 已提交
592
			spin_unlock(&all_mddevs_lock);
593 594
			new->hold_active = UNTIL_IOCTL;
			return new;
L
Linus Torvalds 已提交
595
		}
596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}
613

614 615 616 617 618 619 620 621 622 623
			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
L
Linus Torvalds 已提交
624 625 626 627 628 629
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

630
	new = kzalloc(sizeof(*new), GFP_KERNEL);
L
Linus Torvalds 已提交
631 632 633 634 635 636 637 638 639
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

640
	mddev_init(new);
L
Linus Torvalds 已提交
641 642 643 644

	goto retry;
}

645 646
static struct attribute_group md_redundancy_group;

647
void mddev_unlock(struct mddev *mddev)
L
Linus Torvalds 已提交
648
{
649
	if (mddev->to_remove) {
650 651 652 653
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
654 655 656 657 658 659 660
		 * So hold set sysfs_active while the remove in happeing,
		 * and anything else which might set ->to_remove or my
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
661
		 */
662 663
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
664
		mddev->sysfs_active = 1;
665 666
		mutex_unlock(&mddev->reconfig_mutex);

N
NeilBrown 已提交
667 668 669 670 671 672 673 674 675 676
		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
677
		}
678
		mddev->sysfs_active = 0;
679 680
	} else
		mutex_unlock(&mddev->reconfig_mutex);
L
Linus Torvalds 已提交
681

C
Chris Dunlop 已提交
682 683
	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
684 685
	 */
	spin_lock(&pers_lock);
686
	md_wakeup_thread(mddev->thread);
687
	wake_up(&mddev->sb_wait);
688
	spin_unlock(&pers_lock);
L
Linus Torvalds 已提交
689
}
690
EXPORT_SYMBOL_GPL(mddev_unlock);
L
Linus Torvalds 已提交
691

692
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
693 694 695 696 697 698 699 700 701
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
702
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
703 704

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
L
Linus Torvalds 已提交
705
{
706
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
707

N
NeilBrown 已提交
708
	rdev_for_each(rdev, mddev)
L
Linus Torvalds 已提交
709 710
		if (rdev->bdev->bd_dev == dev)
			return rdev;
711

L
Linus Torvalds 已提交
712 713 714
	return NULL;
}

715
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
716 717 718 719 720 721 722 723 724
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}
725
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
726

727
static struct md_personality *find_pers(int level, char *clevel)
728
{
729
	struct md_personality *pers;
730 731
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
732
			return pers;
733 734 735
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
736 737 738
	return NULL;
}

739
/* return the offset of the super block in 512byte sectors */
740
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
L
Linus Torvalds 已提交
741
{
742
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
743
	return MD_NEW_SIZE_SECTORS(num_sectors);
L
Linus Torvalds 已提交
744 745
}

746
static int alloc_disk_sb(struct md_rdev *rdev)
L
Linus Torvalds 已提交
747 748
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
749
	if (!rdev->sb_page)
750
		return -ENOMEM;
L
Linus Torvalds 已提交
751 752 753
	return 0;
}

754
void md_rdev_clear(struct md_rdev *rdev)
L
Linus Torvalds 已提交
755 756
{
	if (rdev->sb_page) {
757
		put_page(rdev->sb_page);
L
Linus Torvalds 已提交
758 759
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
760
		rdev->sb_start = 0;
761
		rdev->sectors = 0;
L
Linus Torvalds 已提交
762
	}
763 764 765 766
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
767
	badblocks_exit(&rdev->badblocks);
L
Linus Torvalds 已提交
768
}
769
EXPORT_SYMBOL_GPL(md_rdev_clear);
L
Linus Torvalds 已提交
770

771
static void super_written(struct bio *bio)
772
{
773
	struct md_rdev *rdev = bio->bi_private;
774
	struct mddev *mddev = rdev->mddev;
775

776 777
	if (bio->bi_status) {
		pr_err("md: super_written gets error=%d\n", bio->bi_status);
778
		md_error(mddev, rdev);
779 780
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
781
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
782 783 784 785
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);
786

787 788
	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
789
	rdev_dec_pending(rdev, mddev);
N
Neil Brown 已提交
790
	bio_put(bio);
791 792
}

793
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
794 795 796 797 798 799 800 801
		   sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
802 803 804
	struct bio *bio;
	int ff = 0;

805 806 807
	if (!page)
		return;

808 809 810
	if (test_bit(Faulty, &rdev->flags))
		return;

811
	bio = md_bio_alloc_sync(mddev);
812

813 814
	atomic_inc(&rdev->nr_pending);

815
	bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
816
	bio->bi_iter.bi_sector = sector;
817 818 819
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;
820 821 822 823 824

	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		ff = MD_FAILFAST;
J
Jan Kara 已提交
825
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
826

827
	atomic_inc(&mddev->pending_writes);
828
	submit_bio(bio);
829 830
}

831
int md_super_wait(struct mddev *mddev)
832
{
T
Tejun Heo 已提交
833
	/* wait for all superblock writes that were scheduled to complete */
834
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
835
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
836 837
		return -EAGAIN;
	return 0;
838 839
}

840
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
M
Mike Christie 已提交
841
		 struct page *page, int op, int op_flags, bool metadata_op)
L
Linus Torvalds 已提交
842
{
843
	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
L
Linus Torvalds 已提交
844 845
	int ret;

846 847 848 849
	if (metadata_op && rdev->meta_bdev)
		bio_set_dev(bio, rdev->meta_bdev);
	else
		bio_set_dev(bio, rdev->bdev);
M
Mike Christie 已提交
850
	bio_set_op_attrs(bio, op, op_flags);
J
Jonathan Brassow 已提交
851
	if (metadata_op)
852
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
853 854 855
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
856
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
J
Jonathan Brassow 已提交
857
	else
858
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
L
Linus Torvalds 已提交
859
	bio_add_page(bio, page, size, 0);
860 861

	submit_bio_wait(bio);
L
Linus Torvalds 已提交
862

863
	ret = !bio->bi_status;
L
Linus Torvalds 已提交
864 865 866
	bio_put(bio);
	return ret;
}
867
EXPORT_SYMBOL_GPL(sync_page_io);
L
Linus Torvalds 已提交
868

869
static int read_disk_sb(struct md_rdev *rdev, int size)
L
Linus Torvalds 已提交
870 871
{
	char b[BDEVNAME_SIZE];
N
NeilBrown 已提交
872

L
Linus Torvalds 已提交
873 874 875
	if (rdev->sb_loaded)
		return 0;

M
Mike Christie 已提交
876
	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
L
Linus Torvalds 已提交
877 878 879 880 881
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
882 883
	pr_err("md: disabled device %s, could not read superblock.\n",
	       bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
884 885 886
	return -EINVAL;
}

887
static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
L
Linus Torvalds 已提交
888
{
889
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
A
Andre Noll 已提交
890 891 892
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
L
Linus Torvalds 已提交
893 894
}

895
static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
L
Linus Torvalds 已提交
896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

A
Andre Noll 已提交
917
	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
L
Linus Torvalds 已提交
918
abort:
919 920
	kfree(tmp1);
	kfree(tmp2);
L
Linus Torvalds 已提交
921 922 923
	return ret;
}

924 925 926 927 928 929
static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

930
static unsigned int calc_sb_csum(mdp_super_t *sb)
L
Linus Torvalds 已提交
931
{
932 933 934
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
L
Linus Torvalds 已提交
935 936 937 938
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
L
Linus Torvalds 已提交
955
	sb->sb_csum = disk_csum;
956
#endif
L
Linus Torvalds 已提交
957 958 959 960 961 962 963 964 965 966 967
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
968
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
L
Linus Torvalds 已提交
969 970 971 972 973 974 975 976 977
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
978
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
L
Linus Torvalds 已提交
979 980 981 982 983
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
984
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
L
Linus Torvalds 已提交
985 986 987 988 989 990
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
991 992
	char		    *name;
	struct module	    *owner;
993 994
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
995
					  int minor_version);
996 997 998 999
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
1000
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
1001
						sector_t num_sectors);
1002 1003
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
L
Linus Torvalds 已提交
1004 1005
};

1006 1007 1008 1009 1010 1011 1012 1013
/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
1014
int md_check_no_bitmap(struct mddev *mddev)
1015
{
1016
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1017
		return 0;
1018
	pr_warn("%s: bitmaps are not supported for %s\n",
1019 1020 1021 1022 1023
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

L
Linus Torvalds 已提交
1024
/*
1025
 * load_super for 0.90.0
L
Linus Torvalds 已提交
1026
 */
1027
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
L
Linus Torvalds 已提交
1028 1029 1030 1031 1032 1033
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
1034
	 * Calculate the position of the superblock (512byte sectors),
L
Linus Torvalds 已提交
1035 1036 1037 1038
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
1039
	rdev->sb_start = calc_dev_sboffset(rdev);
L
Linus Torvalds 已提交
1040

1041
	ret = read_disk_sb(rdev, MD_SB_BYTES);
1042 1043
	if (ret)
		return ret;
L
Linus Torvalds 已提交
1044 1045 1046 1047

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
1048
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1049 1050

	if (sb->md_magic != MD_SB_MAGIC) {
1051
		pr_warn("md: invalid raid superblock magic on %s\n", b);
L
Linus Torvalds 已提交
1052 1053 1054 1055
		goto abort;
	}

	if (sb->major_version != 0 ||
1056 1057
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
1058 1059
		pr_warn("Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version, b);
L
Linus Torvalds 已提交
1060 1061 1062 1063 1064 1065
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

1066
	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1067
		pr_warn("md: invalid superblock checksum on %s\n", b);
L
Linus Torvalds 已提交
1068 1069 1070 1071 1072
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
1073
	rdev->new_data_offset = 0;
1074
	rdev->sb_size = MD_SB_BYTES;
1075
	rdev->badblocks.shift = -1;
L
Linus Torvalds 已提交
1076 1077 1078 1079 1080 1081

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

1082
	if (!refdev) {
L
Linus Torvalds 已提交
1083
		ret = 1;
1084
	} else {
L
Linus Torvalds 已提交
1085
		__u64 ev1, ev2;
1086
		mdp_super_t *refsb = page_address(refdev->sb_page);
1087
		if (!md_uuid_equal(refsb, sb)) {
1088
			pr_warn("md: %s has different UUID to %s\n",
L
Linus Torvalds 已提交
1089 1090 1091
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
1092
		if (!md_sb_equal(refsb, sb)) {
1093 1094
			pr_warn("md: %s has same UUID but different superblock to %s\n",
				b, bdevname(refdev->bdev, b2));
L
Linus Torvalds 已提交
1095 1096 1097 1098 1099 1100
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
1101
		else
L
Linus Torvalds 已提交
1102 1103
			ret = 0;
	}
1104
	rdev->sectors = rdev->sb_start;
1105 1106 1107 1108
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
1109 1110 1111
	if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
	    sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;
L
Linus Torvalds 已提交
1112

1113
	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1114 1115 1116
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

L
Linus Torvalds 已提交
1117 1118 1119 1120 1121 1122 1123
 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
1124
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
1125 1126
{
	mdp_disk_t *desc;
1127
	mdp_super_t *sb = page_address(rdev->sb_page);
1128
	__u64 ev1 = md_event(sb);
L
Linus Torvalds 已提交
1129

1130
	rdev->raid_disk = -1;
1131 1132
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
1133
	clear_bit(Bitmap_sync, &rdev->flags);
1134 1135
	clear_bit(WriteMostly, &rdev->flags);

L
Linus Torvalds 已提交
1136 1137 1138 1139
	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
1140
		mddev->external = 0;
1141
		mddev->chunk_sectors = sb->chunk_size >> 9;
L
Linus Torvalds 已提交
1142 1143 1144
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
1145
		mddev->clevel[0] = 0;
L
Linus Torvalds 已提交
1146 1147
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
1148
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1149
		mddev->events = ev1;
1150
		mddev->bitmap_info.offset = 0;
1151 1152
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
1153
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1154
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1155
		mddev->reshape_backwards = 0;
L
Linus Torvalds 已提交
1156

1157 1158 1159 1160 1161
		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
1162
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1163 1164
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
1165 1166 1167 1168 1169
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
1170
			mddev->new_chunk_sectors = mddev->chunk_sectors;
1171 1172
		}

L
Linus Torvalds 已提交
1173 1174 1175
		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
1176
			if (sb->events_hi == sb->cp_events_hi &&
L
Linus Torvalds 已提交
1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188
				sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;
1189 1190

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1191
		    mddev->bitmap_info.file == NULL) {
1192 1193
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
1194
			mddev->bitmap_info.space =
1195
				mddev->bitmap_info.default_space;
1196
		}
1197

1198
	} else if (mddev->pers == NULL) {
1199 1200
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
L
Linus Torvalds 已提交
1201
		++ev1;
1202 1203
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1204
			if (ev1 < mddev->events)
1205
				return -EINVAL;
1206 1207 1208 1209 1210 1211
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
1212 1213
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
1214 1215 1216 1217 1218
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
1219

L
Linus Torvalds 已提交
1220 1221 1222 1223
	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
1224
			set_bit(Faulty, &rdev->flags);
1225 1226
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
1227
			set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
1228
			rdev->raid_disk = desc->raid_disk;
1229
			rdev->saved_raid_disk = desc->raid_disk;
1230 1231 1232 1233 1234 1235 1236 1237
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
L
Linus Torvalds 已提交
1238
		}
1239 1240
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
1241 1242
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
1243
	} else /* MULTIPATH are always insync */
1244
		set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
1245 1246 1247 1248 1249 1250
	return 0;
}

/*
 * sync_super for 0.90.0
 */
1251
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
1252 1253
{
	mdp_super_t *sb;
1254
	struct md_rdev *rdev2;
L
Linus Torvalds 已提交
1255
	int next_spare = mddev->raid_disks;
1256

L
Linus Torvalds 已提交
1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269
	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

1270 1271
	rdev->sb_size = MD_SB_BYTES;

1272
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

1285
	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
L
Linus Torvalds 已提交
1286
	sb->level = mddev->level;
A
Andre Noll 已提交
1287
	sb->size = mddev->dev_sectors / 2;
L
Linus Torvalds 已提交
1288 1289
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
1290
	sb->not_persistent = 0;
1291
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
L
Linus Torvalds 已提交
1292 1293 1294 1295
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

1296 1297 1298 1299 1300 1301 1302 1303
	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
1304
		sb->new_chunk = mddev->new_chunk_sectors << 9;
1305 1306
	}
	mddev->minor_version = sb->minor_version;
L
Linus Torvalds 已提交
1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
1318
	sb->chunk_size = mddev->chunk_sectors << 9;
L
Linus Torvalds 已提交
1319

1320
	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1321 1322
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

L
Linus Torvalds 已提交
1323
	sb->disks[0].state = (1<<MD_DISK_REMOVED);
N
NeilBrown 已提交
1324
	rdev_for_each(rdev2, mddev) {
L
Linus Torvalds 已提交
1325
		mdp_disk_t *d;
1326
		int desc_nr;
1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
1340
			desc_nr = rdev2->raid_disk;
L
Linus Torvalds 已提交
1341
		else
1342
			desc_nr = next_spare++;
1343
		rdev2->desc_nr = desc_nr;
L
Linus Torvalds 已提交
1344 1345 1346 1347 1348
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
1349
		if (is_active)
L
Linus Torvalds 已提交
1350 1351 1352
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
1353
		if (test_bit(Faulty, &rdev2->flags))
L
Linus Torvalds 已提交
1354
			d->state = (1<<MD_DISK_FAULTY);
1355
		else if (is_active) {
L
Linus Torvalds 已提交
1356
			d->state = (1<<MD_DISK_ACTIVE);
1357 1358
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
L
Linus Torvalds 已提交
1359 1360 1361 1362 1363 1364 1365
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
1366 1367
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1368 1369
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
L
Linus Torvalds 已提交
1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

1392 1393 1394 1395
/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
1396
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1397
{
A
Andre Noll 已提交
1398
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1399
		return 0; /* component must fit device */
1400
	if (rdev->mddev->bitmap_info.offset)
1401
		return 0; /* can't move bitmap */
1402
	rdev->sb_start = calc_dev_sboffset(rdev);
1403 1404
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
1405 1406 1407
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
1408 1409 1410
	if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
	    rdev->mddev->level >= 1)
		num_sectors = (sector_t)(2ULL << 32) - 2;
1411 1412
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1413
		       rdev->sb_page);
1414
	} while (md_super_wait(rdev->mddev) < 0);
1415
	return num_sectors;
1416 1417
}

1418 1419 1420 1421 1422 1423
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}
1424

L
Linus Torvalds 已提交
1425 1426 1427 1428
/*
 * version 1 superblock
 */

1429
static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
L
Linus Torvalds 已提交
1430
{
1431 1432
	__le32 disk_csum;
	u32 csum;
L
Linus Torvalds 已提交
1433 1434
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1435
	__le32 *isuper = (__le32*)sb;
L
Linus Torvalds 已提交
1436 1437 1438 1439

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
1440
	for (; size >= 4; size -= 4)
L
Linus Torvalds 已提交
1441 1442 1443
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
1444
		newcsum += le16_to_cpu(*(__le16*) isuper);
L
Linus Torvalds 已提交
1445 1446 1447 1448 1449 1450

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

1451
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
L
Linus Torvalds 已提交
1452 1453 1454
{
	struct mdp_superblock_1 *sb;
	int ret;
1455
	sector_t sb_start;
1456
	sector_t sectors;
L
Linus Torvalds 已提交
1457
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1458
	int bmask;
L
Linus Torvalds 已提交
1459 1460

	/*
1461
	 * Calculate the position of the superblock in 512byte sectors.
L
Linus Torvalds 已提交
1462 1463 1464 1465 1466 1467 1468 1469
	 * It is always aligned to a 4K boundary and
	 * depeding on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
1470
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1471 1472
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
L
Linus Torvalds 已提交
1473 1474
		break;
	case 1:
1475
		sb_start = 0;
L
Linus Torvalds 已提交
1476 1477
		break;
	case 2:
1478
		sb_start = 8;
L
Linus Torvalds 已提交
1479 1480 1481 1482
		break;
	default:
		return -EINVAL;
	}
1483
	rdev->sb_start = sb_start;
L
Linus Torvalds 已提交
1484

1485 1486 1487 1488
	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
L
Linus Torvalds 已提交
1489 1490
	if (ret) return ret;

1491
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1492 1493 1494 1495

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1496
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1497
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
L
Linus Torvalds 已提交
1498 1499 1500
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1501
		pr_warn("md: invalid superblock checksum on %s\n",
L
Linus Torvalds 已提交
1502 1503 1504 1505
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
1506 1507
		pr_warn("md: data_size too small on %s\n",
			bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
1508 1509
		return -EINVAL;
	}
1510 1511 1512 1513 1514
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;
1515

L
Linus Torvalds 已提交
1516 1517
	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
1518 1519 1520 1521
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1522
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
L
Linus Torvalds 已提交
1523

1524
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1525
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1526
	if (rdev->sb_size & bmask)
1527 1528 1529
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
1530
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1531
		return -EINVAL;
1532 1533 1534
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
1535

1536 1537 1538 1539 1540
	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562
	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		u64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
M
Mike Christie 已提交
1563
				  rdev->bb_page, REQ_OP_READ, 0, true))
1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574
			return -EIO;
		bbp = (u64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
1575
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
1576 1577
				return -EINVAL;
		}
1578 1579
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;
1580

1581 1582
	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1583 1584 1585 1586 1587
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

1588
	if (!refdev) {
1589
		ret = 1;
1590
	} else {
L
Linus Torvalds 已提交
1591
		__u64 ev1, ev2;
1592
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
L
Linus Torvalds 已提交
1593 1594 1595 1596 1597

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
1598
			pr_warn("md: %s has strangely different superblock to %s\n",
L
Linus Torvalds 已提交
1599 1600 1601 1602 1603 1604 1605 1606
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
1607 1608 1609
			ret = 1;
		else
			ret = 0;
L
Linus Torvalds 已提交
1610
	}
1611 1612 1613 1614 1615 1616
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
L
Linus Torvalds 已提交
1617
		return -EINVAL;
1618
	rdev->sectors = le64_to_cpu(sb->data_size);
1619
	return ret;
L
Linus Torvalds 已提交
1620 1621
}

1622
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
1623
{
1624
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1625
	__u64 ev1 = le64_to_cpu(sb->events);
L
Linus Torvalds 已提交
1626

1627
	rdev->raid_disk = -1;
1628 1629
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
1630
	clear_bit(Bitmap_sync, &rdev->flags);
1631 1632
	clear_bit(WriteMostly, &rdev->flags);

L
Linus Torvalds 已提交
1633 1634 1635
	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
1636
		mddev->external = 0;
1637
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1638 1639
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
L
Linus Torvalds 已提交
1640
		mddev->level = le32_to_cpu(sb->level);
1641
		mddev->clevel[0] = 0;
L
Linus Torvalds 已提交
1642 1643
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
A
Andre Noll 已提交
1644
		mddev->dev_sectors = le64_to_cpu(sb->size);
1645
		mddev->events = ev1;
1646
		mddev->bitmap_info.offset = 0;
1647 1648 1649 1650
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
1651
		mddev->bitmap_info.default_offset = 1024 >> 9;
1652
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1653 1654
		mddev->reshape_backwards = 0;

L
Linus Torvalds 已提交
1655 1656 1657 1658
		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;
1659

1660
		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1661
		    mddev->bitmap_info.file == NULL) {
1662 1663
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}
1678

1679 1680 1681 1682 1683
		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
1684
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1685 1686 1687 1688 1689
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
1690 1691 1692 1693 1694
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
1695
			mddev->new_chunk_sectors = mddev->chunk_sectors;
1696 1697
		}

1698
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1699
			set_bit(MD_HAS_JOURNAL, &mddev->flags);
1700

1701 1702
		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1703 1704 1705
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
1706 1707 1708 1709
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
					    MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
1710 1711
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
1712
	} else if (mddev->pers == NULL) {
1713 1714
		/* Insist of good event counter while assembling, except for
		 * spares (which don't need an event count) */
L
Linus Torvalds 已提交
1715
		++ev1;
1716 1717
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1718 1719
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1720 1721
			if (ev1 < mddev->events)
				return -EINVAL;
1722 1723 1724 1725 1726 1727
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
1728 1729
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
1730 1731 1732 1733 1734
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
L
Linus Torvalds 已提交
1735 1736
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
1737 1738
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1739
			role = MD_DISK_ROLE_SPARE;
1740 1741 1742
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
L
Linus Torvalds 已提交
1743
		switch(role) {
1744
		case MD_DISK_ROLE_SPARE: /* spare */
L
Linus Torvalds 已提交
1745
			break;
1746
		case MD_DISK_ROLE_FAULTY: /* faulty */
1747
			set_bit(Faulty, &rdev->flags);
L
Linus Torvalds 已提交
1748
			break;
1749 1750 1751
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
1752
				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1753 1754 1755
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
1756
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1757
			rdev->raid_disk = 0;
1758
			break;
L
Linus Torvalds 已提交
1759
		default:
1760
			rdev->saved_raid_disk = role;
1761
			if ((le32_to_cpu(sb->feature_map) &
1762
			     MD_FEATURE_RECOVERY_OFFSET)) {
1763
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1764 1765 1766 1767
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else
1768
				set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
1769 1770 1771
			rdev->raid_disk = role;
			break;
		}
1772 1773
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
1774 1775
		if (sb->devflags & FailFast1)
			set_bit(FailFast, &rdev->flags);
1776 1777
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
1778
	} else /* MULTIPATH are always insync */
1779
		set_bit(In_sync, &rdev->flags);
1780

L
Linus Torvalds 已提交
1781 1782 1783
	return 0;
}

1784
static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
1785 1786
{
	struct mdp_superblock_1 *sb;
1787
	struct md_rdev *rdev2;
L
Linus Torvalds 已提交
1788 1789 1790
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

1791
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1792 1793 1794

	sb->feature_map = 0;
	sb->pad0 = 0;
1795
	sb->recovery_offset = cpu_to_le64(0);
L
Linus Torvalds 已提交
1796 1797 1798 1799 1800 1801
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1802 1803
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
L
Linus Torvalds 已提交
1804 1805 1806
	else
		sb->resync_offset = cpu_to_le64(0);

1807
	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1808

1809
	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
A
Andre Noll 已提交
1810
	sb->size = cpu_to_le64(mddev->dev_sectors);
1811
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1812 1813
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
1814 1815 1816 1817
	if (test_bit(FailFast, &rdev->flags))
		sb->devflags |= FailFast1;
	else
		sb->devflags &= ~FailFast1;
1818

1819 1820 1821 1822
	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
1823 1824
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);
1825

1826 1827
	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1828
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1829
	}
1830

S
Shaohua Li 已提交
1831
	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1832
	    !test_bit(In_sync, &rdev->flags)) {
1833 1834 1835 1836
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
1837 1838 1839
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1840
	}
1841 1842 1843
	/* Note: recovery_offset and journal_tail share space  */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1844 1845 1846
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);
1847

1848 1849 1850 1851 1852 1853
	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
1854
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1855 1856 1857 1858
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1859 1860 1861 1862 1863 1864
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
1865
	}
1866

1867 1868 1869
	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888
	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		u64 *bbp = (u64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

retry:
			seq = read_seqbegin(&bb->lock);

			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
1889
				u64 internal_bb = p[i];
1890 1891
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
1892
				bbp[i] = cpu_to_le64(store_bb);
1893
			}
1894
			bb->changed = 0;
1895 1896 1897 1898 1899 1900 1901 1902 1903
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}

L
Linus Torvalds 已提交
1904
	max_dev = 0;
N
NeilBrown 已提交
1905
	rdev_for_each(rdev2, mddev)
L
Linus Torvalds 已提交
1906 1907
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;
1908

1909 1910
	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
1911
		sb->max_dev = cpu_to_le32(max_dev);
1912 1913 1914 1915
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
1916 1917 1918
	} else
		max_dev = le32_to_cpu(sb->max_dev);

L
Linus Torvalds 已提交
1919
	for (i=0; i<max_dev;i++)
1920
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1921

1922 1923
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
1924

1925
	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
1926 1927 1928 1929 1930
		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
			sb->feature_map |=
			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
		else
			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
1931 1932 1933 1934
		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
	}

N
NeilBrown 已提交
1935
	rdev_for_each(rdev2, mddev) {
L
Linus Torvalds 已提交
1936
		i = rdev2->desc_nr;
1937
		if (test_bit(Faulty, &rdev2->flags))
1938
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1939
		else if (test_bit(In_sync, &rdev2->flags))
L
Linus Torvalds 已提交
1940
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1941
		else if (test_bit(Journal, &rdev2->flags))
1942
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
1943
		else if (rdev2->raid_disk >= 0)
1944
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
L
Linus Torvalds 已提交
1945
		else
1946
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
L
Linus Torvalds 已提交
1947 1948 1949 1950 1951
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

1952
static unsigned long long
1953
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1954 1955
{
	struct mdp_superblock_1 *sb;
1956
	sector_t max_sectors;
A
Andre Noll 已提交
1957
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1958
		return 0; /* component must fit device */
1959 1960
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
1961
	if (rdev->sb_start < rdev->data_offset) {
1962
		/* minor versions 1 and 2; superblock before data */
1963
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1964 1965 1966
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
1967
	} else if (rdev->mddev->bitmap_info.offset) {
1968 1969 1970 1971
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
1972
		sector_t sb_start;
1973
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1974
		sb_start &= ~(sector_t)(4*2 - 1);
1975
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1976 1977
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
1978
		rdev->sb_start = sb_start;
1979
	}
1980
	sb = page_address(rdev->sb_page);
1981
	sb->data_size = cpu_to_le64(num_sectors);
1982
	sb->super_offset = cpu_to_le64(rdev->sb_start);
1983
	sb->sb_csum = calc_sb_1_csum(sb);
1984 1985 1986 1987
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
1988
	return num_sectors;
1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016

}

static int
super_1_allow_new_offset(struct md_rdev *rdev,
			 unsigned long long new_offset)
{
	/* All necessary checks on new >= old have been done */
	struct bitmap *bitmap;
	if (new_offset >= rdev->data_offset)
		return 1;

	/* with 1.0 metadata, there is no metadata to tread on
	 * so we can always move back */
	if (rdev->mddev->minor_version == 0)
		return 1;

	/* otherwise we must be sure not to step on
	 * any metadata, so stay:
	 * 36K beyond start of superblock
	 * beyond end of badblocks
	 * beyond write-intent bitmap
	 */
	if (rdev->sb_start + (32+4)*2 > new_offset)
		return 0;
	bitmap = rdev->mddev->bitmap;
	if (bitmap && !rdev->mddev->bitmap_info.file &&
	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
2017
	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2018 2019 2020 2021 2022
		return 0;
	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
		return 0;

	return 1;
2023
}
L
Linus Torvalds 已提交
2024

A
Adrian Bunk 已提交
2025
static struct super_type super_types[] = {
L
Linus Torvalds 已提交
2026 2027 2028
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
2029 2030 2031 2032
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
2033
		.allow_new_offset   = super_90_allow_new_offset,
L
Linus Torvalds 已提交
2034 2035 2036 2037
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
2038 2039 2040 2041
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
2042
		.allow_new_offset   = super_1_allow_new_offset,
L
Linus Torvalds 已提交
2043 2044 2045
	},
};

2046
static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057
{
	if (mddev->sync_super) {
		mddev->sync_super(mddev, rdev);
		return;
	}

	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));

	super_types[mddev->major_version].sync_super(mddev, rdev);
}

2058
static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
L
Linus Torvalds 已提交
2059
{
2060
	struct md_rdev *rdev, *rdev2;
L
Linus Torvalds 已提交
2061

2062
	rcu_read_lock();
2063 2064 2065 2066 2067 2068 2069 2070 2071 2072
	rdev_for_each_rcu(rdev, mddev1) {
		if (test_bit(Faulty, &rdev->flags) ||
		    test_bit(Journal, &rdev->flags) ||
		    rdev->raid_disk == -1)
			continue;
		rdev_for_each_rcu(rdev2, mddev2) {
			if (test_bit(Faulty, &rdev2->flags) ||
			    test_bit(Journal, &rdev2->flags) ||
			    rdev2->raid_disk == -1)
				continue;
2073
			if (rdev->bdev->bd_contains ==
2074 2075
			    rdev2->bdev->bd_contains) {
				rcu_read_unlock();
2076
				return 1;
2077
			}
2078 2079
		}
	}
2080
	rcu_read_unlock();
L
Linus Torvalds 已提交
2081 2082 2083 2084 2085
	return 0;
}

static LIST_HEAD(pending_raid_disks);

2086 2087 2088 2089 2090 2091 2092
/*
 * Try to register data integrity profile for an mddev
 *
 * This is called when an array is started and after a disk has been kicked
 * from the array. It only succeeds if all working and active component devices
 * are integrity capable with matching profiles.
 */
2093
int md_integrity_register(struct mddev *mddev)
2094
{
2095
	struct md_rdev *rdev, *reference = NULL;
2096 2097 2098

	if (list_empty(&mddev->disks))
		return 0; /* nothing to do */
2099 2100
	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
		return 0; /* shouldn't register, or already is */
N
NeilBrown 已提交
2101
	rdev_for_each(rdev, mddev) {
2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116
		/* skip spares and non-functional disks */
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (rdev->raid_disk < 0)
			continue;
		if (!reference) {
			/* Use the first rdev as the reference */
			reference = rdev;
			continue;
		}
		/* does this rdev's profile match the reference profile? */
		if (blk_integrity_compare(reference->bdev->bd_disk,
				rdev->bdev->bd_disk) < 0)
			return -EINVAL;
	}
2117 2118
	if (!reference || !bdev_get_integrity(reference->bdev))
		return 0;
2119 2120 2121 2122
	/*
	 * All component devices are integrity capable and have matching
	 * profiles, register the common profile for the md device.
	 */
2123 2124 2125
	blk_integrity_register(mddev->gendisk,
			       bdev_get_integrity(reference->bdev));

2126
	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2127
	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
2128
		pr_err("md: failed to create integrity pool for %s\n",
2129 2130 2131
		       mdname(mddev));
		return -EINVAL;
	}
2132 2133 2134 2135
	return 0;
}
EXPORT_SYMBOL(md_integrity_register);

2136 2137 2138 2139 2140
/*
 * Attempt to add an rdev, but only if it is consistent with the current
 * integrity profile
 */
int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
M
Martin K. Petersen 已提交
2141
{
2142 2143
	struct blk_integrity *bi_rdev;
	struct blk_integrity *bi_mddev;
2144
	char name[BDEVNAME_SIZE];
2145 2146

	if (!mddev->gendisk)
2147
		return 0;
2148 2149 2150

	bi_rdev = bdev_get_integrity(rdev->bdev);
	bi_mddev = blk_get_integrity(mddev->gendisk);
M
Martin K. Petersen 已提交
2151

2152
	if (!bi_mddev) /* nothing to do */
2153 2154 2155
		return 0;

	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2156 2157
		pr_err("%s: incompatible integrity profile for %s\n",
		       mdname(mddev), bdevname(rdev->bdev, name));
2158 2159 2160 2161
		return -ENXIO;
	}

	return 0;
M
Martin K. Petersen 已提交
2162
}
2163
EXPORT_SYMBOL(md_integrity_add_rdev);
M
Martin K. Petersen 已提交
2164

2165
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
L
Linus Torvalds 已提交
2166
{
2167
	char b[BDEVNAME_SIZE];
2168
	struct kobject *ko;
2169
	int err;
L
Linus Torvalds 已提交
2170

2171 2172 2173 2174
	/* prevent duplicates */
	if (find_rdev(mddev, rdev->bdev->bd_dev))
		return -EEXIST;

2175 2176 2177 2178
	if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
	    mddev->pers)
		return -EROFS;

2179
	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2180 2181 2182
	if (!test_bit(Journal, &rdev->flags) &&
	    rdev->sectors &&
	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2183 2184 2185 2186 2187 2188 2189 2190
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear)
			 */
			if (mddev->level > 0)
				return -ENOSPC;
		} else
2191
			mddev->dev_sectors = rdev->sectors;
2192
	}
L
Linus Torvalds 已提交
2193 2194 2195 2196 2197

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
2198
	rcu_read_lock();
L
Linus Torvalds 已提交
2199 2200
	if (rdev->desc_nr < 0) {
		int choice = 0;
2201 2202
		if (mddev->pers)
			choice = mddev->raid_disks;
2203
		while (md_find_rdev_nr_rcu(mddev, choice))
L
Linus Torvalds 已提交
2204 2205 2206
			choice++;
		rdev->desc_nr = choice;
	} else {
2207
		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2208
			rcu_read_unlock();
L
Linus Torvalds 已提交
2209
			return -EBUSY;
2210
		}
L
Linus Torvalds 已提交
2211
	}
2212
	rcu_read_unlock();
2213 2214
	if (!test_bit(Journal, &rdev->flags) &&
	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2215 2216
		pr_warn("md: %s: array is limited to %d devices\n",
			mdname(mddev), mddev->max_disks);
2217 2218
		return -EBUSY;
	}
2219
	bdevname(rdev->bdev,b);
2220
	strreplace(b, '/', '!');
2221

L
Linus Torvalds 已提交
2222
	rdev->mddev = mddev;
2223
	pr_debug("md: bind<%s>\n", b);
2224

2225
	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2226
		goto fail;
2227

T
Tejun Heo 已提交
2228
	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
N
NeilBrown 已提交
2229 2230 2231
	if (sysfs_create_link(&rdev->kobj, ko, "block"))
		/* failure here is OK */;
	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2232

2233
	list_add_rcu(&rdev->same_set, &mddev->disks);
2234
	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2235 2236

	/* May as well allow recovery to be retried once */
2237
	mddev->recovery_disabled++;
M
Martin K. Petersen 已提交
2238

L
Linus Torvalds 已提交
2239
	return 0;
2240 2241

 fail:
2242 2243
	pr_warn("md: failed to register dev-%s for %s\n",
		b, mdname(mddev));
2244
	return err;
L
Linus Torvalds 已提交
2245 2246
}

2247
static void md_delayed_delete(struct work_struct *ws)
2248
{
2249
	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2250
	kobject_del(&rdev->kobj);
2251
	kobject_put(&rdev->kobj);
2252 2253
}

2254
static void unbind_rdev_from_array(struct md_rdev *rdev)
L
Linus Torvalds 已提交
2255 2256
{
	char b[BDEVNAME_SIZE];
N
NeilBrown 已提交
2257

2258
	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2259
	list_del_rcu(&rdev->same_set);
2260
	pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
2261
	rdev->mddev = NULL;
2262
	sysfs_remove_link(&rdev->kobj, "block");
2263 2264
	sysfs_put(rdev->sysfs_state);
	rdev->sysfs_state = NULL;
2265
	rdev->badblocks.count = 0;
2266
	/* We need to delay this, otherwise we can deadlock when
2267 2268
	 * writing to 'remove' to "dev/state".  We also need
	 * to delay it due to rcu usage.
2269
	 */
2270
	synchronize_rcu();
2271 2272
	INIT_WORK(&rdev->del_work, md_delayed_delete);
	kobject_get(&rdev->kobj);
T
Tejun Heo 已提交
2273
	queue_work(md_misc_wq, &rdev->del_work);
L
Linus Torvalds 已提交
2274 2275 2276 2277 2278 2279 2280
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
2281
static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
L
Linus Torvalds 已提交
2282 2283 2284 2285 2286
{
	int err = 0;
	struct block_device *bdev;
	char b[BDEVNAME_SIZE];

2287
	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2288
				 shared ? (struct md_rdev *)lock_rdev : rdev);
L
Linus Torvalds 已提交
2289
	if (IS_ERR(bdev)) {
2290
		pr_warn("md: could not open %s.\n", __bdevname(dev, b));
L
Linus Torvalds 已提交
2291 2292 2293 2294 2295 2296
		return PTR_ERR(bdev);
	}
	rdev->bdev = bdev;
	return err;
}

2297
static void unlock_rdev(struct md_rdev *rdev)
L
Linus Torvalds 已提交
2298 2299 2300
{
	struct block_device *bdev = rdev->bdev;
	rdev->bdev = NULL;
2301
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
L
Linus Torvalds 已提交
2302 2303 2304 2305
}

void md_autodetect_dev(dev_t dev);

2306
static void export_rdev(struct md_rdev *rdev)
L
Linus Torvalds 已提交
2307 2308
{
	char b[BDEVNAME_SIZE];
N
NeilBrown 已提交
2309

2310
	pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
2311
	md_rdev_clear(rdev);
L
Linus Torvalds 已提交
2312
#ifndef MODULE
2313 2314
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
L
Linus Torvalds 已提交
2315 2316
#endif
	unlock_rdev(rdev);
2317
	kobject_put(&rdev->kobj);
L
Linus Torvalds 已提交
2318 2319
}

2320
void md_kick_rdev_from_array(struct md_rdev *rdev)
L
Linus Torvalds 已提交
2321 2322 2323 2324
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}
2325
EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
L
Linus Torvalds 已提交
2326

2327
static void export_array(struct mddev *mddev)
L
Linus Torvalds 已提交
2328
{
N
NeilBrown 已提交
2329
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
2330

N
NeilBrown 已提交
2331 2332 2333
	while (!list_empty(&mddev->disks)) {
		rdev = list_first_entry(&mddev->disks, struct md_rdev,
					same_set);
2334
		md_kick_rdev_from_array(rdev);
L
Linus Torvalds 已提交
2335 2336 2337 2338 2339
	}
	mddev->raid_disks = 0;
	mddev->major_version = 0;
}

N
NeilBrown 已提交
2340 2341
static bool set_in_sync(struct mddev *mddev)
{
S
Shaohua Li 已提交
2342
	lockdep_assert_held(&mddev->lock);
2343 2344 2345 2346 2347 2348 2349
	if (!mddev->in_sync) {
		mddev->sync_checkers++;
		spin_unlock(&mddev->lock);
		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
		spin_lock(&mddev->lock);
		if (!mddev->in_sync &&
		    percpu_ref_is_zero(&mddev->writes_pending)) {
N
NeilBrown 已提交
2350
			mddev->in_sync = 1;
2351 2352 2353 2354
			/*
			 * Ensure ->in_sync is visible before we clear
			 * ->sync_checkers.
			 */
2355
			smp_mb();
N
NeilBrown 已提交
2356 2357 2358
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_state);
		}
2359 2360
		if (--mddev->sync_checkers == 0)
			percpu_ref_switch_to_percpu(&mddev->writes_pending);
N
NeilBrown 已提交
2361 2362 2363 2364 2365 2366
	}
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	return mddev->in_sync;
}

2367
static void sync_sbs(struct mddev *mddev, int nospares)
L
Linus Torvalds 已提交
2368
{
2369 2370 2371 2372 2373 2374
	/* Update each superblock (in-memory image), but
	 * if we are allowed to, skip spares which already
	 * have the right event counter, or have one earlier
	 * (which would mean they aren't being marked as dirty
	 * with the rest of the array)
	 */
2375
	struct md_rdev *rdev;
N
NeilBrown 已提交
2376
	rdev_for_each(rdev, mddev) {
2377 2378 2379 2380 2381 2382 2383
		if (rdev->sb_events == mddev->events ||
		    (nospares &&
		     rdev->raid_disk < 0 &&
		     rdev->sb_events+1 == mddev->events)) {
			/* Don't update this superblock */
			rdev->sb_loaded = 2;
		} else {
2384
			sync_super(mddev, rdev);
2385 2386
			rdev->sb_loaded = 1;
		}
L
Linus Torvalds 已提交
2387 2388 2389
	}
}

2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420
static bool does_sb_need_changing(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct mdp_superblock_1 *sb;
	int role;

	/* Find a good rdev */
	rdev_for_each(rdev, mddev)
		if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
			break;

	/* No good device found. */
	if (!rdev)
		return false;

	sb = page_address(rdev->sb_page);
	/* Check if a device has become faulty or a spare become active */
	rdev_for_each(rdev, mddev) {
		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		/* Device activated? */
		if (role == 0xffff && rdev->raid_disk >=0 &&
		    !test_bit(Faulty, &rdev->flags))
			return true;
		/* Device turned faulty? */
		if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
			return true;
	}

	/* Check if any mddev parameters have changed */
	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2421
	    (mddev->layout != le32_to_cpu(sb->layout)) ||
2422 2423 2424 2425 2426 2427 2428
	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
		return true;

	return false;
}

2429
void md_update_sb(struct mddev *mddev, int force_change)
L
Linus Torvalds 已提交
2430
{
2431
	struct md_rdev *rdev;
2432
	int sync_req;
2433
	int nospares = 0;
2434
	int any_badblocks_changed = 0;
2435
	int ret = -1;
L
Linus Torvalds 已提交
2436

2437 2438
	if (mddev->ro) {
		if (force_change)
2439
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2440 2441
		return;
	}
2442

2443
repeat:
2444
	if (mddev_is_clustered(mddev)) {
2445
		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2446
			force_change = 1;
2447
		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2448
			nospares = 1;
2449
		ret = md_cluster_ops->metadata_update_start(mddev);
2450 2451
		/* Has someone else has updated the sb */
		if (!does_sb_need_changing(mddev)) {
2452 2453
			if (ret == 0)
				md_cluster_ops->metadata_update_cancel(mddev);
2454 2455 2456
			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
							 BIT(MD_SB_CHANGE_DEVS) |
							 BIT(MD_SB_CHANGE_CLEAN));
2457 2458 2459
			return;
		}
	}
2460

2461 2462 2463 2464 2465 2466
	/*
	 * First make sure individual recovery_offsets are correct
	 * curr_resync_completed can only be used during recovery.
	 * During reshape/resync it might use array-addresses rather
	 * that device addresses.
	 */
N
NeilBrown 已提交
2467
	rdev_for_each(rdev, mddev) {
2468 2469
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
2470 2471 2472
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
S
Shaohua Li 已提交
2473
		    !test_bit(Journal, &rdev->flags) &&
2474 2475 2476 2477
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
				rdev->recovery_offset = mddev->curr_resync_completed;

2478
	}
2479
	if (!mddev->persistent) {
2480 2481
		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2482
		if (!mddev->external) {
2483
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
N
NeilBrown 已提交
2484
			rdev_for_each(rdev, mddev) {
2485
				if (rdev->badblocks.changed) {
2486
					rdev->badblocks.changed = 0;
2487
					ack_all_badblocks(&rdev->badblocks);
2488 2489 2490 2491 2492 2493 2494
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
2495 2496 2497 2498
		wake_up(&mddev->sb_wait);
		return;
	}

2499
	spin_lock(&mddev->lock);
2500

2501
	mddev->utime = ktime_get_real_seconds();
2502

2503
	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2504
		force_change = 1;
2505
	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2506 2507 2508 2509 2510 2511 2512 2513
		/* just a clean<-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
2514 2515 2516 2517 2518 2519 2520 2521 2522
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have a event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
		 */
2523
		nospares = 0;
2524

2525
	sync_req = mddev->in_sync;
2526 2527 2528

	/* If this is just a dirty<->clean transition, and the array is clean
	 * and 'events' is odd, we can roll back to the previous clean state */
2529
	if (nospares
2530
	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2531 2532
	    && mddev->can_decrease_events
	    && mddev->events != 1) {
2533
		mddev->events--;
2534 2535
		mddev->can_decrease_events = 0;
	} else {
2536 2537
		/* otherwise we have to go forward and ... */
		mddev->events ++;
2538
		mddev->can_decrease_events = nospares;
2539
	}
L
Linus Torvalds 已提交
2540

N
NeilBrown 已提交
2541 2542 2543 2544 2545 2546
	/*
	 * This 64-bit counter should never wrap.
	 * Either we are in around ~1 trillion A.C., assuming
	 * 1 reboot per second, or we have a bug...
	 */
	WARN_ON(mddev->events == 0);
2547

N
NeilBrown 已提交
2548
	rdev_for_each(rdev, mddev) {
2549 2550
		if (rdev->badblocks.changed)
			any_badblocks_changed++;
2551 2552 2553
		if (test_bit(Faulty, &rdev->flags))
			set_bit(FaultRecorded, &rdev->flags);
	}
2554

2555
	sync_sbs(mddev, nospares);
2556
	spin_unlock(&mddev->lock);
L
Linus Torvalds 已提交
2557

2558 2559
	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
		 mdname(mddev), mddev->in_sync);
L
Linus Torvalds 已提交
2560

2561 2562
	if (mddev->queue)
		blk_add_trace_msg(mddev->queue, "md md_update_sb");
2563
rewrite:
2564
	bitmap_update_sb(mddev->bitmap);
N
NeilBrown 已提交
2565
	rdev_for_each(rdev, mddev) {
L
Linus Torvalds 已提交
2566
		char b[BDEVNAME_SIZE];
2567

2568 2569
		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */
L
Linus Torvalds 已提交
2570

2571
		if (!test_bit(Faulty, &rdev->flags)) {
2572
			md_super_write(mddev,rdev,
2573
				       rdev->sb_start, rdev->sb_size,
2574
				       rdev->sb_page);
2575 2576 2577
			pr_debug("md: (write) %s's sb offset: %llu\n",
				 bdevname(rdev->bdev, b),
				 (unsigned long long)rdev->sb_start);
2578
			rdev->sb_events = mddev->events;
2579 2580 2581 2582 2583 2584 2585
			if (rdev->badblocks.size) {
				md_super_write(mddev, rdev,
					       rdev->badblocks.sector,
					       rdev->badblocks.size << 9,
					       rdev->bb_page);
				rdev->badblocks.size = 0;
			}
2586

2587
		} else
2588 2589
			pr_debug("md: %s (skipping faulty)\n",
				 bdevname(rdev->bdev, b));
2590

2591
		if (mddev->level == LEVEL_MULTIPATH)
L
Linus Torvalds 已提交
2592 2593 2594
			/* only need to write one superblock... */
			break;
	}
2595 2596
	if (md_super_wait(mddev) < 0)
		goto rewrite;
2597
	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2598

2599 2600 2601
	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->metadata_update_finish(mddev);

2602
	if (mddev->in_sync != sync_req ||
2603 2604
	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2605 2606
		/* have to write it out again */
		goto repeat;
2607
	wake_up(&mddev->sb_wait);
2608 2609
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2610

N
NeilBrown 已提交
2611
	rdev_for_each(rdev, mddev) {
2612 2613 2614 2615
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
2616
			ack_all_badblocks(&rdev->badblocks);
2617 2618 2619
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
	}
L
Linus Torvalds 已提交
2620
}
2621
EXPORT_SYMBOL(md_update_sb);
L
Linus Torvalds 已提交
2622

G
Goldwyn Rodrigues 已提交
2623 2624 2625 2626
static int add_bound_rdev(struct md_rdev *rdev)
{
	struct mddev *mddev = rdev->mddev;
	int err = 0;
2627
	bool add_journal = test_bit(Journal, &rdev->flags);
G
Goldwyn Rodrigues 已提交
2628

2629
	if (!mddev->pers->hot_remove_disk || add_journal) {
G
Goldwyn Rodrigues 已提交
2630 2631 2632 2633 2634 2635
		/* If there is hot_add_disk but no hot_remove_disk
		 * then added disks for geometry changes,
		 * and should be added immediately.
		 */
		super_types[mddev->major_version].
			validate_super(mddev, rdev);
2636 2637
		if (add_journal)
			mddev_suspend(mddev);
G
Goldwyn Rodrigues 已提交
2638
		err = mddev->pers->hot_add_disk(mddev, rdev);
2639 2640
		if (add_journal)
			mddev_resume(mddev);
G
Goldwyn Rodrigues 已提交
2641
		if (err) {
2642
			md_kick_rdev_from_array(rdev);
G
Goldwyn Rodrigues 已提交
2643 2644 2645 2646 2647
			return err;
		}
	}
	sysfs_notify_dirent_safe(rdev->sysfs_state);

2648
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
G
Goldwyn Rodrigues 已提交
2649 2650 2651 2652 2653 2654 2655
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_new_event(mddev);
	md_wakeup_thread(mddev->thread);
	return 0;
}
L
Linus Torvalds 已提交
2656

2657
/* words written to sysfs files may, or may not, be \n terminated.
2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676
 * We want to accept with case. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* See if cmd, written into a sysfs file, matches
	 * str.  They must either be the same, or cmd can
	 * have a trailing newline
	 */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	if (*str || *cmd)
		return 0;
	return 1;
}

2677 2678
struct rdev_sysfs_entry {
	struct attribute attr;
2679 2680
	ssize_t (*show)(struct md_rdev *, char *);
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2681 2682 2683
};

static ssize_t
2684
state_show(struct md_rdev *rdev, char *page)
2685
{
2686
	char *sep = ",";
2687
	size_t len = 0;
2688
	unsigned long flags = READ_ONCE(rdev->flags);
2689

2690
	if (test_bit(Faulty, &flags) ||
2691 2692
	    (!test_bit(ExternalBbl, &flags) &&
	    rdev->badblocks.unacked_exist))
2693 2694 2695 2696 2697 2698 2699
		len += sprintf(page+len, "faulty%s", sep);
	if (test_bit(In_sync, &flags))
		len += sprintf(page+len, "in_sync%s", sep);
	if (test_bit(Journal, &flags))
		len += sprintf(page+len, "journal%s", sep);
	if (test_bit(WriteMostly, &flags))
		len += sprintf(page+len, "write_mostly%s", sep);
2700
	if (test_bit(Blocked, &flags) ||
2701
	    (rdev->badblocks.unacked_exist
2702 2703
	     && !test_bit(Faulty, &flags)))
		len += sprintf(page+len, "blocked%s", sep);
2704
	if (!test_bit(Faulty, &flags) &&
S
Shaohua Li 已提交
2705
	    !test_bit(Journal, &flags) &&
2706 2707 2708 2709 2710 2711 2712 2713 2714 2715
	    !test_bit(In_sync, &flags))
		len += sprintf(page+len, "spare%s", sep);
	if (test_bit(WriteErrorSeen, &flags))
		len += sprintf(page+len, "write_error%s", sep);
	if (test_bit(WantReplacement, &flags))
		len += sprintf(page+len, "want_replacement%s", sep);
	if (test_bit(Replacement, &flags))
		len += sprintf(page+len, "replacement%s", sep);
	if (test_bit(ExternalBbl, &flags))
		len += sprintf(page+len, "external_bbl%s", sep);
2716 2717
	if (test_bit(FailFast, &flags))
		len += sprintf(page+len, "failfast%s", sep);
2718 2719 2720

	if (len)
		len -= strlen(sep);
2721

2722 2723 2724
	return len+sprintf(page+len, "\n");
}

2725
static ssize_t
2726
state_store(struct md_rdev *rdev, const char *buf, size_t len)
2727 2728
{
	/* can write
2729
	 *  faulty  - simulates an error
2730
	 *  remove  - disconnects the device
2731 2732
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
2733 2734
	 *  blocked - sets the Blocked flags
	 *  -blocked - clears the Blocked and possibly simulates an error
2735
	 *  insync - sets Insync providing device isn't active
2736 2737
	 *  -insync - clear Insync for a device with a slot assigned,
	 *            so that it gets rebuilt based on bitmap
2738 2739
	 *  write_error - sets WriteErrorSeen
	 *  -write_error - clears WriteErrorSeen
2740
	 *  {,-}failfast - set/clear FailFast
2741 2742 2743 2744
	 */
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
2745 2746 2747 2748
		if (test_bit(Faulty, &rdev->flags))
			err = 0;
		else
			err = -EBUSY;
2749
	} else if (cmd_match(buf, "remove")) {
S
Shaohua Li 已提交
2750 2751 2752 2753
		if (rdev->mddev->pers) {
			clear_bit(Blocked, &rdev->flags);
			remove_and_add_spares(rdev->mddev, rdev);
		}
2754 2755 2756
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
2757
			struct mddev *mddev = rdev->mddev;
2758
			err = 0;
2759 2760 2761 2762 2763
			if (mddev_is_clustered(mddev))
				err = md_cluster_ops->remove_disk(mddev, rdev);

			if (err == 0) {
				md_kick_rdev_from_array(rdev);
2764
				if (mddev->pers) {
2765
					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2766 2767
					md_wakeup_thread(mddev->thread);
				}
2768 2769
				md_new_event(mddev);
			}
2770
		}
2771 2772 2773 2774 2775
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
		clear_bit(WriteMostly, &rdev->flags);
2776 2777 2778 2779 2780
		err = 0;
	} else if (cmd_match(buf, "blocked")) {
		set_bit(Blocked, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-blocked")) {
2781
		if (!test_bit(Faulty, &rdev->flags) &&
2782
		    !test_bit(ExternalBbl, &rdev->flags) &&
2783
		    rdev->badblocks.unacked_exist) {
2784 2785 2786 2787 2788
			/* metadata handler doesn't understand badblocks,
			 * so we need to fail the device
			 */
			md_error(rdev->mddev, rdev);
		}
2789
		clear_bit(Blocked, &rdev->flags);
2790
		clear_bit(BlockedBadBlocks, &rdev->flags);
2791 2792 2793 2794
		wake_up(&rdev->blocked_wait);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);

2795 2796 2797
		err = 0;
	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
		set_bit(In_sync, &rdev->flags);
2798
		err = 0;
2799 2800 2801 2802 2803 2804
	} else if (cmd_match(buf, "failfast")) {
		set_bit(FailFast, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-failfast")) {
		clear_bit(FailFast, &rdev->flags);
		err = 0;
S
Shaohua Li 已提交
2805 2806
	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
		   !test_bit(Journal, &rdev->flags)) {
2807 2808 2809 2810 2811 2812
		if (rdev->mddev->pers == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->saved_raid_disk = rdev->raid_disk;
			rdev->raid_disk = -1;
			err = 0;
		}
2813 2814 2815 2816 2817 2818
	} else if (cmd_match(buf, "write_error")) {
		set_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-write_error")) {
		clear_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
2819 2820 2821 2822 2823 2824
	} else if (cmd_match(buf, "want_replacement")) {
		/* Any non-spare device that is not a replacement can
		 * become want_replacement at any time, but we then need to
		 * check if recovery is needed.
		 */
		if (rdev->raid_disk >= 0 &&
S
Shaohua Li 已提交
2825
		    !test_bit(Journal, &rdev->flags) &&
2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855
		    !test_bit(Replacement, &rdev->flags))
			set_bit(WantReplacement, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
		err = 0;
	} else if (cmd_match(buf, "-want_replacement")) {
		/* Clearing 'want_replacement' is always allowed.
		 * Once replacements starts it is too late though.
		 */
		err = 0;
		clear_bit(WantReplacement, &rdev->flags);
	} else if (cmd_match(buf, "replacement")) {
		/* Can only set a device as a replacement when array has not
		 * yet been started.  Once running, replacement is automatic
		 * from spares, or by assigning 'slot'.
		 */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			set_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "-replacement")) {
		/* Similarly, can only clear Replacement before start */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			clear_bit(Replacement, &rdev->flags);
			err = 0;
		}
G
Goldwyn Rodrigues 已提交
2856 2857
	} else if (cmd_match(buf, "re-add")) {
		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868
			/* clear_bit is performed _after_ all the devices
			 * have their local Faulty bit cleared. If any writes
			 * happen in the meantime in the local node, they
			 * will land in the local bitmap, which will be synced
			 * by this node eventually
			 */
			if (!mddev_is_clustered(rdev->mddev) ||
			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
				clear_bit(Faulty, &rdev->flags);
				err = add_bound_rdev(rdev);
			}
G
Goldwyn Rodrigues 已提交
2869 2870
		} else
			err = -EBUSY;
2871 2872 2873 2874 2875 2876 2877
	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
		set_bit(ExternalBbl, &rdev->flags);
		rdev->badblocks.shift = 0;
		err = 0;
	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
		clear_bit(ExternalBbl, &rdev->flags);
		err = 0;
2878
	}
N
NeilBrown 已提交
2879 2880
	if (!err)
		sysfs_notify_dirent_safe(rdev->sysfs_state);
2881 2882
	return err ? err : len;
}
2883
static struct rdev_sysfs_entry rdev_state =
2884
__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2885

2886
static ssize_t
2887
errors_show(struct md_rdev *rdev, char *page)
2888 2889 2890 2891 2892
{
	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
}

static ssize_t
2893
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2894
{
A
Alexey Dobriyan 已提交
2895 2896 2897 2898 2899 2900 2901 2902
	unsigned int n;
	int rv;

	rv = kstrtouint(buf, 10, &n);
	if (rv < 0)
		return rv;
	atomic_set(&rdev->corrected_errors, n);
	return len;
2903 2904
}
static struct rdev_sysfs_entry rdev_errors =
2905
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2906

2907
static ssize_t
2908
slot_show(struct md_rdev *rdev, char *page)
2909
{
S
Shaohua Li 已提交
2910 2911 2912
	if (test_bit(Journal, &rdev->flags))
		return sprintf(page, "journal\n");
	else if (rdev->raid_disk < 0)
2913 2914 2915 2916 2917 2918
		return sprintf(page, "none\n");
	else
		return sprintf(page, "%d\n", rdev->raid_disk);
}

static ssize_t
2919
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2920
{
A
Alexey Dobriyan 已提交
2921
	int slot;
2922
	int err;
A
Alexey Dobriyan 已提交
2923

S
Shaohua Li 已提交
2924 2925
	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
2926 2927
	if (strncmp(buf, "none", 4)==0)
		slot = -1;
A
Alexey Dobriyan 已提交
2928 2929 2930 2931 2932
	else {
		err = kstrtouint(buf, 10, (unsigned int *)&slot);
		if (err < 0)
			return err;
	}
2933
	if (rdev->mddev->pers && slot == -1) {
2934 2935 2936 2937 2938 2939 2940 2941 2942 2943
		/* Setting 'slot' on an active array requires also
		 * updating the 'rd%d' link, and communicating
		 * with the personality with ->hot_*_disk.
		 * For now we only support removing
		 * failed/spare devices.  This normally happens automatically,
		 * but not when the metadata is externally managed.
		 */
		if (rdev->raid_disk == -1)
			return -EEXIST;
		/* personality does all needed checks */
2944
		if (rdev->mddev->pers->hot_remove_disk == NULL)
2945
			return -EINVAL;
2946 2947 2948 2949
		clear_bit(Blocked, &rdev->flags);
		remove_and_add_spares(rdev->mddev, rdev);
		if (rdev->raid_disk >= 0)
			return -EBUSY;
2950 2951
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
2952 2953
	} else if (rdev->mddev->pers) {
		/* Activating a spare .. or possibly reactivating
2954
		 * if we ever get bitmaps working here.
2955
		 */
2956
		int err;
2957 2958 2959 2960

		if (rdev->raid_disk != -1)
			return -EBUSY;

2961 2962 2963
		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
			return -EBUSY;

2964 2965 2966
		if (rdev->mddev->pers->hot_add_disk == NULL)
			return -EINVAL;

2967 2968 2969 2970
		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
			return -ENOSPC;

2971 2972 2973 2974 2975
		rdev->raid_disk = slot;
		if (test_bit(In_sync, &rdev->flags))
			rdev->saved_raid_disk = slot;
		else
			rdev->saved_raid_disk = -1;
2976
		clear_bit(In_sync, &rdev->flags);
2977
		clear_bit(Bitmap_sync, &rdev->flags);
2978 2979 2980 2981 2982 2983 2984 2985 2986
		err = rdev->mddev->pers->
			hot_add_disk(rdev->mddev, rdev);
		if (err) {
			rdev->raid_disk = -1;
			return err;
		} else
			sysfs_notify_dirent_safe(rdev->sysfs_state);
		if (sysfs_link_rdev(rdev->mddev, rdev))
			/* failure here is OK */;
2987
		/* don't wakeup anyone, leave that to userspace. */
2988
	} else {
2989 2990
		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2991 2992 2993
			return -ENOSPC;
		rdev->raid_disk = slot;
		/* assume it is working */
2994 2995
		clear_bit(Faulty, &rdev->flags);
		clear_bit(WriteMostly, &rdev->flags);
2996
		set_bit(In_sync, &rdev->flags);
N
NeilBrown 已提交
2997
		sysfs_notify_dirent_safe(rdev->sysfs_state);
2998
	}
2999 3000 3001 3002
	return len;
}

static struct rdev_sysfs_entry rdev_slot =
3003
__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3004

3005
static ssize_t
3006
offset_show(struct md_rdev *rdev, char *page)
3007
{
3008
	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3009 3010 3011
}

static ssize_t
3012
offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3013
{
3014
	unsigned long long offset;
3015
	if (kstrtoull(buf, 10, &offset) < 0)
3016
		return -EINVAL;
3017
	if (rdev->mddev->pers && rdev->raid_disk >= 0)
3018
		return -EBUSY;
3019
	if (rdev->sectors && rdev->mddev->external)
3020 3021 3022
		/* Must set offset before size, so overlap checks
		 * can be sane */
		return -EBUSY;
3023
	rdev->data_offset = offset;
3024
	rdev->new_data_offset = offset;
3025 3026 3027 3028
	return len;
}

static struct rdev_sysfs_entry rdev_offset =
3029
__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3030

3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042
static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)rdev->new_data_offset);
}

static ssize_t new_offset_store(struct md_rdev *rdev,
				const char *buf, size_t len)
{
	unsigned long long new_offset;
	struct mddev *mddev = rdev->mddev;

3043
	if (kstrtoull(buf, 10, &new_offset) < 0)
3044 3045
		return -EINVAL;

3046 3047
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088
		return -EBUSY;
	if (new_offset == rdev->data_offset)
		/* reset is always permitted */
		;
	else if (new_offset > rdev->data_offset) {
		/* must not push array size beyond rdev_sectors */
		if (new_offset - rdev->data_offset
		    + mddev->dev_sectors > rdev->sectors)
				return -E2BIG;
	}
	/* Metadata worries about other space details. */

	/* decreasing the offset is inconsistent with a backwards
	 * reshape.
	 */
	if (new_offset < rdev->data_offset &&
	    mddev->reshape_backwards)
		return -EINVAL;
	/* Increasing offset is inconsistent with forwards
	 * reshape.  reshape_direction should be set to
	 * 'backwards' first.
	 */
	if (new_offset > rdev->data_offset &&
	    !mddev->reshape_backwards)
		return -EINVAL;

	if (mddev->pers && mddev->persistent &&
	    !super_types[mddev->major_version]
	    .allow_new_offset(rdev, new_offset))
		return -E2BIG;
	rdev->new_data_offset = new_offset;
	if (new_offset > rdev->data_offset)
		mddev->reshape_backwards = 1;
	else if (new_offset < rdev->data_offset)
		mddev->reshape_backwards = 0;

	return len;
}
static struct rdev_sysfs_entry rdev_new_offset =
__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);

3089
static ssize_t
3090
rdev_size_show(struct md_rdev *rdev, char *page)
3091
{
3092
	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3093 3094
}

3095 3096 3097 3098 3099 3100 3101 3102 3103 3104
static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
{
	/* check if two start/length pairs overlap */
	if (s1+l1 <= s2)
		return 0;
	if (s2+l2 <= s1)
		return 0;
	return 1;
}

D
Dan Williams 已提交
3105 3106 3107 3108 3109
static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
{
	unsigned long long blocks;
	sector_t new;

3110
	if (kstrtoull(buf, 10, &blocks) < 0)
D
Dan Williams 已提交
3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123
		return -EINVAL;

	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
		return -EINVAL; /* sector conversion overflow */

	new = blocks * 2;
	if (new != blocks * 2)
		return -EINVAL; /* unsigned long long to sector_t overflow */

	*sectors = new;
	return 0;
}

3124
static ssize_t
3125
rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3126
{
3127
	struct mddev *my_mddev = rdev->mddev;
3128
	sector_t oldsectors = rdev->sectors;
D
Dan Williams 已提交
3129
	sector_t sectors;
3130

S
Shaohua Li 已提交
3131 3132
	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
D
Dan Williams 已提交
3133
	if (strict_blocks_to_sectors(buf, &sectors) < 0)
N
Neil Brown 已提交
3134
		return -EINVAL;
3135 3136
	if (rdev->data_offset != rdev->new_data_offset)
		return -EINVAL; /* too confusing */
3137
	if (my_mddev->pers && rdev->raid_disk >= 0) {
N
Neil Brown 已提交
3138
		if (my_mddev->persistent) {
3139 3140 3141
			sectors = super_types[my_mddev->major_version].
				rdev_size_change(rdev, sectors);
			if (!sectors)
3142
				return -EBUSY;
3143
		} else if (!sectors)
3144
			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3145
				rdev->data_offset;
3146 3147 3148
		if (!my_mddev->pers->resize)
			/* Cannot change size for RAID0 or Linear etc */
			return -EINVAL;
3149
	}
3150
	if (sectors < my_mddev->dev_sectors)
3151
		return -EINVAL; /* component must fit device */
3152

3153 3154
	rdev->sectors = sectors;
	if (sectors > oldsectors && my_mddev->external) {
3155 3156 3157 3158 3159
		/* Need to check that all other rdevs with the same
		 * ->bdev do not overlap.  'rcu' is sufficient to walk
		 * the rdev lists safely.
		 * This check does not provide a hard guarantee, it
		 * just helps avoid dangerous mistakes.
3160
		 */
3161
		struct mddev *mddev;
3162
		int overlap = 0;
3163
		struct list_head *tmp;
3164

3165
		rcu_read_lock();
3166
		for_each_mddev(mddev, tmp) {
3167
			struct md_rdev *rdev2;
3168

N
NeilBrown 已提交
3169
			rdev_for_each(rdev2, mddev)
3170 3171 3172 3173 3174
				if (rdev->bdev == rdev2->bdev &&
				    rdev != rdev2 &&
				    overlaps(rdev->data_offset, rdev->sectors,
					     rdev2->data_offset,
					     rdev2->sectors)) {
3175 3176 3177 3178 3179 3180 3181 3182
					overlap = 1;
					break;
				}
			if (overlap) {
				mddev_put(mddev);
				break;
			}
		}
3183
		rcu_read_unlock();
3184 3185 3186
		if (overlap) {
			/* Someone else could have slipped in a size
			 * change here, but doing so is just silly.
3187
			 * We put oldsectors back because we *know* it is
3188 3189 3190
			 * safe, and trust userspace not to race with
			 * itself
			 */
3191
			rdev->sectors = oldsectors;
3192 3193 3194
			return -EBUSY;
		}
	}
3195 3196 3197 3198
	return len;
}

static struct rdev_sysfs_entry rdev_size =
3199
__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3200

3201
static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3202 3203 3204 3205 3206 3207 3208 3209 3210 3211
{
	unsigned long long recovery_start = rdev->recovery_offset;

	if (test_bit(In_sync, &rdev->flags) ||
	    recovery_start == MaxSector)
		return sprintf(page, "none\n");

	return sprintf(page, "%llu\n", recovery_start);
}

3212
static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3213 3214 3215 3216 3217
{
	unsigned long long recovery_start;

	if (cmd_match(buf, "none"))
		recovery_start = MaxSector;
3218
	else if (kstrtoull(buf, 10, &recovery_start))
3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235
		return -EINVAL;

	if (rdev->mddev->pers &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	rdev->recovery_offset = recovery_start;
	if (recovery_start == MaxSector)
		set_bit(In_sync, &rdev->flags);
	else
		clear_bit(In_sync, &rdev->flags);
	return len;
}

static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);

3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246
/* sysfs access to bad-blocks list.
 * We present two files.
 * 'bad-blocks' lists sector numbers and lengths of ranges that
 *    are recorded as bad.  The list is truncated to fit within
 *    the one-page limit of sysfs.
 *    Writing "sector length" to this file adds an acknowledged
 *    bad block list.
 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
 *    been acknowledged.  Writing to this file adds bad blocks
 *    without acknowledging them.  This is largely for testing.
 */
3247
static ssize_t bb_show(struct md_rdev *rdev, char *page)
3248 3249 3250
{
	return badblocks_show(&rdev->badblocks, page, 0);
}
3251
static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3252
{
3253 3254 3255 3256 3257
	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
	/* Maybe that ack was all we needed */
	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
		wake_up(&rdev->blocked_wait);
	return rv;
3258 3259 3260 3261
}
static struct rdev_sysfs_entry rdev_bad_blocks =
__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);

3262
static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3263 3264 3265
{
	return badblocks_show(&rdev->badblocks, page, 1);
}
3266
static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3267 3268 3269 3270 3271 3272
{
	return badblocks_store(&rdev->badblocks, page, len, 1);
}
static struct rdev_sysfs_entry rdev_unack_bad_blocks =
__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);

3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344
static ssize_t
ppl_sector_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
}

static ssize_t
ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned long long sector;

	if (kstrtoull(buf, 10, &sector) < 0)
		return -EINVAL;
	if (sector != (sector_t)sector)
		return -EINVAL;

	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	if (rdev->mddev->persistent) {
		if (rdev->mddev->major_version == 0)
			return -EINVAL;
		if ((sector > rdev->sb_start &&
		     sector - rdev->sb_start > S16_MAX) ||
		    (sector < rdev->sb_start &&
		     rdev->sb_start - sector > -S16_MIN))
			return -EINVAL;
		rdev->ppl.offset = sector - rdev->sb_start;
	} else if (!rdev->mddev->external) {
		return -EBUSY;
	}
	rdev->ppl.sector = sector;
	return len;
}

static struct rdev_sysfs_entry rdev_ppl_sector =
__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);

static ssize_t
ppl_size_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%u\n", rdev->ppl.size);
}

static ssize_t
ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned int size;

	if (kstrtouint(buf, 10, &size) < 0)
		return -EINVAL;

	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	if (rdev->mddev->persistent) {
		if (rdev->mddev->major_version == 0)
			return -EINVAL;
		if (size > U16_MAX)
			return -EINVAL;
	} else if (!rdev->mddev->external) {
		return -EBUSY;
	}
	rdev->ppl.size = size;
	return len;
}

static struct rdev_sysfs_entry rdev_ppl_size =
__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);

3345 3346
static struct attribute *rdev_default_attrs[] = {
	&rdev_state.attr,
3347
	&rdev_errors.attr,
3348
	&rdev_slot.attr,
3349
	&rdev_offset.attr,
3350
	&rdev_new_offset.attr,
3351
	&rdev_size.attr,
3352
	&rdev_recovery_start.attr,
3353 3354
	&rdev_bad_blocks.attr,
	&rdev_unack_bad_blocks.attr,
3355 3356
	&rdev_ppl_sector.attr,
	&rdev_ppl_size.attr,
3357 3358 3359 3360 3361 3362
	NULL,
};
static ssize_t
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3363
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3364 3365 3366

	if (!entry->show)
		return -EIO;
3367 3368 3369
	if (!rdev->mddev)
		return -EBUSY;
	return entry->show(rdev, page);
3370 3371 3372 3373 3374 3375 3376
}

static ssize_t
rdev_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3377
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3378
	ssize_t rv;
3379
	struct mddev *mddev = rdev->mddev;
3380 3381 3382

	if (!entry->store)
		return -EIO;
3383 3384
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
3385
	rv = mddev ? mddev_lock(mddev): -EBUSY;
3386
	if (!rv) {
3387 3388 3389 3390
		if (rdev->mddev == NULL)
			rv = -EBUSY;
		else
			rv = entry->store(rdev, page, length);
3391
		mddev_unlock(mddev);
3392 3393
	}
	return rv;
3394 3395 3396 3397
}

static void rdev_free(struct kobject *ko)
{
3398
	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3399 3400
	kfree(rdev);
}
3401
static const struct sysfs_ops rdev_sysfs_ops = {
3402 3403 3404 3405 3406 3407 3408 3409 3410
	.show		= rdev_attr_show,
	.store		= rdev_attr_store,
};
static struct kobj_type rdev_ktype = {
	.release	= rdev_free,
	.sysfs_ops	= &rdev_sysfs_ops,
	.default_attrs	= rdev_default_attrs,
};

3411
int md_rdev_init(struct md_rdev *rdev)
N
NeilBrown 已提交
3412 3413 3414 3415 3416 3417
{
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	rdev->raid_disk = -1;
	rdev->flags = 0;
	rdev->data_offset = 0;
3418
	rdev->new_data_offset = 0;
N
NeilBrown 已提交
3419
	rdev->sb_events = 0;
3420
	rdev->last_read_error = 0;
3421 3422
	rdev->sb_loaded = 0;
	rdev->bb_page = NULL;
N
NeilBrown 已提交
3423 3424 3425 3426 3427 3428
	atomic_set(&rdev->nr_pending, 0);
	atomic_set(&rdev->read_errors, 0);
	atomic_set(&rdev->corrected_errors, 0);

	INIT_LIST_HEAD(&rdev->same_set);
	init_waitqueue_head(&rdev->blocked_wait);
3429 3430 3431 3432 3433

	/* Add space to store bad block list.
	 * This reserves the space even on arrays where it cannot
	 * be used - I wonder if that matters
	 */
3434
	return badblocks_init(&rdev->badblocks, 0);
N
NeilBrown 已提交
3435 3436
}
EXPORT_SYMBOL_GPL(md_rdev_init);
L
Linus Torvalds 已提交
3437 3438 3439 3440 3441 3442 3443 3444 3445 3446
/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
3447
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
L
Linus Torvalds 已提交
3448 3449 3450
{
	char b[BDEVNAME_SIZE];
	int err;
3451
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
3452 3453
	sector_t size;

3454
	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3455
	if (!rdev)
L
Linus Torvalds 已提交
3456 3457
		return ERR_PTR(-ENOMEM);

3458 3459 3460 3461 3462
	err = md_rdev_init(rdev);
	if (err)
		goto abort_free;
	err = alloc_disk_sb(rdev);
	if (err)
L
Linus Torvalds 已提交
3463 3464
		goto abort_free;

3465
	err = lock_rdev(rdev, newdev, super_format == -2);
L
Linus Torvalds 已提交
3466 3467 3468
	if (err)
		goto abort_free;

3469
	kobject_init(&rdev->kobj, &rdev_ktype);
3470

3471
	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
L
Linus Torvalds 已提交
3472
	if (!size) {
3473
		pr_warn("md: %s has zero or unknown size, marking faulty!\n",
L
Linus Torvalds 已提交
3474 3475 3476 3477 3478 3479 3480 3481 3482
			bdevname(rdev->bdev,b));
		err = -EINVAL;
		goto abort_free;
	}

	if (super_format >= 0) {
		err = super_types[super_format].
			load_super(rdev, NULL, super_minor);
		if (err == -EINVAL) {
3483
			pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
3484
				bdevname(rdev->bdev,b),
3485
				super_format, super_minor);
L
Linus Torvalds 已提交
3486 3487 3488
			goto abort_free;
		}
		if (err < 0) {
3489
			pr_warn("md: could not read %s's sb, not importing!\n",
L
Linus Torvalds 已提交
3490 3491 3492 3493
				bdevname(rdev->bdev,b));
			goto abort_free;
		}
	}
3494

L
Linus Torvalds 已提交
3495 3496 3497
	return rdev;

abort_free:
3498 3499
	if (rdev->bdev)
		unlock_rdev(rdev);
3500
	md_rdev_clear(rdev);
L
Linus Torvalds 已提交
3501 3502 3503 3504 3505 3506 3507 3508
	kfree(rdev);
	return ERR_PTR(err);
}

/*
 * Check a full RAID array for plausibility
 */

3509
static void analyze_sbs(struct mddev *mddev)
L
Linus Torvalds 已提交
3510 3511
{
	int i;
3512
	struct md_rdev *rdev, *freshest, *tmp;
L
Linus Torvalds 已提交
3513 3514 3515
	char b[BDEVNAME_SIZE];

	freshest = NULL;
N
NeilBrown 已提交
3516
	rdev_for_each_safe(rdev, tmp, mddev)
L
Linus Torvalds 已提交
3517 3518 3519 3520 3521 3522 3523 3524
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
3525
			pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
L
Linus Torvalds 已提交
3526
				bdevname(rdev->bdev,b));
3527
			md_kick_rdev_from_array(rdev);
L
Linus Torvalds 已提交
3528 3529 3530 3531 3532 3533
		}

	super_types[mddev->major_version].
		validate_super(mddev, freshest);

	i = 0;
N
NeilBrown 已提交
3534
	rdev_for_each_safe(rdev, tmp, mddev) {
3535 3536 3537
		if (mddev->max_disks &&
		    (rdev->desc_nr >= mddev->max_disks ||
		     i > mddev->max_disks)) {
3538 3539 3540
			pr_warn("md: %s: %s: only %d devices permitted\n",
				mdname(mddev), bdevname(rdev->bdev, b),
				mddev->max_disks);
3541
			md_kick_rdev_from_array(rdev);
3542 3543
			continue;
		}
3544
		if (rdev != freshest) {
L
Linus Torvalds 已提交
3545 3546
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
3547
				pr_warn("md: kicking non-fresh %s from array!\n",
L
Linus Torvalds 已提交
3548
					bdevname(rdev->bdev,b));
3549
				md_kick_rdev_from_array(rdev);
L
Linus Torvalds 已提交
3550 3551
				continue;
			}
3552
		}
L
Linus Torvalds 已提交
3553 3554 3555
		if (mddev->level == LEVEL_MULTIPATH) {
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
3556
			set_bit(In_sync, &rdev->flags);
S
Shaohua Li 已提交
3557 3558 3559
		} else if (rdev->raid_disk >=
			    (mddev->raid_disks - min(0, mddev->delta_disks)) &&
			   !test_bit(Journal, &rdev->flags)) {
3560 3561
			rdev->raid_disk = -1;
			clear_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
3562 3563 3564 3565
		}
	}
}

3566 3567 3568
/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
3569
 * However we internally use a a much smaller unit such as
3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale'.
 * all without any floating-point arithmetic.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;
	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {
			unsigned int value;
			value = *cp - '0';
			result = result * 10 + value;
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;
	if (decimals < 0)
		decimals = 0;
	while (decimals < scale) {
		result *= 10;
		decimals ++;
	}
	*res = result;
	return 0;
}

3606
static ssize_t
3607
safe_delay_show(struct mddev *mddev, char *page)
3608 3609 3610 3611 3612
{
	int msec = (mddev->safemode_delay*1000)/HZ;
	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
}
static ssize_t
3613
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3614 3615
{
	unsigned long msec;
3616

3617
	if (mddev_is_clustered(mddev)) {
3618
		pr_warn("md: Safemode is disabled for clustered mode\n");
3619 3620 3621
		return -EINVAL;
	}

3622
	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3623 3624 3625 3626
		return -EINVAL;
	if (msec == 0)
		mddev->safemode_delay = 0;
	else {
3627
		unsigned long old_delay = mddev->safemode_delay;
3628 3629 3630 3631 3632 3633 3634
		unsigned long new_delay = (msec*HZ)/1000;

		if (new_delay == 0)
			new_delay = 1;
		mddev->safemode_delay = new_delay;
		if (new_delay < old_delay || old_delay == 0)
			mod_timer(&mddev->safemode_timer, jiffies+1);
3635 3636 3637 3638
	}
	return len;
}
static struct md_sysfs_entry md_safe_delay =
3639
__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3640

3641
static ssize_t
3642
level_show(struct mddev *mddev, char *page)
3643
{
3644 3645 3646 3647
	struct md_personality *p;
	int ret;
	spin_lock(&mddev->lock);
	p = mddev->pers;
3648
	if (p)
3649
		ret = sprintf(page, "%s\n", p->name);
3650
	else if (mddev->clevel[0])
3651
		ret = sprintf(page, "%s\n", mddev->clevel);
3652
	else if (mddev->level != LEVEL_NONE)
3653
		ret = sprintf(page, "%d\n", mddev->level);
3654
	else
3655 3656 3657
		ret = 0;
	spin_unlock(&mddev->lock);
	return ret;
3658 3659
}

3660
static ssize_t
3661
level_store(struct mddev *mddev, const char *buf, size_t len)
3662
{
3663
	char clevel[16];
3664 3665
	ssize_t rv;
	size_t slen = len;
3666
	struct md_personality *pers, *oldpers;
3667
	long level;
3668
	void *priv, *oldpriv;
3669
	struct md_rdev *rdev;
3670

3671 3672 3673 3674 3675 3676 3677
	if (slen == 0 || slen >= sizeof(clevel))
		return -EINVAL;

	rv = mddev_lock(mddev);
	if (rv)
		return rv;

3678
	if (mddev->pers == NULL) {
3679 3680 3681 3682
		strncpy(mddev->clevel, buf, slen);
		if (mddev->clevel[slen-1] == '\n')
			slen--;
		mddev->clevel[slen] = 0;
3683
		mddev->level = LEVEL_NONE;
3684 3685
		rv = len;
		goto out_unlock;
3686
	}
3687
	rv = -EROFS;
3688
	if (mddev->ro)
3689
		goto out_unlock;
3690 3691 3692 3693 3694 3695 3696

	/* request to change the personality.  Need to ensure:
	 *  - array is not engaged in resync/recovery/reshape
	 *  - old personality can be suspended
	 *  - new personality will access other array.
	 */

3697
	rv = -EBUSY;
3698
	if (mddev->sync_thread ||
3699
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3700 3701
	    mddev->reshape_position != MaxSector ||
	    mddev->sysfs_active)
3702
		goto out_unlock;
3703

3704
	rv = -EINVAL;
3705
	if (!mddev->pers->quiesce) {
3706 3707
		pr_warn("md: %s: %s does not support online personality change\n",
			mdname(mddev), mddev->pers->name);
3708
		goto out_unlock;
3709 3710 3711
	}

	/* Now find the new personality */
3712 3713 3714 3715
	strncpy(clevel, buf, slen);
	if (clevel[slen-1] == '\n')
		slen--;
	clevel[slen] = 0;
3716
	if (kstrtol(clevel, 10, &level))
3717
		level = LEVEL_NONE;
3718

3719 3720
	if (request_module("md-%s", clevel) != 0)
		request_module("md-level-%s", clevel);
3721
	spin_lock(&pers_lock);
3722
	pers = find_pers(level, clevel);
3723 3724
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
3725
		pr_warn("md: personality %s not loaded\n", clevel);
3726 3727
		rv = -EINVAL;
		goto out_unlock;
3728 3729 3730 3731 3732 3733
	}
	spin_unlock(&pers_lock);

	if (pers == mddev->pers) {
		/* Nothing to do! */
		module_put(pers->owner);
3734 3735
		rv = len;
		goto out_unlock;
3736 3737 3738
	}
	if (!pers->takeover) {
		module_put(pers->owner);
3739 3740
		pr_warn("md: %s: %s does not support personality takeover\n",
			mdname(mddev), clevel);
3741 3742
		rv = -EINVAL;
		goto out_unlock;
3743 3744
	}

N
NeilBrown 已提交
3745
	rdev_for_each(rdev, mddev)
3746 3747
		rdev->new_raid_disk = rdev->raid_disk;

3748 3749 3750 3751 3752 3753 3754
	/* ->takeover must set new_* and/or delta_disks
	 * if it succeeds, and may set them when it fails.
	 */
	priv = pers->takeover(mddev);
	if (IS_ERR(priv)) {
		mddev->new_level = mddev->level;
		mddev->new_layout = mddev->layout;
3755
		mddev->new_chunk_sectors = mddev->chunk_sectors;
3756 3757
		mddev->raid_disks -= mddev->delta_disks;
		mddev->delta_disks = 0;
3758
		mddev->reshape_backwards = 0;
3759
		module_put(pers->owner);
3760 3761
		pr_warn("md: %s: %s would not accept array\n",
			mdname(mddev), clevel);
3762 3763
		rv = PTR_ERR(priv);
		goto out_unlock;
3764 3765 3766 3767
	}

	/* Looks like we have a winner */
	mddev_suspend(mddev);
3768
	mddev_detach(mddev);
3769 3770

	spin_lock(&mddev->lock);
3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781
	oldpers = mddev->pers;
	oldpriv = mddev->private;
	mddev->pers = pers;
	mddev->private = priv;
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->degraded = 0;
3782
	spin_unlock(&mddev->lock);
3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796

	if (oldpers->sync_request == NULL &&
	    mddev->external) {
		/* We are converting from a no-redundancy array
		 * to a redundancy array and metadata is managed
		 * externally so we need to be sure that writes
		 * won't block due to a need to transition
		 *      clean->dirty
		 * until external management is started.
		 */
		mddev->in_sync = 0;
		mddev->safemode_delay = 0;
		mddev->safemode = 0;
	}
3797

3798 3799 3800
	oldpers->free(mddev, oldpriv);

	if (oldpers->sync_request == NULL &&
3801 3802 3803
	    pers->sync_request != NULL) {
		/* need to add the md_redundancy_group */
		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3804 3805
			pr_warn("md: cannot register extra attributes for %s\n",
				mdname(mddev));
T
Tejun Heo 已提交
3806
		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3807
	}
3808
	if (oldpers->sync_request != NULL &&
3809 3810 3811 3812 3813 3814
	    pers->sync_request == NULL) {
		/* need to remove the md_redundancy_group */
		if (mddev->to_remove == NULL)
			mddev->to_remove = &md_redundancy_group;
	}

3815 3816
	module_put(oldpers->owner);

N
NeilBrown 已提交
3817
	rdev_for_each(rdev, mddev) {
3818 3819
		if (rdev->raid_disk < 0)
			continue;
3820
		if (rdev->new_raid_disk >= mddev->raid_disks)
3821 3822 3823
			rdev->new_raid_disk = -1;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
3824
		sysfs_unlink_rdev(mddev, rdev);
3825
	}
N
NeilBrown 已提交
3826
	rdev_for_each(rdev, mddev) {
3827 3828 3829 3830 3831 3832
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		rdev->raid_disk = rdev->new_raid_disk;
		if (rdev->raid_disk < 0)
3833
			clear_bit(In_sync, &rdev->flags);
3834
		else {
3835
			if (sysfs_link_rdev(mddev, rdev))
3836 3837
				pr_warn("md: cannot register rd%d for %s after level change\n",
					rdev->raid_disk, mdname(mddev));
3838
		}
3839 3840
	}

3841
	if (pers->sync_request == NULL) {
3842 3843 3844 3845 3846 3847
		/* this is now an array without redundancy, so
		 * it must always be in_sync
		 */
		mddev->in_sync = 1;
		del_timer_sync(&mddev->safemode_timer);
	}
3848
	blk_set_stacking_limits(&mddev->queue->limits);
3849
	pers->run(mddev);
3850
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3851
	mddev_resume(mddev);
3852 3853
	if (!mddev->thread)
		md_update_sb(mddev, 1);
3854
	sysfs_notify(&mddev->kobj, NULL, "level");
3855
	md_new_event(mddev);
3856 3857 3858
	rv = len;
out_unlock:
	mddev_unlock(mddev);
3859 3860 3861 3862
	return rv;
}

static struct md_sysfs_entry md_level =
3863
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3864

3865
static ssize_t
3866
layout_show(struct mddev *mddev, char *page)
3867 3868
{
	/* just a number, not meaningful for all levels */
3869 3870 3871 3872
	if (mddev->reshape_position != MaxSector &&
	    mddev->layout != mddev->new_layout)
		return sprintf(page, "%d (%d)\n",
			       mddev->new_layout, mddev->layout);
3873 3874 3875 3876
	return sprintf(page, "%d\n", mddev->layout);
}

static ssize_t
3877
layout_store(struct mddev *mddev, const char *buf, size_t len)
3878
{
A
Alexey Dobriyan 已提交
3879
	unsigned int n;
3880
	int err;
3881

A
Alexey Dobriyan 已提交
3882 3883 3884
	err = kstrtouint(buf, 10, &n);
	if (err < 0)
		return err;
3885 3886 3887
	err = mddev_lock(mddev);
	if (err)
		return err;
3888

3889
	if (mddev->pers) {
3890
		if (mddev->pers->check_reshape == NULL)
3891 3892 3893 3894 3895 3896 3897 3898
			err = -EBUSY;
		else if (mddev->ro)
			err = -EROFS;
		else {
			mddev->new_layout = n;
			err = mddev->pers->check_reshape(mddev);
			if (err)
				mddev->new_layout = mddev->layout;
3899
		}
3900
	} else {
3901
		mddev->new_layout = n;
3902 3903 3904
		if (mddev->reshape_position == MaxSector)
			mddev->layout = n;
	}
3905 3906
	mddev_unlock(mddev);
	return err ?: len;
3907 3908
}
static struct md_sysfs_entry md_layout =
3909
__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3910

3911
static ssize_t
3912
raid_disks_show(struct mddev *mddev, char *page)
3913
{
3914 3915
	if (mddev->raid_disks == 0)
		return 0;
3916 3917 3918 3919
	if (mddev->reshape_position != MaxSector &&
	    mddev->delta_disks != 0)
		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
			       mddev->raid_disks - mddev->delta_disks);
3920 3921 3922
	return sprintf(page, "%d\n", mddev->raid_disks);
}

3923
static int update_raid_disks(struct mddev *mddev, int raid_disks);
3924 3925

static ssize_t
3926
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3927
{
A
Alexey Dobriyan 已提交
3928
	unsigned int n;
3929
	int err;
3930

A
Alexey Dobriyan 已提交
3931 3932 3933
	err = kstrtouint(buf, 10, &n);
	if (err < 0)
		return err;
3934

3935 3936 3937
	err = mddev_lock(mddev);
	if (err)
		return err;
3938
	if (mddev->pers)
3939
		err = update_raid_disks(mddev, n);
3940
	else if (mddev->reshape_position != MaxSector) {
3941
		struct md_rdev *rdev;
3942
		int olddisks = mddev->raid_disks - mddev->delta_disks;
3943

3944
		err = -EINVAL;
3945 3946 3947
		rdev_for_each(rdev, mddev) {
			if (olddisks < n &&
			    rdev->data_offset < rdev->new_data_offset)
3948
				goto out_unlock;
3949 3950
			if (olddisks > n &&
			    rdev->data_offset > rdev->new_data_offset)
3951
				goto out_unlock;
3952
		}
3953
		err = 0;
3954 3955
		mddev->delta_disks = n - olddisks;
		mddev->raid_disks = n;
3956
		mddev->reshape_backwards = (mddev->delta_disks < 0);
3957
	} else
3958
		mddev->raid_disks = n;
3959 3960 3961
out_unlock:
	mddev_unlock(mddev);
	return err ? err : len;
3962 3963
}
static struct md_sysfs_entry md_raid_disks =
3964
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3965

3966
static ssize_t
3967
chunk_size_show(struct mddev *mddev, char *page)
3968
{
3969
	if (mddev->reshape_position != MaxSector &&
3970 3971 3972
	    mddev->chunk_sectors != mddev->new_chunk_sectors)
		return sprintf(page, "%d (%d)\n",
			       mddev->new_chunk_sectors << 9,
3973 3974
			       mddev->chunk_sectors << 9);
	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3975 3976 3977
}

static ssize_t
3978
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3979
{
A
Alexey Dobriyan 已提交
3980
	unsigned long n;
3981
	int err;
3982

A
Alexey Dobriyan 已提交
3983 3984 3985
	err = kstrtoul(buf, 10, &n);
	if (err < 0)
		return err;
3986

3987 3988 3989
	err = mddev_lock(mddev);
	if (err)
		return err;
3990
	if (mddev->pers) {
3991
		if (mddev->pers->check_reshape == NULL)
3992 3993 3994 3995 3996 3997 3998 3999
			err = -EBUSY;
		else if (mddev->ro)
			err = -EROFS;
		else {
			mddev->new_chunk_sectors = n >> 9;
			err = mddev->pers->check_reshape(mddev);
			if (err)
				mddev->new_chunk_sectors = mddev->chunk_sectors;
4000
		}
4001
	} else {
4002
		mddev->new_chunk_sectors = n >> 9;
4003
		if (mddev->reshape_position == MaxSector)
4004
			mddev->chunk_sectors = n >> 9;
4005
	}
4006 4007
	mddev_unlock(mddev);
	return err ?: len;
4008 4009
}
static struct md_sysfs_entry md_chunk_size =
4010
__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4011

4012
static ssize_t
4013
resync_start_show(struct mddev *mddev, char *page)
4014
{
4015 4016
	if (mddev->recovery_cp == MaxSector)
		return sprintf(page, "none\n");
4017 4018 4019 4020
	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
}

static ssize_t
4021
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4022
{
A
Alexey Dobriyan 已提交
4023
	unsigned long long n;
4024
	int err;
A
Alexey Dobriyan 已提交
4025 4026 4027 4028 4029 4030 4031 4032 4033 4034

	if (cmd_match(buf, "none"))
		n = MaxSector;
	else {
		err = kstrtoull(buf, 10, &n);
		if (err < 0)
			return err;
		if (n != (sector_t)n)
			return -EINVAL;
	}
4035

4036 4037 4038
	err = mddev_lock(mddev);
	if (err)
		return err;
4039
	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4040
		err = -EBUSY;
4041

4042 4043 4044
	if (!err) {
		mddev->recovery_cp = n;
		if (mddev->pers)
4045
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4046 4047 4048
	}
	mddev_unlock(mddev);
	return err ?: len;
4049 4050
}
static struct md_sysfs_entry md_resync_start =
4051 4052
__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
		resync_start_show, resync_start_store);
4053

4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065
/*
 * The array state can be:
 *
 * clear
 *     No devices, no size, no level
 *     Equivalent to STOP_ARRAY ioctl
 * inactive
 *     May have some settings, but array is not active
 *        all IO results in error
 *     When written, doesn't tear down array, but just stops it
 * suspended (not supported yet)
 *     All IO requests will block. The array can be reconfigured.
4066
 *     Writing this, if accepted, will block until array is quiescent
4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091
 * readonly
 *     no resync can happen.  no superblocks get written.
 *     write requests fail
 * read-auto
 *     like readonly, but behaves like 'clean' on a write request.
 *
 * clean - no pending writes, but otherwise active.
 *     When written to inactive array, starts without resync
 *     If a write request arrives then
 *       if metadata is known, mark 'dirty' and switch to 'active'.
 *       if not known, block and switch to write-pending
 *     If written to an active array that has pending writes, then fails.
 * active
 *     fully active: IO and resync can be happening.
 *     When written to inactive array, starts with resync
 *
 * write-pending
 *     clean, but writes are blocked waiting for 'active' to be written.
 *
 * active-idle
 *     like active, but no writes have been seen for a while (100msec).
 *
 */
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
		   write_pending, active_idle, bad_word};
4092
static char *array_states[] = {
4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105
	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
	"write-pending", "active-idle", NULL };

static int match_word(const char *word, char **list)
{
	int n;
	for (n=0; list[n]; n++)
		if (cmd_match(word, list[n]))
			break;
	return n;
}

static ssize_t
4106
array_state_show(struct mddev *mddev, char *page)
4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118
{
	enum array_state st = inactive;

	if (mddev->pers)
		switch(mddev->ro) {
		case 1:
			st = readonly;
			break;
		case 2:
			st = read_auto;
			break;
		case 0:
4119
			spin_lock(&mddev->lock);
4120
			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4121
				st = write_pending;
4122 4123
			else if (mddev->in_sync)
				st = clean;
4124 4125 4126 4127
			else if (mddev->safemode)
				st = active_idle;
			else
				st = active;
4128
			spin_unlock(&mddev->lock);
4129 4130 4131 4132
		}
	else {
		if (list_empty(&mddev->disks) &&
		    mddev->raid_disks == 0 &&
A
Andre Noll 已提交
4133
		    mddev->dev_sectors == 0)
4134 4135 4136 4137 4138 4139 4140
			st = clear;
		else
			st = inactive;
	}
	return sprintf(page, "%s\n", array_states[st]);
}

4141 4142 4143
static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
static int do_md_run(struct mddev *mddev);
4144
static int restart_array(struct mddev *mddev);
4145 4146

static ssize_t
4147
array_state_store(struct mddev *mddev, const char *buf, size_t len)
4148
{
N
NeilBrown 已提交
4149
	int err = 0;
4150
	enum array_state st = match_word(buf, array_states);
4151 4152 4153 4154 4155 4156 4157 4158

	if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
		/* don't take reconfig_mutex when toggling between
		 * clean and active
		 */
		spin_lock(&mddev->lock);
		if (st == active) {
			restart_array(mddev);
4159
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4160
			md_wakeup_thread(mddev->thread);
4161 4162 4163
			wake_up(&mddev->sb_wait);
		} else /* st == clean */ {
			restart_array(mddev);
N
NeilBrown 已提交
4164
			if (!set_in_sync(mddev))
4165 4166
				err = -EBUSY;
		}
4167 4168
		if (!err)
			sysfs_notify_dirent_safe(mddev->sysfs_state);
4169
		spin_unlock(&mddev->lock);
4170
		return err ?: len;
4171 4172 4173 4174 4175
	}
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
4176 4177 4178 4179 4180
	switch(st) {
	case bad_word:
		break;
	case clear:
		/* stopping an active array */
4181
		err = do_md_stop(mddev, 0, NULL);
4182 4183 4184
		break;
	case inactive:
		/* stopping an active array */
4185
		if (mddev->pers)
4186
			err = do_md_stop(mddev, 2, NULL);
4187
		else
4188
			err = 0; /* already inactive */
4189 4190 4191 4192 4193
		break;
	case suspended:
		break; /* not supported yet */
	case readonly:
		if (mddev->pers)
4194
			err = md_set_readonly(mddev, NULL);
4195 4196
		else {
			mddev->ro = 1;
4197
			set_disk_ro(mddev->gendisk, 1);
4198 4199 4200 4201 4202
			err = do_md_run(mddev);
		}
		break;
	case read_auto:
		if (mddev->pers) {
4203
			if (mddev->ro == 0)
4204
				err = md_set_readonly(mddev, NULL);
4205
			else if (mddev->ro == 1)
4206 4207 4208 4209 4210
				err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
			}
4211 4212 4213 4214 4215 4216 4217
		} else {
			mddev->ro = 2;
			err = do_md_run(mddev);
		}
		break;
	case clean:
		if (mddev->pers) {
4218 4219 4220
			err = restart_array(mddev);
			if (err)
				break;
4221
			spin_lock(&mddev->lock);
N
NeilBrown 已提交
4222
			if (!set_in_sync(mddev))
4223
				err = -EBUSY;
4224
			spin_unlock(&mddev->lock);
4225 4226
		} else
			err = -EINVAL;
4227 4228 4229
		break;
	case active:
		if (mddev->pers) {
4230 4231 4232
			err = restart_array(mddev);
			if (err)
				break;
4233
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4234 4235 4236 4237
			wake_up(&mddev->sb_wait);
			err = 0;
		} else {
			mddev->ro = 0;
4238
			set_disk_ro(mddev->gendisk, 0);
4239 4240 4241 4242 4243 4244 4245 4246
			err = do_md_run(mddev);
		}
		break;
	case write_pending:
	case active_idle:
		/* these cannot be set */
		break;
	}
4247 4248

	if (!err) {
4249 4250
		if (mddev->hold_active == UNTIL_IOCTL)
			mddev->hold_active = 0;
N
NeilBrown 已提交
4251
		sysfs_notify_dirent_safe(mddev->sysfs_state);
4252
	}
4253 4254
	mddev_unlock(mddev);
	return err ?: len;
4255
}
4256
static struct md_sysfs_entry md_array_state =
4257
__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4258

4259
static ssize_t
4260
max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4261 4262 4263 4264 4265
	return sprintf(page, "%d\n",
		       atomic_read(&mddev->max_corr_read_errors));
}

static ssize_t
4266
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4267
{
A
Alexey Dobriyan 已提交
4268 4269
	unsigned int n;
	int rv;
4270

A
Alexey Dobriyan 已提交
4271 4272 4273 4274 4275
	rv = kstrtouint(buf, 10, &n);
	if (rv < 0)
		return rv;
	atomic_set(&mddev->max_corr_read_errors, n);
	return len;
4276 4277 4278 4279 4280 4281
}

static struct md_sysfs_entry max_corr_read_errors =
__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
	max_corrected_read_errors_store);

4282
static ssize_t
4283
null_show(struct mddev *mddev, char *page)
4284 4285 4286 4287 4288
{
	return -EINVAL;
}

static ssize_t
4289
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301
{
	/* buf must be %d:%d\n? giving major and minor numbers */
	/* The new device is added to the array.
	 * If the array has a persistent superblock, we read the
	 * superblock to initialise info and check validity.
	 * Otherwise, only checking done is that in bind_rdev_to_array,
	 * which mainly checks size.
	 */
	char *e;
	int major = simple_strtoul(buf, &e, 10);
	int minor;
	dev_t dev;
4302
	struct md_rdev *rdev;
4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314
	int err;

	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
		return -EINVAL;
	minor = simple_strtoul(e+1, &e, 10);
	if (*e && *e != '\n')
		return -EINVAL;
	dev = MKDEV(major, minor);
	if (major != MAJOR(dev) ||
	    minor != MINOR(dev))
		return -EOVERFLOW;

4315 4316 4317 4318 4319
	flush_workqueue(md_misc_wq);

	err = mddev_lock(mddev);
	if (err)
		return err;
4320 4321 4322 4323
	if (mddev->persistent) {
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4324 4325 4326
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
4327 4328 4329 4330 4331
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0)
				goto out;
		}
4332 4333 4334
	} else if (mddev->external)
		rdev = md_import_device(dev, -2, -1);
	else
4335 4336
		rdev = md_import_device(dev, -1, -1);

4337 4338
	if (IS_ERR(rdev)) {
		mddev_unlock(mddev);
4339
		return PTR_ERR(rdev);
4340
	}
4341 4342 4343 4344
	err = bind_rdev_to_array(rdev, mddev);
 out:
	if (err)
		export_rdev(rdev);
4345
	mddev_unlock(mddev);
4346 4347
	if (!err)
		md_new_event(mddev);
4348 4349 4350 4351
	return err ? err : len;
}

static struct md_sysfs_entry md_new_device =
4352
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4353

4354
static ssize_t
4355
bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4356 4357 4358
{
	char *end;
	unsigned long chunk, end_chunk;
4359
	int err;
4360

4361 4362 4363
	err = mddev_lock(mddev);
	if (err)
		return err;
4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376
	if (!mddev->bitmap)
		goto out;
	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
	while (*buf) {
		chunk = end_chunk = simple_strtoul(buf, &end, 0);
		if (buf == end) break;
		if (*end == '-') { /* range */
			buf = end + 1;
			end_chunk = simple_strtoul(buf, &end, 0);
			if (buf == end) break;
		}
		if (*end && !isspace(*end)) break;
		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4377
		buf = skip_spaces(end);
4378 4379 4380
	}
	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
4381
	mddev_unlock(mddev);
4382 4383 4384 4385 4386 4387
	return len;
}

static struct md_sysfs_entry md_bitmap =
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);

4388
static ssize_t
4389
size_show(struct mddev *mddev, char *page)
4390
{
A
Andre Noll 已提交
4391 4392
	return sprintf(page, "%llu\n",
		(unsigned long long)mddev->dev_sectors / 2);
4393 4394
}

4395
static int update_size(struct mddev *mddev, sector_t num_sectors);
4396 4397

static ssize_t
4398
size_store(struct mddev *mddev, const char *buf, size_t len)
4399 4400 4401 4402 4403
{
	/* If array is inactive, we can reduce the component size, but
	 * not increase it (except from 0).
	 * If array is active, we can try an on-line resize
	 */
D
Dan Williams 已提交
4404 4405
	sector_t sectors;
	int err = strict_blocks_to_sectors(buf, &sectors);
4406

A
Andre Noll 已提交
4407 4408
	if (err < 0)
		return err;
4409 4410 4411
	err = mddev_lock(mddev);
	if (err)
		return err;
4412
	if (mddev->pers) {
A
Andre Noll 已提交
4413
		err = update_size(mddev, sectors);
4414 4415
		if (err == 0)
			md_update_sb(mddev, 1);
4416
	} else {
A
Andre Noll 已提交
4417 4418 4419
		if (mddev->dev_sectors == 0 ||
		    mddev->dev_sectors > sectors)
			mddev->dev_sectors = sectors;
4420 4421 4422
		else
			err = -ENOSPC;
	}
4423
	mddev_unlock(mddev);
4424 4425 4426 4427
	return err ? err : len;
}

static struct md_sysfs_entry md_size =
4428
__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4429

M
Masanari Iida 已提交
4430
/* Metadata version.
4431 4432 4433
 * This is one of
 *   'none' for arrays with no metadata (good luck...)
 *   'external' for arrays with externally managed metadata,
4434 4435 4436
 * or N.M for internally known formats
 */
static ssize_t
4437
metadata_show(struct mddev *mddev, char *page)
4438 4439 4440 4441
{
	if (mddev->persistent)
		return sprintf(page, "%d.%d\n",
			       mddev->major_version, mddev->minor_version);
4442 4443
	else if (mddev->external)
		return sprintf(page, "external:%s\n", mddev->metadata_type);
4444 4445 4446 4447 4448
	else
		return sprintf(page, "none\n");
}

static ssize_t
4449
metadata_store(struct mddev *mddev, const char *buf, size_t len)
4450 4451 4452
{
	int major, minor;
	char *e;
4453
	int err;
4454 4455 4456 4457
	/* Changing the details of 'external' metadata is
	 * always permitted.  Otherwise there must be
	 * no devices attached to the array.
	 */
4458 4459 4460 4461 4462

	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
4463 4464 4465
	if (mddev->external && strncmp(buf, "external:", 9) == 0)
		;
	else if (!list_empty(&mddev->disks))
4466
		goto out_unlock;
4467

4468
	err = 0;
4469 4470
	if (cmd_match(buf, "none")) {
		mddev->persistent = 0;
4471 4472 4473
		mddev->external = 0;
		mddev->major_version = 0;
		mddev->minor_version = 90;
4474
		goto out_unlock;
4475 4476
	}
	if (strncmp(buf, "external:", 9) == 0) {
4477
		size_t namelen = len-9;
4478 4479 4480 4481 4482 4483 4484 4485
		if (namelen >= sizeof(mddev->metadata_type))
			namelen = sizeof(mddev->metadata_type)-1;
		strncpy(mddev->metadata_type, buf+9, namelen);
		mddev->metadata_type[namelen] = 0;
		if (namelen && mddev->metadata_type[namelen-1] == '\n')
			mddev->metadata_type[--namelen] = 0;
		mddev->persistent = 0;
		mddev->external = 1;
4486 4487
		mddev->major_version = 0;
		mddev->minor_version = 90;
4488
		goto out_unlock;
4489 4490
	}
	major = simple_strtoul(buf, &e, 10);
4491
	err = -EINVAL;
4492
	if (e==buf || *e != '.')
4493
		goto out_unlock;
4494 4495
	buf = e+1;
	minor = simple_strtoul(buf, &e, 10);
4496
	if (e==buf || (*e && *e != '\n') )
4497 4498
		goto out_unlock;
	err = -ENOENT;
4499
	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4500
		goto out_unlock;
4501 4502 4503
	mddev->major_version = major;
	mddev->minor_version = minor;
	mddev->persistent = 1;
4504
	mddev->external = 0;
4505 4506 4507 4508
	err = 0;
out_unlock:
	mddev_unlock(mddev);
	return err ?: len;
4509 4510 4511
}

static struct md_sysfs_entry md_metadata =
4512
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4513

4514
static ssize_t
4515
action_show(struct mddev *mddev, char *page)
4516
{
4517
	char *type = "idle";
4518 4519
	unsigned long recovery = mddev->recovery;
	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4520
		type = "frozen";
4521 4522 4523
	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4524
			type = "reshape";
4525 4526
		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4527
				type = "resync";
4528
			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4529 4530 4531
				type = "check";
			else
				type = "repair";
4532
		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4533
			type = "recover";
4534 4535
		else if (mddev->reshape_position != MaxSector)
			type = "reshape";
4536 4537 4538 4539 4540
	}
	return sprintf(page, "%s\n", type);
}

static ssize_t
4541
action_store(struct mddev *mddev, const char *page, size_t len)
4542
{
4543 4544 4545
	if (!mddev->pers || !mddev->pers->sync_request)
		return -EINVAL;

4546 4547

	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4548 4549 4550 4551
		if (cmd_match(page, "frozen"))
			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		else
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4552 4553 4554 4555 4556
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    mddev_lock(mddev) == 0) {
			flush_workqueue(md_misc_wq);
			if (mddev->sync_thread) {
				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4557 4558
				md_reap_sync_thread(mddev);
			}
4559
			mddev_unlock(mddev);
4560
		}
4561
	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4562
		return -EBUSY;
4563
	else if (cmd_match(page, "resync"))
4564
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4565
	else if (cmd_match(page, "recover")) {
4566
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4567 4568
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	} else if (cmd_match(page, "reshape")) {
4569 4570 4571
		int err;
		if (mddev->pers->start_reshape == NULL)
			return -EINVAL;
4572 4573
		err = mddev_lock(mddev);
		if (!err) {
4574 4575 4576 4577 4578 4579
			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
				err =  -EBUSY;
			else {
				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
				err = mddev->pers->start_reshape(mddev);
			}
4580 4581
			mddev_unlock(mddev);
		}
4582 4583
		if (err)
			return err;
4584
		sysfs_notify(&mddev->kobj, NULL, "degraded");
4585
	} else {
4586
		if (cmd_match(page, "check"))
4587
			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4588
		else if (!cmd_match(page, "repair"))
4589
			return -EINVAL;
4590
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4591 4592 4593
		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	}
4594 4595 4596 4597 4598 4599 4600
	if (mddev->ro == 2) {
		/* A write to sync_action is enough to justify
		 * canceling read-auto mode
		 */
		mddev->ro = 0;
		md_wakeup_thread(mddev->sync_thread);
	}
4601
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4602
	md_wakeup_thread(mddev->thread);
N
NeilBrown 已提交
4603
	sysfs_notify_dirent_safe(mddev->sysfs_action);
4604 4605 4606
	return len;
}

4607
static struct md_sysfs_entry md_scan_mode =
4608
__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4609 4610 4611 4612 4613 4614 4615 4616 4617

static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n", mddev->last_sync_action);
}

static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);

4618
static ssize_t
4619
mismatch_cnt_show(struct mddev *mddev, char *page)
4620 4621
{
	return sprintf(page, "%llu\n",
4622 4623
		       (unsigned long long)
		       atomic64_read(&mddev->resync_mismatches));
4624 4625
}

4626
static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4627

4628
static ssize_t
4629
sync_min_show(struct mddev *mddev, char *page)
4630 4631 4632 4633 4634 4635
{
	return sprintf(page, "%d (%s)\n", speed_min(mddev),
		       mddev->sync_speed_min ? "local": "system");
}

static ssize_t
4636
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4637
{
A
Alexey Dobriyan 已提交
4638 4639 4640
	unsigned int min;
	int rv;

4641
	if (strncmp(buf, "system", 6)==0) {
A
Alexey Dobriyan 已提交
4642 4643 4644 4645 4646 4647 4648
		min = 0;
	} else {
		rv = kstrtouint(buf, 10, &min);
		if (rv < 0)
			return rv;
		if (min == 0)
			return -EINVAL;
4649 4650 4651 4652 4653 4654 4655 4656 4657
	}
	mddev->sync_speed_min = min;
	return len;
}

static struct md_sysfs_entry md_sync_min =
__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);

static ssize_t
4658
sync_max_show(struct mddev *mddev, char *page)
4659 4660 4661 4662 4663 4664
{
	return sprintf(page, "%d (%s)\n", speed_max(mddev),
		       mddev->sync_speed_max ? "local": "system");
}

static ssize_t
4665
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4666
{
A
Alexey Dobriyan 已提交
4667 4668 4669
	unsigned int max;
	int rv;

4670
	if (strncmp(buf, "system", 6)==0) {
A
Alexey Dobriyan 已提交
4671 4672 4673 4674 4675 4676 4677
		max = 0;
	} else {
		rv = kstrtouint(buf, 10, &max);
		if (rv < 0)
			return rv;
		if (max == 0)
			return -EINVAL;
4678 4679 4680 4681 4682 4683 4684 4685
	}
	mddev->sync_speed_max = max;
	return len;
}

static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);

4686
static ssize_t
4687
degraded_show(struct mddev *mddev, char *page)
4688 4689 4690 4691
{
	return sprintf(page, "%d\n", mddev->degraded);
}
static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4692

4693
static ssize_t
4694
sync_force_parallel_show(struct mddev *mddev, char *page)
4695 4696 4697 4698 4699
{
	return sprintf(page, "%d\n", mddev->parallel_resync);
}

static ssize_t
4700
sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4701 4702 4703
{
	long n;

4704
	if (kstrtol(buf, 10, &n))
4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722
		return -EINVAL;

	if (n != 0 && n != 1)
		return -EINVAL;

	mddev->parallel_resync = n;

	if (mddev->sync_thread)
		wake_up(&resync_wait);

	return len;
}

/* force parallel resync, even with shared block devices */
static struct md_sysfs_entry md_sync_force_parallel =
__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
       sync_force_parallel_show, sync_force_parallel_store);

4723
static ssize_t
4724
sync_speed_show(struct mddev *mddev, char *page)
4725 4726
{
	unsigned long resync, dt, db;
4727 4728
	if (mddev->curr_resync == 0)
		return sprintf(page, "none\n");
4729 4730
	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
	dt = (jiffies - mddev->resync_mark) / HZ;
4731
	if (!dt) dt++;
4732 4733
	db = resync - mddev->resync_mark_cnt;
	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4734 4735
}

4736
static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4737 4738

static ssize_t
4739
sync_completed_show(struct mddev *mddev, char *page)
4740
{
4741
	unsigned long long max_sectors, resync;
4742

4743 4744 4745
	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return sprintf(page, "none\n");

4746 4747 4748 4749
	if (mddev->curr_resync == 1 ||
	    mddev->curr_resync == 2)
		return sprintf(page, "delayed\n");

4750 4751
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
A
Andre Noll 已提交
4752
		max_sectors = mddev->resync_max_sectors;
4753
	else
A
Andre Noll 已提交
4754
		max_sectors = mddev->dev_sectors;
4755

4756
	resync = mddev->curr_resync_completed;
4757
	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4758 4759
}

4760 4761
static struct md_sysfs_entry md_sync_completed =
	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
4762

4763
static ssize_t
4764
min_sync_show(struct mddev *mddev, char *page)
4765 4766 4767 4768 4769
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)mddev->resync_min);
}
static ssize_t
4770
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4771 4772
{
	unsigned long long min;
4773 4774
	int err;

4775
	if (kstrtoull(buf, 10, &min))
4776
		return -EINVAL;
4777 4778 4779

	spin_lock(&mddev->lock);
	err = -EINVAL;
4780
	if (min > mddev->resync_max)
4781 4782 4783
		goto out_unlock;

	err = -EBUSY;
4784
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4785
		goto out_unlock;
4786

4787 4788
	/* Round down to multiple of 4K for safety */
	mddev->resync_min = round_down(min, 8);
4789
	err = 0;
4790

4791 4792 4793
out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
4794 4795 4796 4797 4798
}

static struct md_sysfs_entry md_min_sync =
__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);

4799
static ssize_t
4800
max_sync_show(struct mddev *mddev, char *page)
4801 4802 4803 4804 4805 4806 4807 4808
{
	if (mddev->resync_max == MaxSector)
		return sprintf(page, "max\n");
	else
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->resync_max);
}
static ssize_t
4809
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4810
{
4811 4812
	int err;
	spin_lock(&mddev->lock);
4813 4814 4815
	if (strncmp(buf, "max", 3) == 0)
		mddev->resync_max = MaxSector;
	else {
4816
		unsigned long long max;
4817 4818 4819
		int chunk;

		err = -EINVAL;
4820
		if (kstrtoull(buf, 10, &max))
4821
			goto out_unlock;
4822
		if (max < mddev->resync_min)
4823 4824 4825
			goto out_unlock;

		err = -EBUSY;
4826
		if (max < mddev->resync_max &&
4827
		    mddev->ro == 0 &&
4828
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4829
			goto out_unlock;
4830 4831

		/* Must be a multiple of chunk_size */
4832 4833
		chunk = mddev->chunk_sectors;
		if (chunk) {
4834
			sector_t temp = max;
4835 4836 4837 4838

			err = -EINVAL;
			if (sector_div(temp, chunk))
				goto out_unlock;
4839 4840 4841 4842
		}
		mddev->resync_max = max;
	}
	wake_up(&mddev->recovery_wait);
4843 4844 4845 4846
	err = 0;
out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
4847 4848 4849 4850 4851
}

static struct md_sysfs_entry md_max_sync =
__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);

4852
static ssize_t
4853
suspend_lo_show(struct mddev *mddev, char *page)
4854 4855 4856 4857 4858
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
}

static ssize_t
4859
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4860
{
4861
	unsigned long long new;
4862
	int err;
4863

A
Alexey Dobriyan 已提交
4864 4865 4866 4867
	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
4868
		return -EINVAL;
4869

4870 4871 4872 4873 4874 4875 4876
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
	if (mddev->pers == NULL ||
	    mddev->pers->quiesce == NULL)
		goto unlock;
4877
	mddev_suspend(mddev);
4878
	mddev->suspend_lo = new;
4879 4880
	mddev_resume(mddev);

4881 4882 4883 4884
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
4885 4886 4887 4888 4889
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);

static ssize_t
4890
suspend_hi_show(struct mddev *mddev, char *page)
4891 4892 4893 4894 4895
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
}

static ssize_t
4896
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4897
{
4898
	unsigned long long new;
4899
	int err;
4900

A
Alexey Dobriyan 已提交
4901 4902 4903 4904
	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
4905
		return -EINVAL;
4906

4907 4908 4909 4910
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
4911
	if (mddev->pers == NULL)
4912
		goto unlock;
4913 4914

	mddev_suspend(mddev);
4915
	mddev->suspend_hi = new;
4916 4917
	mddev_resume(mddev);

4918 4919 4920 4921
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
4922 4923 4924 4925
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);

4926
static ssize_t
4927
reshape_position_show(struct mddev *mddev, char *page)
4928 4929 4930 4931 4932 4933 4934 4935 4936
{
	if (mddev->reshape_position != MaxSector)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->reshape_position);
	strcpy(page, "none\n");
	return 5;
}

static ssize_t
4937
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4938
{
4939
	struct md_rdev *rdev;
A
Alexey Dobriyan 已提交
4940
	unsigned long long new;
4941 4942
	int err;

A
Alexey Dobriyan 已提交
4943 4944 4945 4946
	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
4947
		return -EINVAL;
4948 4949 4950 4951 4952 4953
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
	if (mddev->pers)
		goto unlock;
4954 4955
	mddev->reshape_position = new;
	mddev->delta_disks = 0;
4956
	mddev->reshape_backwards = 0;
4957 4958
	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
4959
	mddev->new_chunk_sectors = mddev->chunk_sectors;
4960 4961
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
4962 4963 4964 4965
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
4966 4967 4968 4969 4970 4971
}

static struct md_sysfs_entry md_reshape_position =
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
       reshape_position_store);

4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982
static ssize_t
reshape_direction_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n",
		       mddev->reshape_backwards ? "backwards" : "forwards");
}

static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
	int backwards = 0;
4983 4984
	int err;

4985 4986 4987 4988 4989 4990 4991 4992 4993
	if (cmd_match(buf, "forwards"))
		backwards = 0;
	else if (cmd_match(buf, "backwards"))
		backwards = 1;
	else
		return -EINVAL;
	if (mddev->reshape_backwards == backwards)
		return len;

4994 4995 4996
	err = mddev_lock(mddev);
	if (err)
		return err;
4997 4998
	/* check if we are allowed to change */
	if (mddev->delta_disks)
4999 5000
		err = -EBUSY;
	else if (mddev->persistent &&
5001
	    mddev->major_version == 0)
5002 5003 5004 5005 5006
		err =  -EINVAL;
	else
		mddev->reshape_backwards = backwards;
	mddev_unlock(mddev);
	return err ?: len;
5007 5008 5009 5010 5011 5012
}

static struct md_sysfs_entry md_reshape_direction =
__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
       reshape_direction_store);

D
Dan Williams 已提交
5013
static ssize_t
5014
array_size_show(struct mddev *mddev, char *page)
D
Dan Williams 已提交
5015 5016 5017 5018 5019 5020 5021 5022 5023
{
	if (mddev->external_size)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->array_sectors/2);
	else
		return sprintf(page, "default\n");
}

static ssize_t
5024
array_size_store(struct mddev *mddev, const char *buf, size_t len)
D
Dan Williams 已提交
5025 5026
{
	sector_t sectors;
5027 5028 5029 5030 5031
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;
D
Dan Williams 已提交
5032

5033
	/* cluster raid doesn't support change array_sectors */
5034 5035
	if (mddev_is_clustered(mddev)) {
		mddev_unlock(mddev);
5036
		return -EINVAL;
5037
	}
5038

D
Dan Williams 已提交
5039 5040 5041 5042 5043 5044 5045 5046 5047
	if (strncmp(buf, "default", 7) == 0) {
		if (mddev->pers)
			sectors = mddev->pers->size(mddev, 0, 0);
		else
			sectors = mddev->array_sectors;

		mddev->external_size = 0;
	} else {
		if (strict_blocks_to_sectors(buf, &sectors) < 0)
5048 5049 5050 5051 5052
			err = -EINVAL;
		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
			err = -E2BIG;
		else
			mddev->external_size = 1;
D
Dan Williams 已提交
5053 5054
	}

5055 5056 5057 5058 5059 5060
	if (!err) {
		mddev->array_sectors = sectors;
		if (mddev->pers) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
5061
	}
5062 5063
	mddev_unlock(mddev);
	return err ?: len;
D
Dan Williams 已提交
5064 5065 5066 5067 5068
}

static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
       array_size_store);
5069

5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095
static ssize_t
consistency_policy_show(struct mddev *mddev, char *page)
{
	int ret;

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		ret = sprintf(page, "journal\n");
	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
		ret = sprintf(page, "ppl\n");
	} else if (mddev->bitmap) {
		ret = sprintf(page, "bitmap\n");
	} else if (mddev->pers) {
		if (mddev->pers->sync_request)
			ret = sprintf(page, "resync\n");
		else
			ret = sprintf(page, "none\n");
	} else {
		ret = sprintf(page, "unknown\n");
	}

	return ret;
}

static ssize_t
consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
{
5096 5097
	int err = 0;

5098
	if (mddev->pers) {
5099 5100 5101 5102
		if (mddev->pers->change_consistency_policy)
			err = mddev->pers->change_consistency_policy(mddev, buf);
		else
			err = -EBUSY;
5103 5104 5105
	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
		set_bit(MD_HAS_PPL, &mddev->flags);
	} else {
5106
		err = -EINVAL;
5107
	}
5108 5109

	return err ? err : len;
5110 5111 5112 5113 5114 5115
}

static struct md_sysfs_entry md_consistency_policy =
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
       consistency_policy_store);

5116 5117
static struct attribute *md_default_attrs[] = {
	&md_level.attr,
5118
	&md_layout.attr,
5119
	&md_raid_disks.attr,
5120
	&md_chunk_size.attr,
5121
	&md_size.attr,
5122
	&md_resync_start.attr,
5123
	&md_metadata.attr,
5124
	&md_new_device.attr,
5125
	&md_safe_delay.attr,
5126
	&md_array_state.attr,
5127
	&md_reshape_position.attr,
5128
	&md_reshape_direction.attr,
D
Dan Williams 已提交
5129
	&md_array_size.attr,
5130
	&max_corr_read_errors.attr,
5131
	&md_consistency_policy.attr,
5132 5133 5134 5135
	NULL,
};

static struct attribute *md_redundancy_attrs[] = {
5136
	&md_scan_mode.attr,
5137
	&md_last_scan_mode.attr,
5138
	&md_mismatches.attr,
5139 5140 5141
	&md_sync_min.attr,
	&md_sync_max.attr,
	&md_sync_speed.attr,
5142
	&md_sync_force_parallel.attr,
5143
	&md_sync_completed.attr,
5144
	&md_min_sync.attr,
5145
	&md_max_sync.attr,
5146 5147
	&md_suspend_lo.attr,
	&md_suspend_hi.attr,
5148
	&md_bitmap.attr,
5149
	&md_degraded.attr,
5150 5151
	NULL,
};
5152 5153 5154 5155 5156
static struct attribute_group md_redundancy_group = {
	.name = NULL,
	.attrs = md_redundancy_attrs,
};

5157 5158 5159 5160
static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5161
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5162
	ssize_t rv;
5163 5164 5165

	if (!entry->show)
		return -EIO;
5166 5167 5168 5169 5170 5171 5172 5173
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);

5174
	rv = entry->show(mddev, page);
5175
	mddev_put(mddev);
5176
	return rv;
5177 5178 5179 5180 5181 5182 5183
}

static ssize_t
md_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5184
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5185
	ssize_t rv;
5186 5187 5188

	if (!entry->store)
		return -EIO;
5189 5190
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
5191 5192 5193 5194 5195 5196 5197
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);
5198
	rv = entry->store(mddev, page, length);
5199
	mddev_put(mddev);
5200
	return rv;
5201 5202 5203 5204
}

static void md_free(struct kobject *ko)
{
5205
	struct mddev *mddev = container_of(ko, struct mddev, kobj);
5206 5207 5208 5209

	if (mddev->sysfs_state)
		sysfs_put(mddev->sysfs_state);

5210 5211
	if (mddev->gendisk)
		del_gendisk(mddev->gendisk);
5212 5213
	if (mddev->queue)
		blk_cleanup_queue(mddev->queue);
5214
	if (mddev->gendisk)
5215
		put_disk(mddev->gendisk);
5216
	percpu_ref_exit(&mddev->writes_pending);
5217

5218 5219 5220
	kfree(mddev);
}

5221
static const struct sysfs_ops md_sysfs_ops = {
5222 5223 5224 5225 5226 5227 5228 5229 5230
	.show	= md_attr_show,
	.store	= md_attr_store,
};
static struct kobj_type md_ktype = {
	.release	= md_free,
	.sysfs_ops	= &md_sysfs_ops,
	.default_attrs	= md_default_attrs,
};

L
Linus Torvalds 已提交
5231 5232
int mdp_major = 0;

5233 5234
static void mddev_delayed_delete(struct work_struct *ws)
{
5235
	struct mddev *mddev = container_of(ws, struct mddev, del_work);
5236

5237
	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5238 5239 5240 5241
	kobject_del(&mddev->kobj);
	kobject_put(&mddev->kobj);
}

5242 5243
static void no_op(struct percpu_ref *r) {}

5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255
int mddev_init_writes_pending(struct mddev *mddev)
{
	if (mddev->writes_pending.percpu_count_ptr)
		return 0;
	if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
		return -ENOMEM;
	/* We want to start with the refcount at zero */
	percpu_ref_put(&mddev->writes_pending);
	return 0;
}
EXPORT_SYMBOL_GPL(mddev_init_writes_pending);

5256
static int md_alloc(dev_t dev, char *name)
L
Linus Torvalds 已提交
5257
{
5258 5259 5260 5261 5262 5263 5264 5265 5266
	/*
	 * If dev is zero, name is the name of a device to allocate with
	 * an arbitrary minor number.  It will be "md_???"
	 * If dev is non-zero it must be a device number with a MAJOR of
	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
	 * the device is being created by opening a node in /dev.
	 * If "name" is not NULL, the device is being created by
	 * writing to /sys/module/md_mod/parameters/new_array.
	 */
A
Arjan van de Ven 已提交
5267
	static DEFINE_MUTEX(disks_mutex);
5268
	struct mddev *mddev = mddev_find(dev);
L
Linus Torvalds 已提交
5269
	struct gendisk *disk;
5270 5271 5272
	int partitioned;
	int shift;
	int unit;
5273
	int error;
L
Linus Torvalds 已提交
5274 5275

	if (!mddev)
5276 5277 5278 5279 5280
		return -ENODEV;

	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
	shift = partitioned ? MdpMinorShift : 0;
	unit = MINOR(mddev->unit) >> shift;
L
Linus Torvalds 已提交
5281

T
Tejun Heo 已提交
5282 5283
	/* wait for any previous instance of this device to be
	 * completely removed (mddev_delayed_delete).
5284
	 */
T
Tejun Heo 已提交
5285
	flush_workqueue(md_misc_wq);
5286

A
Arjan van de Ven 已提交
5287
	mutex_lock(&disks_mutex);
N
NeilBrown 已提交
5288 5289 5290
	error = -EEXIST;
	if (mddev->gendisk)
		goto abort;
5291

5292
	if (name && !dev) {
5293 5294
		/* Need to ensure that 'name' is not a duplicate.
		 */
5295
		struct mddev *mddev2;
5296 5297 5298 5299 5300 5301
		spin_lock(&all_mddevs_lock);

		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
			if (mddev2->gendisk &&
			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
				spin_unlock(&all_mddevs_lock);
N
NeilBrown 已提交
5302
				goto abort;
5303 5304
			}
		spin_unlock(&all_mddevs_lock);
L
Linus Torvalds 已提交
5305
	}
5306 5307 5308 5309 5310
	if (name && dev)
		/*
		 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
		 */
		mddev->hold_active = UNTIL_STOP;
5311

N
NeilBrown 已提交
5312
	error = -ENOMEM;
5313
	mddev->queue = blk_alloc_queue(GFP_KERNEL);
N
NeilBrown 已提交
5314 5315
	if (!mddev->queue)
		goto abort;
5316 5317 5318
	mddev->queue->queuedata = mddev;

	blk_queue_make_request(mddev->queue, md_make_request);
5319
	blk_set_stacking_limits(&mddev->queue->limits);
5320

L
Linus Torvalds 已提交
5321 5322
	disk = alloc_disk(1 << shift);
	if (!disk) {
5323 5324
		blk_cleanup_queue(mddev->queue);
		mddev->queue = NULL;
N
NeilBrown 已提交
5325
		goto abort;
L
Linus Torvalds 已提交
5326
	}
5327
	disk->major = MAJOR(mddev->unit);
L
Linus Torvalds 已提交
5328
	disk->first_minor = unit << shift;
5329 5330 5331
	if (name)
		strcpy(disk->disk_name, name);
	else if (partitioned)
L
Linus Torvalds 已提交
5332
		sprintf(disk->disk_name, "md_d%d", unit);
5333
	else
L
Linus Torvalds 已提交
5334 5335 5336 5337
		sprintf(disk->disk_name, "md%d", unit);
	disk->fops = &md_fops;
	disk->private_data = mddev;
	disk->queue = mddev->queue;
5338
	blk_queue_write_cache(mddev->queue, true, true);
5339
	/* Allow extended partitions.  This makes the
5340
	 * 'mdp' device redundant, but we can't really
5341 5342 5343
	 * remove it now.
	 */
	disk->flags |= GENHD_FL_EXT_DEVT;
L
Linus Torvalds 已提交
5344
	mddev->gendisk = disk;
5345 5346 5347 5348 5349 5350
	/* As soon as we call add_disk(), another thread could get
	 * through to md_open, so make sure it doesn't get too far
	 */
	mutex_lock(&mddev->open_mutex);
	add_disk(disk);

5351 5352
	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
				     &disk_to_dev(disk)->kobj, "%s", "md");
N
NeilBrown 已提交
5353 5354 5355 5356
	if (error) {
		/* This isn't possible, but as kobject_init_and_add is marked
		 * __must_check, we must do something with the result
		 */
5357 5358
		pr_debug("md: cannot register %s/md - name in use\n",
			 disk->disk_name);
N
NeilBrown 已提交
5359 5360
		error = 0;
	}
N
NeilBrown 已提交
5361 5362
	if (mddev->kobj.sd &&
	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
5363
		pr_debug("pointless warning\n");
5364
	mutex_unlock(&mddev->open_mutex);
N
NeilBrown 已提交
5365 5366
 abort:
	mutex_unlock(&disks_mutex);
N
NeilBrown 已提交
5367
	if (!error && mddev->kobj.sd) {
5368
		kobject_uevent(&mddev->kobj, KOBJ_ADD);
N
NeilBrown 已提交
5369
		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5370
	}
5371
	mddev_put(mddev);
N
NeilBrown 已提交
5372
	return error;
5373 5374 5375 5376
}

static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
5377 5378
	if (create_on_open)
		md_alloc(dev, NULL);
L
Linus Torvalds 已提交
5379 5380 5381
	return NULL;
}

5382
static int add_named_array(const char *val, const struct kernel_param *kp)
5383
{
5384 5385 5386
	/*
	 * val must be "md_*" or "mdNNN".
	 * For "md_*" we allocate an array with a large free minor number, and
5387
	 * set the name to val.  val must not already be an active name.
5388 5389
	 * For "mdNNN" we allocate an array with the minor number NNN
	 * which must not already be in use.
5390 5391 5392
	 */
	int len = strlen(val);
	char buf[DISK_NAME_LEN];
5393
	unsigned long devnum;
5394 5395 5396 5397 5398 5399

	while (len && val[len-1] == '\n')
		len--;
	if (len >= DISK_NAME_LEN)
		return -E2BIG;
	strlcpy(buf, val, len+1);
5400 5401 5402 5403 5404 5405 5406 5407 5408
	if (strncmp(buf, "md_", 3) == 0)
		return md_alloc(0, buf);
	if (strncmp(buf, "md", 2) == 0 &&
	    isdigit(buf[2]) &&
	    kstrtoul(buf+2, 10, &devnum) == 0 &&
	    devnum <= MINORMASK)
		return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);

	return -EINVAL;
5409 5410
}

5411
static void md_safemode_timeout(struct timer_list *t)
L
Linus Torvalds 已提交
5412
{
5413
	struct mddev *mddev = from_timer(mddev, t, safemode_timer);
L
Linus Torvalds 已提交
5414

5415 5416 5417 5418
	mddev->safemode = 1;
	if (mddev->external)
		sysfs_notify_dirent_safe(mddev->sysfs_state);

L
Linus Torvalds 已提交
5419 5420 5421
	md_wakeup_thread(mddev->thread);
}

5422
static int start_dirty_degraded;
L
Linus Torvalds 已提交
5423

5424
int md_run(struct mddev *mddev)
L
Linus Torvalds 已提交
5425
{
5426
	int err;
5427
	struct md_rdev *rdev;
5428
	struct md_personality *pers;
L
Linus Torvalds 已提交
5429

5430 5431
	if (list_empty(&mddev->disks))
		/* cannot run an array with no devices.. */
L
Linus Torvalds 已提交
5432 5433 5434 5435
		return -EINVAL;

	if (mddev->pers)
		return -EBUSY;
5436 5437 5438
	/* Cannot run until previous stop completes properly */
	if (mddev->sysfs_active)
		return -EBUSY;
5439

L
Linus Torvalds 已提交
5440 5441 5442
	/*
	 * Analyze all RAID superblock(s)
	 */
5443 5444 5445
	if (!mddev->raid_disks) {
		if (!mddev->persistent)
			return -EINVAL;
5446
		analyze_sbs(mddev);
5447
	}
L
Linus Torvalds 已提交
5448

5449 5450 5451 5452
	if (mddev->level != LEVEL_NONE)
		request_module("md-level-%d", mddev->level);
	else if (mddev->clevel[0])
		request_module("md-%s", mddev->clevel);
L
Linus Torvalds 已提交
5453 5454 5455 5456 5457 5458

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 */
5459
	mddev->has_superblocks = false;
N
NeilBrown 已提交
5460
	rdev_for_each(rdev, mddev) {
5461
		if (test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
5462 5463
			continue;
		sync_blockdev(rdev->bdev);
5464
		invalidate_bdev(rdev->bdev);
5465 5466 5467 5468 5469 5470 5471
		if (mddev->ro != 1 &&
		    (bdev_read_only(rdev->bdev) ||
		     bdev_read_only(rdev->meta_bdev))) {
			mddev->ro = 1;
			if (mddev->gendisk)
				set_disk_ro(mddev->gendisk, 1);
		}
5472

5473 5474 5475
		if (rdev->sb_page)
			mddev->has_superblocks = true;

5476 5477
		/* perform some consistency tests on the device.
		 * We don't want the data to overlap the metadata,
A
Andre Noll 已提交
5478
		 * Internal Bitmap issues have been handled elsewhere.
5479
		 */
5480 5481 5482
		if (rdev->meta_bdev) {
			/* Nothing to check */;
		} else if (rdev->data_offset < rdev->sb_start) {
A
Andre Noll 已提交
5483 5484
			if (mddev->dev_sectors &&
			    rdev->data_offset + mddev->dev_sectors
5485
			    > rdev->sb_start) {
5486 5487
				pr_warn("md: %s: data overlaps metadata\n",
					mdname(mddev));
5488 5489 5490
				return -EINVAL;
			}
		} else {
5491
			if (rdev->sb_start + rdev->sb_size/512
5492
			    > rdev->data_offset) {
5493 5494
				pr_warn("md: %s: metadata overlaps data\n",
					mdname(mddev));
5495 5496 5497
				return -EINVAL;
			}
		}
N
NeilBrown 已提交
5498
		sysfs_notify_dirent_safe(rdev->sysfs_state);
L
Linus Torvalds 已提交
5499 5500
	}

5501 5502 5503 5504
	if (!bioset_initialized(&mddev->bio_set)) {
		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
		if (err)
			return err;
5505
	}
5506 5507 5508
	if (!bioset_initialized(&mddev->sync_set)) {
		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
		if (err)
X
Xiao Ni 已提交
5509
			goto abort;
5510
	}
5511

L
Linus Torvalds 已提交
5512
	spin_lock(&pers_lock);
5513
	pers = find_pers(mddev->level, mddev->clevel);
5514
	if (!pers || !try_module_get(pers->owner)) {
L
Linus Torvalds 已提交
5515
		spin_unlock(&pers_lock);
5516
		if (mddev->level != LEVEL_NONE)
5517 5518
			pr_warn("md: personality for level %d is not loaded!\n",
				mddev->level);
5519
		else
5520 5521
			pr_warn("md: personality for level %s is not loaded!\n",
				mddev->clevel);
X
Xiao Ni 已提交
5522 5523
		err = -EINVAL;
		goto abort;
L
Linus Torvalds 已提交
5524 5525
	}
	spin_unlock(&pers_lock);
5526 5527 5528 5529
	if (mddev->level != pers->level) {
		mddev->level = pers->level;
		mddev->new_level = pers->level;
	}
5530
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
L
Linus Torvalds 已提交
5531

5532
	if (mddev->reshape_position != MaxSector &&
5533
	    pers->start_reshape == NULL) {
5534 5535
		/* This personality cannot handle reshaping... */
		module_put(pers->owner);
X
Xiao Ni 已提交
5536 5537
		err = -EINVAL;
		goto abort;
5538 5539
	}

5540 5541 5542 5543 5544
	if (pers->sync_request) {
		/* Warn if this is a potentially silly
		 * configuration.
		 */
		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5545
		struct md_rdev *rdev2;
5546
		int warned = 0;
5547

N
NeilBrown 已提交
5548 5549
		rdev_for_each(rdev, mddev)
			rdev_for_each(rdev2, mddev) {
5550 5551 5552
				if (rdev < rdev2 &&
				    rdev->bdev->bd_contains ==
				    rdev2->bdev->bd_contains) {
5553 5554 5555 5556
					pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
						mdname(mddev),
						bdevname(rdev->bdev,b),
						bdevname(rdev2->bdev,b2));
5557 5558 5559
					warned = 1;
				}
			}
5560

5561
		if (warned)
5562
			pr_warn("True protection against single-disk failure might be compromised.\n");
5563 5564
	}

5565
	mddev->recovery = 0;
A
Andre Noll 已提交
5566 5567 5568
	/* may be over-ridden by personality */
	mddev->resync_max_sectors = mddev->dev_sectors;

5569
	mddev->ok_start_degraded = start_dirty_degraded;
L
Linus Torvalds 已提交
5570

5571
	if (start_readonly && mddev->ro == 0)
5572 5573
		mddev->ro = 2; /* read-only, but switch on first write */

5574
	err = pers->run(mddev);
5575
	if (err)
5576
		pr_warn("md: pers->run() failed ...\n");
5577
	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5578 5579 5580 5581 5582 5583
		WARN_ONCE(!mddev->external_size,
			  "%s: default size too small, but 'external_size' not in effect?\n",
			  __func__);
		pr_warn("md: invalid array_size %llu > default size %llu\n",
			(unsigned long long)mddev->array_sectors / 2,
			(unsigned long long)pers->size(mddev, 0, 0) / 2);
D
Dan Williams 已提交
5584 5585
		err = -EINVAL;
	}
5586
	if (err == 0 && pers->sync_request &&
5587
	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5588 5589 5590 5591 5592
		struct bitmap *bitmap;

		bitmap = bitmap_create(mddev, -1);
		if (IS_ERR(bitmap)) {
			err = PTR_ERR(bitmap);
5593 5594
			pr_warn("%s: failed to create bitmap (%d)\n",
				mdname(mddev), err);
5595 5596 5597
		} else
			mddev->bitmap = bitmap;

5598
	}
L
Linus Torvalds 已提交
5599
	if (err) {
5600
		mddev_detach(mddev);
5601 5602
		if (mddev->private)
			pers->free(mddev, mddev->private);
5603
		mddev->private = NULL;
5604
		module_put(pers->owner);
5605
		bitmap_destroy(mddev);
X
Xiao Ni 已提交
5606
		goto abort;
L
Linus Torvalds 已提交
5607
	}
5608
	if (mddev->queue) {
S
Shaohua Li 已提交
5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620
		bool nonrot = true;

		rdev_for_each(rdev, mddev) {
			if (rdev->raid_disk >= 0 &&
			    !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
				nonrot = false;
				break;
			}
		}
		if (mddev->degraded)
			nonrot = false;
		if (nonrot)
5621
			blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
S
Shaohua Li 已提交
5622
		else
5623
			blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
5624 5625
		mddev->queue->backing_dev_info->congested_data = mddev;
		mddev->queue->backing_dev_info->congested_fn = md_congested;
5626
	}
5627
	if (pers->sync_request) {
N
NeilBrown 已提交
5628 5629
		if (mddev->kobj.sd &&
		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5630 5631
			pr_warn("md: cannot register extra attributes for %s\n",
				mdname(mddev));
N
NeilBrown 已提交
5632
		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5633
	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
5634 5635
		mddev->ro = 0;

5636 5637
	atomic_set(&mddev->max_corr_read_errors,
		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
L
Linus Torvalds 已提交
5638
	mddev->safemode = 0;
5639 5640 5641 5642
	if (mddev_is_clustered(mddev))
		mddev->safemode_delay = 0;
	else
		mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
L
Linus Torvalds 已提交
5643
	mddev->in_sync = 1;
5644
	smp_wmb();
5645 5646 5647
	spin_lock(&mddev->lock);
	mddev->pers = pers;
	spin_unlock(&mddev->lock);
N
NeilBrown 已提交
5648
	rdev_for_each(rdev, mddev)
5649 5650
		if (rdev->raid_disk >= 0)
			if (sysfs_link_rdev(mddev, rdev))
N
NeilBrown 已提交
5651
				/* failure here is OK */;
5652

5653 5654 5655 5656 5657
	if (mddev->degraded && !mddev->ro)
		/* This ensures that recovering status is reported immediately
		 * via sysfs - until a lack of spares is confirmed.
		 */
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
L
Linus Torvalds 已提交
5658
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5659

5660
	if (mddev->sb_flags)
5661
		md_update_sb(mddev, 0);
L
Linus Torvalds 已提交
5662

5663
	md_new_event(mddev);
N
NeilBrown 已提交
5664 5665
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
5666
	sysfs_notify(&mddev->kobj, NULL, "degraded");
L
Linus Torvalds 已提交
5667
	return 0;
X
Xiao Ni 已提交
5668 5669

abort:
5670 5671
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
X
Xiao Ni 已提交
5672 5673

	return err;
L
Linus Torvalds 已提交
5674
}
5675
EXPORT_SYMBOL_GPL(md_run);
L
Linus Torvalds 已提交
5676

5677
static int do_md_run(struct mddev *mddev)
5678 5679 5680 5681 5682 5683
{
	int err;

	err = md_run(mddev);
	if (err)
		goto out;
5684 5685 5686 5687 5688
	err = bitmap_load(mddev);
	if (err) {
		bitmap_destroy(mddev);
		goto out;
	}
5689

5690 5691 5692
	if (mddev_is_clustered(mddev))
		md_allow_write(mddev);

5693 5694 5695
	/* run start up tasks that require md_thread */
	md_start(mddev);

5696 5697 5698
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

5699 5700
	set_capacity(mddev->gendisk, mddev->array_sectors);
	revalidate_disk(mddev->gendisk);
5701
	mddev->changed = 1;
5702 5703 5704 5705 5706
	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
out:
	return err;
}

5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721
int md_start(struct mddev *mddev)
{
	int ret = 0;

	if (mddev->pers->start) {
		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		ret = mddev->pers->start(mddev);
		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
		md_wakeup_thread(mddev->sync_thread);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(md_start);

5722
static int restart_array(struct mddev *mddev)
L
Linus Torvalds 已提交
5723 5724
{
	struct gendisk *disk = mddev->gendisk;
5725 5726 5727
	struct md_rdev *rdev;
	bool has_journal = false;
	bool has_readonly = false;
L
Linus Torvalds 已提交
5728

A
Andre Noll 已提交
5729
	/* Complain if it has no devices */
L
Linus Torvalds 已提交
5730
	if (list_empty(&mddev->disks))
A
Andre Noll 已提交
5731 5732 5733 5734 5735
		return -ENXIO;
	if (!mddev->pers)
		return -EINVAL;
	if (!mddev->ro)
		return -EBUSY;
5736

5737 5738 5739 5740 5741 5742 5743 5744 5745 5746
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		if (test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			has_journal = true;
		if (bdev_read_only(rdev->bdev))
			has_readonly = true;
	}
	rcu_read_unlock();
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
5747 5748
		/* Don't restart rw with journal missing/faulty */
			return -EINVAL;
5749 5750
	if (has_readonly)
		return -EROFS;
5751

A
Andre Noll 已提交
5752 5753 5754
	mddev->safemode = 0;
	mddev->ro = 0;
	set_disk_ro(disk, 0);
5755
	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
A
Andre Noll 已提交
5756 5757 5758 5759
	/* Kick recovery or resync if necessary */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread);
N
NeilBrown 已提交
5760
	sysfs_notify_dirent_safe(mddev->sysfs_state);
A
Andre Noll 已提交
5761
	return 0;
L
Linus Torvalds 已提交
5762 5763
}

5764
static void md_clean(struct mddev *mddev)
N
NeilBrown 已提交
5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778
{
	mddev->array_sectors = 0;
	mddev->external_size = 0;
	mddev->dev_sectors = 0;
	mddev->raid_disks = 0;
	mddev->recovery_cp = 0;
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->reshape_position = MaxSector;
	mddev->external = 0;
	mddev->persistent = 0;
	mddev->level = LEVEL_NONE;
	mddev->clevel[0] = 0;
	mddev->flags = 0;
5779
	mddev->sb_flags = 0;
N
NeilBrown 已提交
5780 5781 5782 5783 5784 5785 5786
	mddev->ro = 0;
	mddev->metadata_type[0] = 0;
	mddev->chunk_sectors = 0;
	mddev->ctime = mddev->utime = 0;
	mddev->layout = 0;
	mddev->max_disks = 0;
	mddev->events = 0;
5787
	mddev->can_decrease_events = 0;
N
NeilBrown 已提交
5788
	mddev->delta_disks = 0;
5789
	mddev->reshape_backwards = 0;
N
NeilBrown 已提交
5790 5791 5792 5793
	mddev->new_level = LEVEL_NONE;
	mddev->new_layout = 0;
	mddev->new_chunk_sectors = 0;
	mddev->curr_resync = 0;
5794
	atomic64_set(&mddev->resync_mismatches, 0);
N
NeilBrown 已提交
5795 5796 5797 5798
	mddev->suspend_lo = mddev->suspend_hi = 0;
	mddev->sync_speed_min = mddev->sync_speed_max = 0;
	mddev->recovery = 0;
	mddev->in_sync = 0;
5799
	mddev->changed = 0;
N
NeilBrown 已提交
5800 5801
	mddev->degraded = 0;
	mddev->safemode = 0;
5802
	mddev->private = NULL;
5803
	mddev->cluster_info = NULL;
N
NeilBrown 已提交
5804 5805
	mddev->bitmap_info.offset = 0;
	mddev->bitmap_info.default_offset = 0;
5806
	mddev->bitmap_info.default_space = 0;
N
NeilBrown 已提交
5807 5808 5809
	mddev->bitmap_info.chunksize = 0;
	mddev->bitmap_info.daemon_sleep = 0;
	mddev->bitmap_info.max_write_behind = 0;
5810
	mddev->bitmap_info.nodes = 0;
N
NeilBrown 已提交
5811 5812
}

5813
static void __md_stop_writes(struct mddev *mddev)
5814
{
5815
	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5816
	flush_workqueue(md_misc_wq);
5817 5818
	if (mddev->sync_thread) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5819
		md_reap_sync_thread(mddev);
5820 5821 5822 5823
	}

	del_timer_sync(&mddev->safemode_timer);

5824 5825 5826 5827
	if (mddev->pers && mddev->pers->quiesce) {
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
5828 5829
	bitmap_flush(mddev);

5830
	if (mddev->ro == 0 &&
5831
	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
5832
	     mddev->sb_flags)) {
5833
		/* mark array as shutdown cleanly */
5834 5835
		if (!mddev_is_clustered(mddev))
			mddev->in_sync = 1;
5836 5837 5838
		md_update_sb(mddev, 1);
	}
}
5839

5840
void md_stop_writes(struct mddev *mddev)
5841
{
5842
	mddev_lock_nointr(mddev);
5843 5844 5845
	__md_stop_writes(mddev);
	mddev_unlock(mddev);
}
5846
EXPORT_SYMBOL_GPL(md_stop_writes);
5847

5848 5849
static void mddev_detach(struct mddev *mddev)
{
5850
	bitmap_wait_behind_writes(mddev);
5851
	if (mddev->pers && mddev->pers->quiesce) {
5852 5853 5854 5855 5856 5857 5858 5859
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	md_unregister_thread(&mddev->thread);
	if (mddev->queue)
		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
}

5860
static void __md_stop(struct mddev *mddev)
N
NeilBrown 已提交
5861
{
5862
	struct md_personality *pers = mddev->pers;
5863
	bitmap_destroy(mddev);
5864
	mddev_detach(mddev);
5865 5866
	/* Ensure ->event_work is done */
	flush_workqueue(md_misc_wq);
5867
	spin_lock(&mddev->lock);
N
NeilBrown 已提交
5868
	mddev->pers = NULL;
5869 5870
	spin_unlock(&mddev->lock);
	pers->free(mddev, mddev->private);
5871
	mddev->private = NULL;
5872 5873 5874
	if (pers->sync_request && mddev->to_remove == NULL)
		mddev->to_remove = &md_redundancy_group;
	module_put(pers->owner);
N
NeilBrown 已提交
5875
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
N
NeilBrown 已提交
5876
}
5877 5878 5879 5880 5881 5882 5883

void md_stop(struct mddev *mddev)
{
	/* stop the array and free an attached data structures.
	 * This is called from dm-raid
	 */
	__md_stop(mddev);
5884 5885
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
5886 5887
}

5888
EXPORT_SYMBOL_GPL(md_stop);
N
NeilBrown 已提交
5889

5890
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5891 5892
{
	int err = 0;
5893 5894 5895 5896 5897 5898 5899
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
5900
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5901
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5902
	if (mddev->sync_thread)
5903 5904 5905
		/* Thread might be blocked waiting for metadata update
		 * which will now never happen */
		wake_up_process(mddev->sync_thread->tsk);
5906

5907
	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
5908
		return -EBUSY;
5909
	mddev_unlock(mddev);
5910 5911
	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
					  &mddev->recovery));
5912
	wait_event(mddev->sb_wait,
5913
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
5914 5915
	mddev_lock_nointr(mddev);

5916
	mutex_lock(&mddev->open_mutex);
5917
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5918
	    mddev->sync_thread ||
5919
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5920
		pr_warn("md: %s still in use.\n",mdname(mddev));
5921 5922
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5923
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5924 5925
			md_wakeup_thread(mddev->thread);
		}
5926 5927 5928 5929
		err = -EBUSY;
		goto out;
	}
	if (mddev->pers) {
5930
		__md_stop_writes(mddev);
5931 5932 5933 5934 5935 5936 5937

		err  = -ENXIO;
		if (mddev->ro==1)
			goto out;
		mddev->ro = 1;
		set_disk_ro(mddev->gendisk, 1);
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5938 5939
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
N
NeilBrown 已提交
5940
		sysfs_notify_dirent_safe(mddev->sysfs_state);
5941
		err = 0;
5942 5943 5944 5945 5946 5947
	}
out:
	mutex_unlock(&mddev->open_mutex);
	return err;
}

5948 5949 5950 5951
/* mode:
 *   0 - completely stop and dis-assemble array
 *   2 - stop but do not disassemble array
 */
5952
static int do_md_stop(struct mddev *mddev, int mode,
5953
		      struct block_device *bdev)
L
Linus Torvalds 已提交
5954 5955
{
	struct gendisk *disk = mddev->gendisk;
5956
	struct md_rdev *rdev;
5957 5958 5959 5960 5961 5962 5963
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
5964
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5965
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5966
	if (mddev->sync_thread)
5967 5968 5969
		/* Thread might be blocked waiting for metadata update
		 * which will now never happen */
		wake_up_process(mddev->sync_thread->tsk);
5970

5971
	mddev_unlock(mddev);
5972 5973 5974
	wait_event(resync_wait, (mddev->sync_thread == NULL &&
				 !test_bit(MD_RECOVERY_RUNNING,
					   &mddev->recovery)));
5975
	mddev_lock_nointr(mddev);
L
Linus Torvalds 已提交
5976

N
NeilBrown 已提交
5977
	mutex_lock(&mddev->open_mutex);
5978
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5979 5980
	    mddev->sysfs_active ||
	    mddev->sync_thread ||
5981
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5982
		pr_warn("md: %s still in use.\n",mdname(mddev));
N
NeilBrown 已提交
5983
		mutex_unlock(&mddev->open_mutex);
5984 5985
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5986
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5987 5988
			md_wakeup_thread(mddev->thread);
		}
5989 5990
		return -EBUSY;
	}
N
NeilBrown 已提交
5991
	if (mddev->pers) {
5992 5993
		if (mddev->ro)
			set_disk_ro(disk, 0);
5994

5995
		__md_stop_writes(mddev);
5996
		__md_stop(mddev);
5997
		mddev->queue->backing_dev_info->congested_fn = NULL;
N
NeilBrown 已提交
5998

5999
		/* tell userspace to handle 'inactive' */
N
NeilBrown 已提交
6000
		sysfs_notify_dirent_safe(mddev->sysfs_state);
6001

N
NeilBrown 已提交
6002
		rdev_for_each(rdev, mddev)
6003 6004
			if (rdev->raid_disk >= 0)
				sysfs_unlink_rdev(mddev, rdev);
6005

6006
		set_capacity(disk, 0);
N
NeilBrown 已提交
6007
		mutex_unlock(&mddev->open_mutex);
6008
		mddev->changed = 1;
6009
		revalidate_disk(disk);
6010

6011 6012
		if (mddev->ro)
			mddev->ro = 0;
N
NeilBrown 已提交
6013 6014
	} else
		mutex_unlock(&mddev->open_mutex);
L
Linus Torvalds 已提交
6015 6016 6017
	/*
	 * Free resources if final stop
	 */
6018
	if (mode == 0) {
6019
		pr_info("md: %s stopped.\n", mdname(mddev));
L
Linus Torvalds 已提交
6020

6021
		if (mddev->bitmap_info.file) {
6022 6023
			struct file *f = mddev->bitmap_info.file;
			spin_lock(&mddev->lock);
6024
			mddev->bitmap_info.file = NULL;
6025 6026
			spin_unlock(&mddev->lock);
			fput(f);
6027
		}
6028
		mddev->bitmap_info.offset = 0;
6029

L
Linus Torvalds 已提交
6030 6031
		export_array(mddev);

N
NeilBrown 已提交
6032
		md_clean(mddev);
6033 6034
		if (mddev->hold_active == UNTIL_STOP)
			mddev->hold_active = 0;
6035
	}
6036
	md_new_event(mddev);
N
NeilBrown 已提交
6037
	sysfs_notify_dirent_safe(mddev->sysfs_state);
N
NeilBrown 已提交
6038
	return 0;
L
Linus Torvalds 已提交
6039 6040
}

J
Jeff Garzik 已提交
6041
#ifndef MODULE
6042
static void autorun_array(struct mddev *mddev)
L
Linus Torvalds 已提交
6043
{
6044
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6045 6046
	int err;

6047
	if (list_empty(&mddev->disks))
L
Linus Torvalds 已提交
6048 6049
		return;

6050
	pr_info("md: running: ");
L
Linus Torvalds 已提交
6051

N
NeilBrown 已提交
6052
	rdev_for_each(rdev, mddev) {
L
Linus Torvalds 已提交
6053
		char b[BDEVNAME_SIZE];
6054
		pr_cont("<%s>", bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
6055
	}
6056
	pr_cont("\n");
L
Linus Torvalds 已提交
6057

6058
	err = do_md_run(mddev);
L
Linus Torvalds 已提交
6059
	if (err) {
6060
		pr_warn("md: do_md_run() returned %d\n", err);
6061
		do_md_stop(mddev, 0, NULL);
L
Linus Torvalds 已提交
6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078
	}
}

/*
 * lets try to run arrays based on all disks that have arrived
 * until now. (those are in pending_raid_disks)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(int part)
{
6079
	struct md_rdev *rdev0, *rdev, *tmp;
6080
	struct mddev *mddev;
L
Linus Torvalds 已提交
6081 6082
	char b[BDEVNAME_SIZE];

6083
	pr_info("md: autorun ...\n");
L
Linus Torvalds 已提交
6084
	while (!list_empty(&pending_raid_disks)) {
6085
		int unit;
L
Linus Torvalds 已提交
6086
		dev_t dev;
6087
		LIST_HEAD(candidates);
L
Linus Torvalds 已提交
6088
		rdev0 = list_entry(pending_raid_disks.next,
6089
					 struct md_rdev, same_set);
L
Linus Torvalds 已提交
6090

6091
		pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
L
Linus Torvalds 已提交
6092
		INIT_LIST_HEAD(&candidates);
6093
		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
L
Linus Torvalds 已提交
6094
			if (super_90_load(rdev, rdev0, 0) >= 0) {
6095 6096
				pr_debug("md:  adding %s ...\n",
					 bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
6097 6098 6099 6100 6101 6102 6103
				list_move(&rdev->same_set, &candidates);
			}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
6104 6105 6106 6107 6108 6109 6110 6111 6112
		if (part) {
			dev = MKDEV(mdp_major,
				    rdev0->preferred_minor << MdpMinorShift);
			unit = MINOR(dev) >> MdpMinorShift;
		} else {
			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
			unit = MINOR(dev);
		}
		if (rdev0->preferred_minor != unit) {
6113 6114
			pr_warn("md: unit number in %s is bad: %d\n",
				bdevname(rdev0->bdev, b), rdev0->preferred_minor);
L
Linus Torvalds 已提交
6115 6116 6117 6118 6119
			break;
		}

		md_probe(dev, NULL, NULL);
		mddev = mddev_find(dev);
N
Neil Brown 已提交
6120 6121 6122
		if (!mddev || !mddev->gendisk) {
			if (mddev)
				mddev_put(mddev);
L
Linus Torvalds 已提交
6123 6124
			break;
		}
6125
		if (mddev_lock(mddev))
6126
			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
L
Linus Torvalds 已提交
6127 6128
		else if (mddev->raid_disks || mddev->major_version
			 || !list_empty(&mddev->disks)) {
6129
			pr_warn("md: %s already running, cannot run %s\n",
L
Linus Torvalds 已提交
6130 6131 6132
				mdname(mddev), bdevname(rdev0->bdev,b));
			mddev_unlock(mddev);
		} else {
6133
			pr_debug("md: created %s\n", mdname(mddev));
6134
			mddev->persistent = 1;
6135
			rdev_for_each_list(rdev, tmp, &candidates) {
L
Linus Torvalds 已提交
6136 6137 6138 6139 6140 6141 6142 6143 6144 6145
				list_del_init(&rdev->same_set);
				if (bind_rdev_to_array(rdev, mddev))
					export_rdev(rdev);
			}
			autorun_array(mddev);
			mddev_unlock(mddev);
		}
		/* on success, candidates will be empty, on error
		 * it won't...
		 */
6146
		rdev_for_each_list(rdev, tmp, &candidates) {
6147
			list_del_init(&rdev->same_set);
L
Linus Torvalds 已提交
6148
			export_rdev(rdev);
6149
		}
L
Linus Torvalds 已提交
6150 6151
		mddev_put(mddev);
	}
6152
	pr_info("md: ... autorun DONE.\n");
L
Linus Torvalds 已提交
6153
}
J
Jeff Garzik 已提交
6154
#endif /* !MODULE */
L
Linus Torvalds 已提交
6155

6156
static int get_version(void __user *arg)
L
Linus Torvalds 已提交
6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}

6170
static int get_array_info(struct mddev *mddev, void __user *arg)
L
Linus Torvalds 已提交
6171 6172
{
	mdu_array_info_t info;
6173
	int nr,working,insync,failed,spare;
6174
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6175

6176 6177 6178
	nr = working = insync = failed = spare = 0;
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
L
Linus Torvalds 已提交
6179
		nr++;
6180
		if (test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
6181 6182 6183
			failed++;
		else {
			working++;
6184
			if (test_bit(In_sync, &rdev->flags))
6185
				insync++;
6186 6187 6188
			else if (test_bit(Journal, &rdev->flags))
				/* TODO: add journal count to md_u.h */
				;
L
Linus Torvalds 已提交
6189 6190 6191 6192
			else
				spare++;
		}
	}
6193
	rcu_read_unlock();
L
Linus Torvalds 已提交
6194 6195 6196 6197

	info.major_version = mddev->major_version;
	info.minor_version = mddev->minor_version;
	info.patch_version = MD_PATCHLEVEL_VERSION;
6198
	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
L
Linus Torvalds 已提交
6199
	info.level         = mddev->level;
A
Andre Noll 已提交
6200 6201
	info.size          = mddev->dev_sectors / 2;
	if (info.size != mddev->dev_sectors / 2) /* overflow */
6202
		info.size = -1;
L
Linus Torvalds 已提交
6203 6204 6205 6206 6207
	info.nr_disks      = nr;
	info.raid_disks    = mddev->raid_disks;
	info.md_minor      = mddev->md_minor;
	info.not_persistent= !mddev->persistent;

6208
	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
L
Linus Torvalds 已提交
6209 6210 6211
	info.state         = 0;
	if (mddev->in_sync)
		info.state = (1<<MD_SB_CLEAN);
6212
	if (mddev->bitmap && mddev->bitmap_info.offset)
6213
		info.state |= (1<<MD_SB_BITMAP_PRESENT);
6214 6215
	if (mddev_is_clustered(mddev))
		info.state |= (1<<MD_SB_CLUSTERED);
6216
	info.active_disks  = insync;
L
Linus Torvalds 已提交
6217 6218 6219 6220 6221
	info.working_disks = working;
	info.failed_disks  = failed;
	info.spare_disks   = spare;

	info.layout        = mddev->layout;
6222
	info.chunk_size    = mddev->chunk_sectors << 9;
L
Linus Torvalds 已提交
6223 6224 6225 6226 6227 6228 6229

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

6230
static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6231 6232
{
	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6233
	char *ptr;
6234
	int err;
6235

6236
	file = kzalloc(sizeof(*file), GFP_NOIO);
6237
	if (!file)
6238
		return -ENOMEM;
6239

6240 6241
	err = 0;
	spin_lock(&mddev->lock);
6242 6243 6244 6245 6246 6247 6248 6249 6250 6251
	/* bitmap enabled */
	if (mddev->bitmap_info.file) {
		ptr = file_path(mddev->bitmap_info.file, file->pathname,
				sizeof(file->pathname));
		if (IS_ERR(ptr))
			err = PTR_ERR(ptr);
		else
			memmove(file->pathname, ptr,
				sizeof(file->pathname)-(ptr-file->pathname));
	}
6252
	spin_unlock(&mddev->lock);
6253

6254 6255
	if (err == 0 &&
	    copy_to_user(arg, file, sizeof(*file)))
6256
		err = -EFAULT;
6257

6258 6259 6260 6261
	kfree(file);
	return err;
}

6262
static int get_disk_info(struct mddev *mddev, void __user * arg)
L
Linus Torvalds 已提交
6263 6264
{
	mdu_disk_info_t info;
6265
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6266 6267 6268 6269

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

6270
	rcu_read_lock();
6271
	rdev = md_find_rdev_nr_rcu(mddev, info.number);
L
Linus Torvalds 已提交
6272 6273 6274 6275 6276
	if (rdev) {
		info.major = MAJOR(rdev->bdev->bd_dev);
		info.minor = MINOR(rdev->bdev->bd_dev);
		info.raid_disk = rdev->raid_disk;
		info.state = 0;
6277
		if (test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
6278
			info.state |= (1<<MD_DISK_FAULTY);
6279
		else if (test_bit(In_sync, &rdev->flags)) {
L
Linus Torvalds 已提交
6280 6281 6282
			info.state |= (1<<MD_DISK_ACTIVE);
			info.state |= (1<<MD_DISK_SYNC);
		}
S
Shaohua Li 已提交
6283
		if (test_bit(Journal, &rdev->flags))
6284
			info.state |= (1<<MD_DISK_JOURNAL);
6285 6286
		if (test_bit(WriteMostly, &rdev->flags))
			info.state |= (1<<MD_DISK_WRITEMOSTLY);
6287 6288
		if (test_bit(FailFast, &rdev->flags))
			info.state |= (1<<MD_DISK_FAILFAST);
L
Linus Torvalds 已提交
6289 6290 6291 6292 6293
	} else {
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	}
6294
	rcu_read_unlock();
L
Linus Torvalds 已提交
6295 6296 6297 6298 6299 6300 6301

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

6302
static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
L
Linus Torvalds 已提交
6303 6304
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
6305
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6306 6307
	dev_t dev = MKDEV(info->major,info->minor);

6308 6309
	if (mddev_is_clustered(mddev) &&
		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6310 6311
		pr_warn("%s: Cannot add to clustered mddev.\n",
			mdname(mddev));
6312 6313 6314
		return -EINVAL;
	}

L
Linus Torvalds 已提交
6315 6316 6317 6318 6319 6320 6321 6322
	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
6323
			pr_warn("md: md_import_device returned %ld\n",
L
Linus Torvalds 已提交
6324 6325 6326 6327
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
6328 6329 6330
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
6331
			err = super_types[mddev->major_version]
L
Linus Torvalds 已提交
6332 6333
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
6334
				pr_warn("md: %s has different UUID to %s\n",
6335
					bdevname(rdev->bdev,b),
L
Linus Torvalds 已提交
6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares".  They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
6355 6356
			pr_warn("%s: personality does not support diskops!\n",
				mdname(mddev));
L
Linus Torvalds 已提交
6357 6358
			return -EINVAL;
		}
6359 6360 6361 6362 6363
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
L
Linus Torvalds 已提交
6364
		if (IS_ERR(rdev)) {
6365
			pr_warn("md: md_import_device returned %ld\n",
L
Linus Torvalds 已提交
6366 6367 6368
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
6369
		/* set saved_raid_disk if appropriate */
6370 6371
		if (!mddev->persistent) {
			if (info->state & (1<<MD_DISK_SYNC)  &&
6372
			    info->raid_disk < mddev->raid_disks) {
6373
				rdev->raid_disk = info->raid_disk;
6374
				set_bit(In_sync, &rdev->flags);
6375
				clear_bit(Bitmap_sync, &rdev->flags);
6376
			} else
6377
				rdev->raid_disk = -1;
6378
			rdev->saved_raid_disk = rdev->raid_disk;
6379 6380 6381
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
6382
		if ((info->state & (1<<MD_DISK_SYNC)) &&
6383
		     rdev->raid_disk != info->raid_disk) {
6384 6385 6386 6387 6388 6389 6390
			/* This was a hot-add request, but events doesn't
			 * match, so reject it.
			 */
			export_rdev(rdev);
			return -EINVAL;
		}

6391
		clear_bit(In_sync, &rdev->flags); /* just to be sure */
6392 6393
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
6394 6395
		else
			clear_bit(WriteMostly, &rdev->flags);
6396 6397 6398 6399
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
		else
			clear_bit(FailFast, &rdev->flags);
6400

6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411
		if (info->state & (1<<MD_DISK_JOURNAL)) {
			struct md_rdev *rdev2;
			bool has_journal = false;

			/* make sure no existing journal disk */
			rdev_for_each(rdev2, mddev) {
				if (test_bit(Journal, &rdev2->flags)) {
					has_journal = true;
					break;
				}
			}
6412
			if (has_journal || mddev->bitmap) {
6413 6414 6415
				export_rdev(rdev);
				return -EBUSY;
			}
6416
			set_bit(Journal, &rdev->flags);
6417
		}
6418 6419 6420 6421
		/*
		 * check whether the device shows up in other nodes
		 */
		if (mddev_is_clustered(mddev)) {
6422
			if (info->state & (1 << MD_DISK_CANDIDATE))
6423
				set_bit(Candidate, &rdev->flags);
6424
			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6425
				/* --add initiated by this node */
6426
				err = md_cluster_ops->add_new_disk(mddev, rdev);
6427 6428 6429 6430 6431 6432 6433
				if (err) {
					export_rdev(rdev);
					return err;
				}
			}
		}

L
Linus Torvalds 已提交
6434 6435
		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
6436

L
Linus Torvalds 已提交
6437 6438
		if (err)
			export_rdev(rdev);
6439 6440

		if (mddev_is_clustered(mddev)) {
6441 6442 6443 6444 6445 6446 6447 6448
			if (info->state & (1 << MD_DISK_CANDIDATE)) {
				if (!err) {
					err = md_cluster_ops->new_disk_ack(mddev,
						err == 0);
					if (err)
						md_kick_rdev_from_array(rdev);
				}
			} else {
6449 6450 6451 6452 6453 6454 6455
				if (err)
					md_cluster_ops->add_new_disk_cancel(mddev);
				else
					err = add_bound_rdev(rdev);
			}

		} else if (!err)
G
Goldwyn Rodrigues 已提交
6456
			err = add_bound_rdev(rdev);
6457

L
Linus Torvalds 已提交
6458 6459 6460 6461 6462 6463 6464
		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
6465
		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
L
Linus Torvalds 已提交
6466 6467 6468 6469 6470
		return -EINVAL;
	}

	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
6471
		rdev = md_import_device(dev, -1, 0);
L
Linus Torvalds 已提交
6472
		if (IS_ERR(rdev)) {
6473
			pr_warn("md: error, md_import_device() returned %ld\n",
L
Linus Torvalds 已提交
6474 6475 6476 6477 6478 6479 6480 6481 6482 6483
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
6484 6485
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
6486

6487 6488
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
6489 6490
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
6491

L
Linus Torvalds 已提交
6492
		if (!mddev->persistent) {
6493
			pr_debug("md: nonpersistent superblock ...\n");
6494 6495
			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
		} else
6496
			rdev->sb_start = calc_dev_sboffset(rdev);
6497
		rdev->sectors = rdev->sb_start;
L
Linus Torvalds 已提交
6498

6499 6500 6501 6502 6503
		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}
L
Linus Torvalds 已提交
6504 6505 6506 6507 6508
	}

	return 0;
}

6509
static int hot_remove_disk(struct mddev *mddev, dev_t dev)
L
Linus Torvalds 已提交
6510 6511
{
	char b[BDEVNAME_SIZE];
6512
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6513 6514 6515 6516 6517

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

6518 6519
	if (rdev->raid_disk < 0)
		goto kick_rdev;
6520

6521 6522 6523
	clear_bit(Blocked, &rdev->flags);
	remove_and_add_spares(mddev, rdev);

L
Linus Torvalds 已提交
6524 6525 6526
	if (rdev->raid_disk >= 0)
		goto busy;

6527
kick_rdev:
6528
	if (mddev_is_clustered(mddev))
6529 6530
		md_cluster_ops->remove_disk(mddev, rdev);

6531
	md_kick_rdev_from_array(rdev);
6532
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6533 6534 6535 6536
	if (mddev->thread)
		md_wakeup_thread(mddev->thread);
	else
		md_update_sb(mddev, 1);
6537
	md_new_event(mddev);
L
Linus Torvalds 已提交
6538 6539 6540

	return 0;
busy:
6541 6542
	pr_debug("md: cannot remove active disk %s from %s ...\n",
		 bdevname(rdev->bdev,b), mdname(mddev));
L
Linus Torvalds 已提交
6543 6544 6545
	return -EBUSY;
}

6546
static int hot_add_disk(struct mddev *mddev, dev_t dev)
L
Linus Torvalds 已提交
6547 6548 6549
{
	char b[BDEVNAME_SIZE];
	int err;
6550
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6551 6552 6553 6554 6555

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
6556
		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
L
Linus Torvalds 已提交
6557 6558 6559 6560
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
6561
		pr_warn("%s: personality does not support diskops!\n",
L
Linus Torvalds 已提交
6562 6563 6564 6565
			mdname(mddev));
		return -EINVAL;
	}

6566
	rdev = md_import_device(dev, -1, 0);
L
Linus Torvalds 已提交
6567
	if (IS_ERR(rdev)) {
6568
		pr_warn("md: error, md_import_device() returned %ld\n",
L
Linus Torvalds 已提交
6569 6570 6571 6572 6573
			PTR_ERR(rdev));
		return -EINVAL;
	}

	if (mddev->persistent)
6574
		rdev->sb_start = calc_dev_sboffset(rdev);
L
Linus Torvalds 已提交
6575
	else
6576
		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
L
Linus Torvalds 已提交
6577

6578
	rdev->sectors = rdev->sb_start;
L
Linus Torvalds 已提交
6579

6580
	if (test_bit(Faulty, &rdev->flags)) {
6581
		pr_warn("md: can not hot-add faulty %s disk to %s!\n",
L
Linus Torvalds 已提交
6582 6583 6584 6585
			bdevname(rdev->bdev,b), mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}
6586

6587
	clear_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
6588
	rdev->desc_nr = -1;
6589
	rdev->saved_raid_disk = -1;
6590 6591
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
6592
		goto abort_export;
L
Linus Torvalds 已提交
6593 6594 6595 6596 6597 6598 6599 6600

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */

	rdev->raid_disk = -1;

6601
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6602 6603
	if (!mddev->thread)
		md_update_sb(mddev, 1);
L
Linus Torvalds 已提交
6604 6605 6606 6607 6608 6609
	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
6610
	md_new_event(mddev);
L
Linus Torvalds 已提交
6611 6612 6613 6614 6615 6616 6617
	return 0;

abort_export:
	export_rdev(rdev);
	return err;
}

6618
static int set_bitmap_file(struct mddev *mddev, int fd)
6619
{
6620
	int err = 0;
6621

6622
	if (mddev->pers) {
6623
		if (!mddev->pers->quiesce || !mddev->thread)
6624 6625 6626 6627 6628
			return -EBUSY;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		/* we should be able to change the bitmap.. */
	}
6629

6630
	if (fd >= 0) {
6631
		struct inode *inode;
N
NeilBrown 已提交
6632 6633 6634
		struct file *f;

		if (mddev->bitmap || mddev->bitmap_info.file)
6635
			return -EEXIST; /* cannot add when bitmap is present */
N
NeilBrown 已提交
6636
		f = fget(fd);
6637

N
NeilBrown 已提交
6638
		if (f == NULL) {
6639 6640
			pr_warn("%s: error: failed to get bitmap file\n",
				mdname(mddev));
6641 6642 6643
			return -EBADF;
		}

N
NeilBrown 已提交
6644
		inode = f->f_mapping->host;
6645
		if (!S_ISREG(inode->i_mode)) {
6646 6647
			pr_warn("%s: error: bitmap file must be a regular file\n",
				mdname(mddev));
6648
			err = -EBADF;
N
NeilBrown 已提交
6649
		} else if (!(f->f_mode & FMODE_WRITE)) {
6650 6651
			pr_warn("%s: error: bitmap file must open for write\n",
				mdname(mddev));
6652 6653
			err = -EBADF;
		} else if (atomic_read(&inode->i_writecount) != 1) {
6654 6655
			pr_warn("%s: error: bitmap file is already in use\n",
				mdname(mddev));
6656 6657 6658
			err = -EBUSY;
		}
		if (err) {
N
NeilBrown 已提交
6659
			fput(f);
6660 6661
			return err;
		}
N
NeilBrown 已提交
6662
		mddev->bitmap_info.file = f;
6663
		mddev->bitmap_info.offset = 0; /* file overrides offset */
6664 6665 6666 6667
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
6668
		if (fd >= 0) {
6669 6670 6671
			struct bitmap *bitmap;

			bitmap = bitmap_create(mddev, -1);
6672
			mddev_suspend(mddev);
6673 6674
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
6675
				err = bitmap_load(mddev);
6676 6677
			} else
				err = PTR_ERR(bitmap);
6678 6679 6680 6681
			if (err) {
				bitmap_destroy(mddev);
				fd = -1;
			}
6682
			mddev_resume(mddev);
6683
		} else if (fd < 0) {
6684
			mddev_suspend(mddev);
6685
			bitmap_destroy(mddev);
6686
			mddev_resume(mddev);
6687 6688 6689
		}
	}
	if (fd < 0) {
6690 6691 6692 6693 6694 6695 6696
		struct file *f = mddev->bitmap_info.file;
		if (f) {
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
6697 6698
	}

6699 6700 6701
	return err;
}

L
Linus Torvalds 已提交
6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714
/*
 * set_array_info is used two different ways
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent,layout,chunksize determine the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  use to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept incase the
 *  super_block handler wishes to interpret them.
 */
6715
static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
L
Linus Torvalds 已提交
6716 6717 6718 6719 6720
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
6721
		    info->major_version >= ARRAY_SIZE(super_types) ||
L
Linus Torvalds 已提交
6722 6723
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
6724
			pr_warn("md: superblock version %d not known\n",
L
Linus Torvalds 已提交
6725 6726 6727 6728 6729 6730
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
6731
		mddev->persistent = !info->not_persistent;
6732 6733 6734
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
		 */
6735
		mddev->ctime         = ktime_get_real_seconds();
L
Linus Torvalds 已提交
6736 6737 6738 6739 6740
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
6741
	mddev->ctime         = ktime_get_real_seconds();
L
Linus Torvalds 已提交
6742 6743

	mddev->level         = info->level;
6744
	mddev->clevel[0]     = 0;
A
Andre Noll 已提交
6745
	mddev->dev_sectors   = 2 * (sector_t)info->size;
L
Linus Torvalds 已提交
6746 6747 6748 6749 6750 6751 6752 6753 6754
	mddev->raid_disks    = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * openned
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent    = ! info->not_persistent;
6755
	mddev->external	     = 0;
L
Linus Torvalds 已提交
6756 6757

	mddev->layout        = info->layout;
6758
	mddev->chunk_sectors = info->chunk_size >> 9;
L
Linus Torvalds 已提交
6759

6760
	if (mddev->persistent) {
6761 6762 6763
		mddev->max_disks = MD_SB_DISKS;
		mddev->flags = 0;
		mddev->sb_flags = 0;
6764 6765
	}
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
L
Linus Torvalds 已提交
6766

6767
	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6768
	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6769
	mddev->bitmap_info.offset = 0;
6770

6771 6772
	mddev->reshape_position = MaxSector;

L
Linus Torvalds 已提交
6773 6774 6775 6776 6777
	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

6778
	mddev->new_level = mddev->level;
6779
	mddev->new_chunk_sectors = mddev->chunk_sectors;
6780 6781
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
6782
	mddev->reshape_backwards = 0;
6783

L
Linus Torvalds 已提交
6784 6785 6786
	return 0;
}

6787
void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6788
{
S
Shaohua Li 已提交
6789
	lockdep_assert_held(&mddev->reconfig_mutex);
D
Dan Williams 已提交
6790 6791 6792 6793

	if (mddev->external_size)
		return;

6794 6795 6796 6797
	mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);

6798
static int update_size(struct mddev *mddev, sector_t num_sectors)
6799
{
6800
	struct md_rdev *rdev;
6801
	int rv;
6802
	int fit = (num_sectors == 0);
6803
	sector_t old_dev_sectors = mddev->dev_sectors;
6804

6805 6806
	if (mddev->pers->resize == NULL)
		return -EINVAL;
6807 6808 6809 6810 6811
	/* The "num_sectors" is the number of sectors of each device that
	 * is used.  This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
6812
	 * sb_start or, if that is <data_offset, it must fit before the size
6813 6814
	 * of each device.  If num_sectors is zero, we find the largest size
	 * that fits.
6815
	 */
6816 6817
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->sync_thread)
6818
		return -EBUSY;
6819 6820
	if (mddev->ro)
		return -EROFS;
6821

N
NeilBrown 已提交
6822
	rdev_for_each(rdev, mddev) {
6823
		sector_t avail = rdev->sectors;
6824

6825 6826 6827
		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;
		if (avail < num_sectors)
6828 6829
			return -ENOSPC;
	}
6830
	rv = mddev->pers->resize(mddev, num_sectors);
6831
	if (!rv) {
6832 6833 6834
		if (mddev_is_clustered(mddev))
			md_cluster_ops->update_size(mddev, old_dev_sectors);
		else if (mddev->queue) {
6835 6836 6837 6838
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}
6839 6840 6841
	return rv;
}

6842
static int update_raid_disks(struct mddev *mddev, int raid_disks)
6843 6844
{
	int rv;
6845
	struct md_rdev *rdev;
6846
	/* change the number of raid disks */
6847
	if (mddev->pers->check_reshape == NULL)
6848
		return -EINVAL;
6849 6850
	if (mddev->ro)
		return -EROFS;
6851
	if (raid_disks <= 0 ||
6852
	    (mddev->max_disks && raid_disks >= mddev->max_disks))
6853
		return -EINVAL;
6854 6855 6856
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector)
6857
		return -EBUSY;
6858 6859 6860 6861 6862 6863 6864 6865 6866 6867

	rdev_for_each(rdev, mddev) {
		if (mddev->raid_disks < raid_disks &&
		    rdev->data_offset < rdev->new_data_offset)
			return -EINVAL;
		if (mddev->raid_disks > raid_disks &&
		    rdev->data_offset > rdev->new_data_offset)
			return -EINVAL;
	}

6868
	mddev->delta_disks = raid_disks - mddev->raid_disks;
6869 6870 6871 6872
	if (mddev->delta_disks < 0)
		mddev->reshape_backwards = 1;
	else if (mddev->delta_disks > 0)
		mddev->reshape_backwards = 0;
6873 6874

	rv = mddev->pers->check_reshape(mddev);
6875
	if (rv < 0) {
6876
		mddev->delta_disks = 0;
6877 6878
		mddev->reshape_backwards = 0;
	}
6879 6880 6881
	return rv;
}

L
Linus Torvalds 已提交
6882 6883 6884 6885 6886 6887 6888 6889
/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
 * fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 */
6890
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
L
Linus Torvalds 已提交
6891 6892 6893
{
	int rv = 0;
	int cnt = 0;
6894 6895 6896
	int state = 0;

	/* calculate expected state,ignoring low bits */
6897
	if (mddev->bitmap && mddev->bitmap_info.offset)
6898
		state |= (1 << MD_SB_BITMAP_PRESENT);
L
Linus Torvalds 已提交
6899 6900 6901 6902 6903 6904 6905

	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
/*	    mddev->patch_version != info->patch_version || */
	    mddev->ctime         != info->ctime         ||
	    mddev->level         != info->level         ||
/*	    mddev->layout        != info->layout        || */
F
Firo Yang 已提交
6906
	    mddev->persistent	 != !info->not_persistent ||
6907
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
6908 6909 6910
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    ((state^info->state) & 0xfffffe00)
		)
L
Linus Torvalds 已提交
6911 6912
		return -EINVAL;
	/* Check there is only one change */
A
Andre Noll 已提交
6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;
L
Linus Torvalds 已提交
6925 6926 6927 6928 6929 6930

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
6931
		if (mddev->pers->check_reshape == NULL)
L
Linus Torvalds 已提交
6932
			return -EINVAL;
6933 6934
		else {
			mddev->new_layout = info->layout;
6935
			rv = mddev->pers->check_reshape(mddev);
6936 6937 6938 6939
			if (rv)
				mddev->new_layout = mddev->layout;
			return rv;
		}
L
Linus Torvalds 已提交
6940
	}
A
Andre Noll 已提交
6941
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6942
		rv = update_size(mddev, (sector_t)info->size * 2);
6943

6944 6945 6946
	if (mddev->raid_disks    != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

6947
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
6948 6949 6950 6951 6952 6953 6954 6955
		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
			rv = -EINVAL;
			goto err;
		}
		if (mddev->recovery || mddev->sync_thread) {
			rv = -EBUSY;
			goto err;
		}
6956
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
6957
			struct bitmap *bitmap;
6958
			/* add the bitmap */
6959 6960 6961 6962 6963 6964 6965 6966
			if (mddev->bitmap) {
				rv = -EEXIST;
				goto err;
			}
			if (mddev->bitmap_info.default_offset == 0) {
				rv = -EINVAL;
				goto err;
			}
6967 6968
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
6969 6970
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
6971
			bitmap = bitmap_create(mddev, -1);
6972
			mddev_suspend(mddev);
6973 6974
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
6975
				rv = bitmap_load(mddev);
6976 6977
			} else
				rv = PTR_ERR(bitmap);
6978 6979
			if (rv)
				bitmap_destroy(mddev);
6980
			mddev_resume(mddev);
6981 6982
		} else {
			/* remove the bitmap */
6983 6984 6985 6986 6987 6988 6989 6990
			if (!mddev->bitmap) {
				rv = -ENOENT;
				goto err;
			}
			if (mddev->bitmap->storage.file) {
				rv = -EINVAL;
				goto err;
			}
6991 6992 6993
			if (mddev->bitmap_info.nodes) {
				/* hold PW on all the bitmap lock */
				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
6994
					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
6995 6996 6997 6998 6999 7000 7001 7002
					rv = -EPERM;
					md_cluster_ops->unlock_all_bitmaps(mddev);
					goto err;
				}

				mddev->bitmap_info.nodes = 0;
				md_cluster_ops->leave(mddev);
			}
7003
			mddev_suspend(mddev);
7004
			bitmap_destroy(mddev);
7005
			mddev_resume(mddev);
7006
			mddev->bitmap_info.offset = 0;
7007 7008
		}
	}
7009
	md_update_sb(mddev, 1);
7010 7011
	return rv;
err:
L
Linus Torvalds 已提交
7012 7013 7014
	return rv;
}

7015
static int set_disk_faulty(struct mddev *mddev, dev_t dev)
L
Linus Torvalds 已提交
7016
{
7017
	struct md_rdev *rdev;
7018
	int err = 0;
L
Linus Torvalds 已提交
7019 7020 7021 7022

	if (mddev->pers == NULL)
		return -ENODEV;

7023
	rcu_read_lock();
7024
	rdev = md_find_rdev_rcu(mddev, dev);
L
Linus Torvalds 已提交
7025
	if (!rdev)
7026 7027 7028 7029 7030 7031 7032 7033
		err =  -ENODEV;
	else {
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags))
			err = -EBUSY;
	}
	rcu_read_unlock();
	return err;
L
Linus Torvalds 已提交
7034 7035
}

7036 7037 7038 7039 7040 7041
/*
 * We have a problem here : there is no easy way to give a CHS
 * virtual geometry. We currently pretend that we have a 2 heads
 * 4 sectors (with a BIG number of cylinders...). This drives
 * dosfs just mad... ;-)
 */
7042 7043
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
7044
	struct mddev *mddev = bdev->bd_disk->private_data;
7045 7046 7047

	geo->heads = 2;
	geo->sectors = 4;
7048
	geo->cylinders = mddev->array_sectors / 8;
7049 7050 7051
	return 0;
}

7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070
static inline bool md_ioctl_valid(unsigned int cmd)
{
	switch (cmd) {
	case ADD_NEW_DISK:
	case BLKROSET:
	case GET_ARRAY_INFO:
	case GET_BITMAP_FILE:
	case GET_DISK_INFO:
	case HOT_ADD_DISK:
	case HOT_REMOVE_DISK:
	case RAID_AUTORUN:
	case RAID_VERSION:
	case RESTART_ARRAY_RW:
	case RUN_ARRAY:
	case SET_ARRAY_INFO:
	case SET_BITMAP_FILE:
	case SET_DISK_FAULTY:
	case STOP_ARRAY:
	case STOP_ARRAY_RO:
7071
	case CLUSTERED_DISK_NACK:
7072 7073 7074 7075 7076 7077
		return true;
	default:
		return false;
	}
}

A
Al Viro 已提交
7078
static int md_ioctl(struct block_device *bdev, fmode_t mode,
L
Linus Torvalds 已提交
7079 7080 7081 7082
			unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
7083
	struct mddev *mddev = NULL;
7084
	int ro;
7085
	bool did_set_md_closing = false;
L
Linus Torvalds 已提交
7086

7087 7088 7089
	if (!md_ioctl_valid(cmd))
		return -ENOTTY;

7090 7091 7092 7093 7094 7095 7096 7097 7098
	switch (cmd) {
	case RAID_VERSION:
	case GET_ARRAY_INFO:
	case GET_DISK_INFO:
		break;
	default:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
	}
L
Linus Torvalds 已提交
7099 7100 7101 7102 7103

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
7104 7105 7106
	switch (cmd) {
	case RAID_VERSION:
		err = get_version(argp);
7107
		goto out;
L
Linus Torvalds 已提交
7108 7109

#ifndef MODULE
7110 7111 7112
	case RAID_AUTORUN:
		err = 0;
		autostart_arrays(arg);
7113
		goto out;
L
Linus Torvalds 已提交
7114
#endif
7115
	default:;
L
Linus Torvalds 已提交
7116 7117 7118 7119 7120 7121
	}

	/*
	 * Commands creating/starting a new array:
	 */

A
Al Viro 已提交
7122
	mddev = bdev->bd_disk->private_data;
L
Linus Torvalds 已提交
7123 7124 7125

	if (!mddev) {
		BUG();
7126
		goto out;
L
Linus Torvalds 已提交
7127 7128
	}

7129 7130 7131 7132 7133 7134 7135
	/* Some actions do not requires the mutex */
	switch (cmd) {
	case GET_ARRAY_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_array_info(mddev, argp);
7136
		goto out;
7137 7138 7139 7140 7141 7142

	case GET_DISK_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_disk_info(mddev, argp);
7143
		goto out;
7144 7145 7146

	case SET_DISK_FAULTY:
		err = set_disk_faulty(mddev, new_decode_dev(arg));
7147
		goto out;
7148 7149 7150 7151 7152

	case GET_BITMAP_FILE:
		err = get_bitmap_file(mddev, argp);
		goto out;

7153 7154
	}

7155 7156 7157 7158
	if (cmd == ADD_NEW_DISK)
		/* need to ensure md_delayed_delete() has completed */
		flush_workqueue(md_misc_wq);

7159 7160 7161 7162
	if (cmd == HOT_REMOVE_DISK)
		/* need to ensure recovery thread has run */
		wait_event_interruptible_timeout(mddev->sb_wait,
						 !test_bit(MD_RECOVERY_NEEDED,
7163
							   &mddev->recovery),
7164
						 msecs_to_jiffies(5000));
7165 7166 7167 7168 7169
	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
		/* Need to flush page cache, and ensure no-one else opens
		 * and writes
		 */
		mutex_lock(&mddev->open_mutex);
7170
		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
7171 7172
			mutex_unlock(&mddev->open_mutex);
			err = -EBUSY;
7173
			goto out;
7174
		}
7175
		WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
7176
		set_bit(MD_CLOSING, &mddev->flags);
7177
		did_set_md_closing = true;
7178 7179 7180
		mutex_unlock(&mddev->open_mutex);
		sync_blockdev(bdev);
	}
L
Linus Torvalds 已提交
7181 7182
	err = mddev_lock(mddev);
	if (err) {
7183 7184
		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
			 err, cmd);
7185
		goto out;
L
Linus Torvalds 已提交
7186 7187
	}

7188 7189 7190 7191 7192 7193
	if (cmd == SET_ARRAY_INFO) {
		mdu_array_info_t info;
		if (!arg)
			memset(&info, 0, sizeof(info));
		else if (copy_from_user(&info, argp, sizeof(info))) {
			err = -EFAULT;
7194
			goto unlock;
7195 7196 7197 7198
		}
		if (mddev->pers) {
			err = update_array_info(mddev, &info);
			if (err) {
7199
				pr_warn("md: couldn't update array info. %d\n", err);
7200
				goto unlock;
L
Linus Torvalds 已提交
7201
			}
7202
			goto unlock;
7203 7204
		}
		if (!list_empty(&mddev->disks)) {
7205
			pr_warn("md: array %s already has disks!\n", mdname(mddev));
7206
			err = -EBUSY;
7207
			goto unlock;
7208 7209
		}
		if (mddev->raid_disks) {
7210
			pr_warn("md: array %s already initialised!\n", mdname(mddev));
7211
			err = -EBUSY;
7212
			goto unlock;
7213 7214 7215
		}
		err = set_array_info(mddev, &info);
		if (err) {
7216
			pr_warn("md: couldn't set array info. %d\n", err);
7217
			goto unlock;
7218
		}
7219
		goto unlock;
L
Linus Torvalds 已提交
7220 7221 7222 7223 7224
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
7225
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7226
	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7227 7228 7229 7230
	if ((!mddev->raid_disks && !mddev->external)
	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
	    && cmd != GET_BITMAP_FILE) {
L
Linus Torvalds 已提交
7231
		err = -ENODEV;
7232
		goto unlock;
L
Linus Torvalds 已提交
7233 7234 7235 7236 7237
	}

	/*
	 * Commands even a read-only array can execute:
	 */
7238 7239 7240
	switch (cmd) {
	case RESTART_ARRAY_RW:
		err = restart_array(mddev);
7241
		goto unlock;
L
Linus Torvalds 已提交
7242

7243 7244
	case STOP_ARRAY:
		err = do_md_stop(mddev, 0, bdev);
7245
		goto unlock;
L
Linus Torvalds 已提交
7246

7247 7248
	case STOP_ARRAY_RO:
		err = md_set_readonly(mddev, bdev);
7249
		goto unlock;
L
Linus Torvalds 已提交
7250

7251 7252
	case HOT_REMOVE_DISK:
		err = hot_remove_disk(mddev, new_decode_dev(arg));
7253
		goto unlock;
7254

7255 7256
	case ADD_NEW_DISK:
		/* We can support ADD_NEW_DISK on read-only arrays
W
Wei Fang 已提交
7257
		 * only if we are re-adding a preexisting device.
7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268
		 * So require mddev->pers and MD_DISK_SYNC.
		 */
		if (mddev->pers) {
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else if (!(info.state & (1<<MD_DISK_SYNC)))
				/* Need to clear read-only for this */
				break;
			else
				err = add_new_disk(mddev, &info);
7269
			goto unlock;
7270 7271 7272
		}
		break;

7273 7274 7275
	case BLKROSET:
		if (get_user(ro, (int __user *)(arg))) {
			err = -EFAULT;
7276
			goto unlock;
7277 7278
		}
		err = -EINVAL;
7279

7280 7281 7282 7283
		/* if the bdev is going readonly the value of mddev->ro
		 * does not matter, no writes are coming
		 */
		if (ro)
7284
			goto unlock;
7285

7286 7287
		/* are we are already prepared for writes? */
		if (mddev->ro != 1)
7288
			goto unlock;
7289

7290 7291 7292 7293 7294 7295 7296 7297
		/* transitioning to readauto need only happen for
		 * arrays that call md_write_start
		 */
		if (mddev->pers) {
			err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
7298
			}
7299
		}
7300
		goto unlock;
L
Linus Torvalds 已提交
7301 7302 7303 7304
	}

	/*
	 * The remaining ioctls are changing the state of the
7305
	 * superblock, so we do not allow them on read-only arrays.
L
Linus Torvalds 已提交
7306
	 */
7307
	if (mddev->ro && mddev->pers) {
7308 7309
		if (mddev->ro == 2) {
			mddev->ro = 0;
N
NeilBrown 已提交
7310
			sysfs_notify_dirent_safe(mddev->sysfs_state);
7311
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7312 7313 7314 7315
			/* mddev_unlock will wake thread */
			/* If a device failed while we were read-only, we
			 * need to make sure the metadata is updated now.
			 */
7316
			if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7317 7318
				mddev_unlock(mddev);
				wait_event(mddev->sb_wait,
7319 7320
					   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
					   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7321
				mddev_lock_nointr(mddev);
7322
			}
7323 7324
		} else {
			err = -EROFS;
7325
			goto unlock;
7326
		}
L
Linus Torvalds 已提交
7327 7328
	}

7329 7330
	switch (cmd) {
	case ADD_NEW_DISK:
L
Linus Torvalds 已提交
7331
	{
7332 7333 7334 7335 7336
		mdu_disk_info_t info;
		if (copy_from_user(&info, argp, sizeof(info)))
			err = -EFAULT;
		else
			err = add_new_disk(mddev, &info);
7337
		goto unlock;
7338
	}
L
Linus Torvalds 已提交
7339

7340 7341 7342 7343 7344 7345 7346
	case CLUSTERED_DISK_NACK:
		if (mddev_is_clustered(mddev))
			md_cluster_ops->new_disk_ack(mddev, false);
		else
			err = -EINVAL;
		goto unlock;

7347 7348
	case HOT_ADD_DISK:
		err = hot_add_disk(mddev, new_decode_dev(arg));
7349
		goto unlock;
L
Linus Torvalds 已提交
7350

7351 7352
	case RUN_ARRAY:
		err = do_md_run(mddev);
7353
		goto unlock;
L
Linus Torvalds 已提交
7354

7355 7356
	case SET_BITMAP_FILE:
		err = set_bitmap_file(mddev, (int)arg);
7357
		goto unlock;
7358

7359 7360
	default:
		err = -EINVAL;
7361
		goto unlock;
L
Linus Torvalds 已提交
7362 7363
	}

7364
unlock:
7365 7366 7367
	if (mddev->hold_active == UNTIL_IOCTL &&
	    err != -EINVAL)
		mddev->hold_active = 0;
L
Linus Torvalds 已提交
7368
	mddev_unlock(mddev);
7369
out:
7370 7371
	if(did_set_md_closing)
		clear_bit(MD_CLOSING, &mddev->flags);
L
Linus Torvalds 已提交
7372 7373
	return err;
}
7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392
#ifdef CONFIG_COMPAT
static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
		    unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case HOT_REMOVE_DISK:
	case HOT_ADD_DISK:
	case SET_DISK_FAULTY:
	case SET_BITMAP_FILE:
		/* These take in integer arg, do not convert */
		break;
	default:
		arg = (unsigned long)compat_ptr(arg);
		break;
	}

	return md_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */
L
Linus Torvalds 已提交
7393

A
Al Viro 已提交
7394
static int md_open(struct block_device *bdev, fmode_t mode)
L
Linus Torvalds 已提交
7395 7396 7397 7398 7399
{
	/*
	 * Succeed if we can lock the mddev, which confirms that
	 * it isn't being stopped right now.
	 */
7400
	struct mddev *mddev = mddev_find(bdev->bd_dev);
L
Linus Torvalds 已提交
7401 7402
	int err;

7403 7404 7405
	if (!mddev)
		return -ENODEV;

7406 7407 7408 7409 7410 7411
	if (mddev->gendisk != bdev->bd_disk) {
		/* we are racing with mddev_put which is discarding this
		 * bd_disk.
		 */
		mddev_put(mddev);
		/* Wait until bdev->bd_disk is definitely gone */
T
Tejun Heo 已提交
7412
		flush_workqueue(md_misc_wq);
7413 7414 7415 7416 7417
		/* Then retry the open from the top */
		return -ERESTARTSYS;
	}
	BUG_ON(mddev != bdev->bd_disk->private_data);

N
NeilBrown 已提交
7418
	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
L
Linus Torvalds 已提交
7419 7420
		goto out;

7421 7422
	if (test_bit(MD_CLOSING, &mddev->flags)) {
		mutex_unlock(&mddev->open_mutex);
7423 7424
		err = -ENODEV;
		goto out;
7425 7426
	}

L
Linus Torvalds 已提交
7427
	err = 0;
7428
	atomic_inc(&mddev->openers);
N
NeilBrown 已提交
7429
	mutex_unlock(&mddev->open_mutex);
L
Linus Torvalds 已提交
7430

7431
	check_disk_change(bdev);
L
Linus Torvalds 已提交
7432
 out:
7433 7434
	if (err)
		mddev_put(mddev);
L
Linus Torvalds 已提交
7435 7436 7437
	return err;
}

7438
static void md_release(struct gendisk *disk, fmode_t mode)
L
Linus Torvalds 已提交
7439
{
7440
	struct mddev *mddev = disk->private_data;
L
Linus Torvalds 已提交
7441

E
Eric Sesterhenn 已提交
7442
	BUG_ON(!mddev);
7443
	atomic_dec(&mddev->openers);
L
Linus Torvalds 已提交
7444 7445
	mddev_put(mddev);
}
7446 7447 7448

static int md_media_changed(struct gendisk *disk)
{
7449
	struct mddev *mddev = disk->private_data;
7450 7451 7452 7453 7454 7455

	return mddev->changed;
}

static int md_revalidate(struct gendisk *disk)
{
7456
	struct mddev *mddev = disk->private_data;
7457 7458 7459 7460

	mddev->changed = 0;
	return 0;
}
7461
static const struct block_device_operations md_fops =
L
Linus Torvalds 已提交
7462 7463
{
	.owner		= THIS_MODULE,
A
Al Viro 已提交
7464 7465
	.open		= md_open,
	.release	= md_release,
N
NeilBrown 已提交
7466
	.ioctl		= md_ioctl,
7467 7468 7469
#ifdef CONFIG_COMPAT
	.compat_ioctl	= md_compat_ioctl,
#endif
7470
	.getgeo		= md_getgeo,
7471 7472
	.media_changed  = md_media_changed,
	.revalidate_disk= md_revalidate,
L
Linus Torvalds 已提交
7473 7474
};

7475
static int md_thread(void *arg)
L
Linus Torvalds 已提交
7476
{
7477
	struct md_thread *thread = arg;
L
Linus Torvalds 已提交
7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490

	/*
	 * md_thread is a 'system-thread', it's priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */

N
NeilBrown 已提交
7491
	allow_signal(SIGKILL);
7492
	while (!kthread_should_stop()) {
L
Linus Torvalds 已提交
7493

7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504
		/* We need to wait INTERRUPTIBLE so that
		 * we don't add to the load-average.
		 * That means we need to be sure no signals are
		 * pending
		 */
		if (signal_pending(current))
			flush_signals(current);

		wait_event_interruptible_timeout
			(thread->wqueue,
			 test_bit(THREAD_WAKEUP, &thread->flags)
7505
			 || kthread_should_stop() || kthread_should_park(),
7506
			 thread->timeout);
L
Linus Torvalds 已提交
7507

7508
		clear_bit(THREAD_WAKEUP, &thread->flags);
7509 7510
		if (kthread_should_park())
			kthread_parkme();
7511
		if (!kthread_should_stop())
S
Shaohua Li 已提交
7512
			thread->run(thread);
L
Linus Torvalds 已提交
7513
	}
7514

L
Linus Torvalds 已提交
7515 7516 7517
	return 0;
}

7518
void md_wakeup_thread(struct md_thread *thread)
L
Linus Torvalds 已提交
7519 7520
{
	if (thread) {
7521
		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7522 7523
		set_bit(THREAD_WAKEUP, &thread->flags);
		wake_up(&thread->wqueue);
L
Linus Torvalds 已提交
7524 7525
	}
}
7526
EXPORT_SYMBOL(md_wakeup_thread);
L
Linus Torvalds 已提交
7527

S
Shaohua Li 已提交
7528 7529
struct md_thread *md_register_thread(void (*run) (struct md_thread *),
		struct mddev *mddev, const char *name)
L
Linus Torvalds 已提交
7530
{
7531
	struct md_thread *thread;
L
Linus Torvalds 已提交
7532

7533
	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
L
Linus Torvalds 已提交
7534 7535 7536 7537 7538 7539 7540
	if (!thread)
		return NULL;

	init_waitqueue_head(&thread->wqueue);

	thread->run = run;
	thread->mddev = mddev;
7541
	thread->timeout = MAX_SCHEDULE_TIMEOUT;
7542 7543 7544
	thread->tsk = kthread_run(md_thread, thread,
				  "%s_%s",
				  mdname(thread->mddev),
7545
				  name);
7546
	if (IS_ERR(thread->tsk)) {
L
Linus Torvalds 已提交
7547 7548 7549 7550 7551
		kfree(thread);
		return NULL;
	}
	return thread;
}
7552
EXPORT_SYMBOL(md_register_thread);
L
Linus Torvalds 已提交
7553

7554
void md_unregister_thread(struct md_thread **threadp)
L
Linus Torvalds 已提交
7555
{
7556
	struct md_thread *thread = *threadp;
7557 7558
	if (!thread)
		return;
7559
	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7560 7561 7562 7563 7564 7565
	/* Locking ensures that mddev_unlock does not wake_up a
	 * non-existent thread
	 */
	spin_lock(&pers_lock);
	*threadp = NULL;
	spin_unlock(&pers_lock);
7566 7567

	kthread_stop(thread->tsk);
L
Linus Torvalds 已提交
7568 7569
	kfree(thread);
}
7570
EXPORT_SYMBOL(md_unregister_thread);
L
Linus Torvalds 已提交
7571

7572
void md_error(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
7573
{
7574
	if (!rdev || test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
7575
		return;
7576

7577
	if (!mddev->pers || !mddev->pers->error_handler)
L
Linus Torvalds 已提交
7578 7579
		return;
	mddev->pers->error_handler(mddev,rdev);
7580 7581
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
N
NeilBrown 已提交
7582
	sysfs_notify_dirent_safe(rdev->sysfs_state);
L
Linus Torvalds 已提交
7583 7584 7585
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
7586
	if (mddev->event_work.func)
T
Tejun Heo 已提交
7587
		queue_work(md_misc_wq, &mddev->event_work);
7588
	md_new_event(mddev);
L
Linus Torvalds 已提交
7589
}
7590
EXPORT_SYMBOL(md_error);
L
Linus Torvalds 已提交
7591 7592 7593 7594 7595 7596

/* seq_file implementation /proc/mdstat */

static void status_unused(struct seq_file *seq)
{
	int i = 0;
7597
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
7598 7599 7600

	seq_printf(seq, "unused devices: ");

7601
	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
L
Linus Torvalds 已提交
7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612
		char b[BDEVNAME_SIZE];
		i++;
		seq_printf(seq, "%s ",
			      bdevname(rdev->bdev,b));
	}
	if (!i)
		seq_printf(seq, "<none>");

	seq_printf(seq, "\n");
}

7613
static int status_resync(struct seq_file *seq, struct mddev *mddev)
L
Linus Torvalds 已提交
7614
{
7615 7616 7617
	sector_t max_sectors, resync, res;
	unsigned long dt, db;
	sector_t rt;
7618 7619
	int scale;
	unsigned int per_milli;
L
Linus Torvalds 已提交
7620

7621 7622
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7623
		max_sectors = mddev->resync_max_sectors;
L
Linus Torvalds 已提交
7624
	else
7625
		max_sectors = mddev->dev_sectors;
L
Linus Torvalds 已提交
7626

7627 7628 7629 7630 7631
	resync = mddev->curr_resync;
	if (resync <= 3) {
		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
			/* Still cleaning up */
			resync = max_sectors;
7632 7633 7634
	} else if (resync > max_sectors)
		resync = max_sectors;
	else
7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648
		resync -= atomic_read(&mddev->recovery_active);

	if (resync == 0) {
		if (mddev->recovery_cp < MaxSector) {
			seq_printf(seq, "\tresync=PENDING");
			return 1;
		}
		return 0;
	}
	if (resync < 3) {
		seq_printf(seq, "\tresync=DELAYED");
		return 1;
	}

N
NeilBrown 已提交
7649
	WARN_ON(max_sectors == 0);
7650
	/* Pick 'scale' such that (resync>>scale)*1000 will fit
7651
	 * in a sector_t, and (max_sectors>>scale) will fit in a
7652 7653 7654 7655 7656
	 * u32, as those are the requirements for sector_div.
	 * Thus 'scale' must be at least 10
	 */
	scale = 10;
	if (sizeof(sector_t) > sizeof(unsigned long)) {
7657
		while ( max_sectors/2 > (1ULL<<(scale+32)))
7658 7659 7660
			scale++;
	}
	res = (resync>>scale)*1000;
7661
	sector_div(res, (u32)((max_sectors>>scale)+1));
7662 7663

	per_milli = res;
L
Linus Torvalds 已提交
7664
	{
7665
		int i, x = per_milli/50, y = 20-x;
L
Linus Torvalds 已提交
7666 7667 7668 7669 7670 7671 7672 7673
		seq_printf(seq, "[");
		for (i = 0; i < x; i++)
			seq_printf(seq, "=");
		seq_printf(seq, ">");
		for (i = 0; i < y; i++)
			seq_printf(seq, ".");
		seq_printf(seq, "] ");
	}
7674
	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
7675 7676
		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
		    "reshape" :
7677 7678 7679 7680 7681
		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
		     "check" :
		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
		      "resync" : "recovery"))),
		   per_milli/10, per_milli % 10,
7682 7683
		   (unsigned long long) resync/2,
		   (unsigned long long) max_sectors/2);
L
Linus Torvalds 已提交
7684 7685 7686 7687 7688

	/*
	 * dt: time from mark until now
	 * db: blocks written from mark until now
	 * rt: remaining time
7689 7690 7691 7692
	 *
	 * rt is a sector_t, so could be 32bit or 64bit.
	 * So we divide before multiply in case it is 32bit and close
	 * to the limit.
L
Lucas De Marchi 已提交
7693
	 * We scale the divisor (db) by 32 to avoid losing precision
7694 7695 7696 7697
	 * near the end of resync when the number of remaining sectors
	 * is close to 'db'.
	 * We then divide rt by 32 after multiplying by db to compensate.
	 * The '+1' avoids division by zero if db is very small.
L
Linus Torvalds 已提交
7698 7699 7700
	 */
	dt = ((jiffies - mddev->resync_mark) / HZ);
	if (!dt) dt++;
7701 7702
	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
		- mddev->resync_mark_cnt;
L
Linus Torvalds 已提交
7703

7704 7705 7706 7707 7708 7709 7710
	rt = max_sectors - resync;    /* number of remaining sectors */
	sector_div(rt, db/32+1);
	rt *= dt;
	rt >>= 5;

	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
		   ((unsigned long)rt % 60)/6);
L
Linus Torvalds 已提交
7711

7712
	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
7713
	return 1;
L
Linus Torvalds 已提交
7714 7715 7716 7717 7718 7719
}

static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct list_head *tmp;
	loff_t l = *pos;
7720
	struct mddev *mddev;
L
Linus Torvalds 已提交
7721 7722 7723 7724 7725 7726 7727 7728 7729 7730

	if (l >= 0x10000)
		return NULL;
	if (!l--)
		/* header */
		return (void*)1;

	spin_lock(&all_mddevs_lock);
	list_for_each(tmp,&all_mddevs)
		if (!l--) {
7731
			mddev = list_entry(tmp, struct mddev, all_mddevs);
L
Linus Torvalds 已提交
7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			return mddev;
		}
	spin_unlock(&all_mddevs_lock);
	if (!l--)
		return (void*)2;/* tail */
	return NULL;
}

static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct list_head *tmp;
7745
	struct mddev *next_mddev, *mddev = v;
7746

L
Linus Torvalds 已提交
7747 7748 7749 7750 7751 7752 7753 7754 7755 7756
	++*pos;
	if (v == (void*)2)
		return NULL;

	spin_lock(&all_mddevs_lock);
	if (v == (void*)1)
		tmp = all_mddevs.next;
	else
		tmp = mddev->all_mddevs.next;
	if (tmp != &all_mddevs)
7757
		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
L
Linus Torvalds 已提交
7758 7759 7760
	else {
		next_mddev = (void*)2;
		*pos = 0x10000;
7761
	}
L
Linus Torvalds 已提交
7762 7763 7764 7765 7766 7767 7768 7769 7770 7771
	spin_unlock(&all_mddevs_lock);

	if (v != (void*)1)
		mddev_put(mddev);
	return next_mddev;

}

static void md_seq_stop(struct seq_file *seq, void *v)
{
7772
	struct mddev *mddev = v;
L
Linus Torvalds 已提交
7773 7774 7775 7776 7777 7778 7779

	if (mddev && v != (void*)1 && v != (void*)2)
		mddev_put(mddev);
}

static int md_seq_show(struct seq_file *seq, void *v)
{
7780
	struct mddev *mddev = v;
7781
	sector_t sectors;
7782
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
7783 7784

	if (v == (void*)1) {
7785
		struct md_personality *pers;
L
Linus Torvalds 已提交
7786 7787
		seq_printf(seq, "Personalities : ");
		spin_lock(&pers_lock);
7788 7789
		list_for_each_entry(pers, &pers_list, list)
			seq_printf(seq, "[%s] ", pers->name);
L
Linus Torvalds 已提交
7790 7791 7792

		spin_unlock(&pers_lock);
		seq_printf(seq, "\n");
7793
		seq->poll_event = atomic_read(&md_event_count);
L
Linus Torvalds 已提交
7794 7795 7796 7797 7798 7799 7800
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}

7801
	spin_lock(&mddev->lock);
L
Linus Torvalds 已提交
7802 7803 7804 7805
	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : %sactive", mdname(mddev),
						mddev->pers ? "" : "in");
		if (mddev->pers) {
7806
			if (mddev->ro==1)
L
Linus Torvalds 已提交
7807
				seq_printf(seq, " (read-only)");
7808
			if (mddev->ro==2)
7809
				seq_printf(seq, " (auto-read-only)");
L
Linus Torvalds 已提交
7810 7811 7812
			seq_printf(seq, " %s", mddev->pers->name);
		}

7813
		sectors = 0;
7814 7815
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev) {
L
Linus Torvalds 已提交
7816 7817 7818
			char b[BDEVNAME_SIZE];
			seq_printf(seq, " %s[%d]",
				bdevname(rdev->bdev,b), rdev->desc_nr);
7819 7820
			if (test_bit(WriteMostly, &rdev->flags))
				seq_printf(seq, "(W)");
S
Shaohua Li 已提交
7821 7822
			if (test_bit(Journal, &rdev->flags))
				seq_printf(seq, "(J)");
7823
			if (test_bit(Faulty, &rdev->flags)) {
L
Linus Torvalds 已提交
7824 7825
				seq_printf(seq, "(F)");
				continue;
7826 7827
			}
			if (rdev->raid_disk < 0)
7828
				seq_printf(seq, "(S)"); /* spare */
7829 7830
			if (test_bit(Replacement, &rdev->flags))
				seq_printf(seq, "(R)");
7831
			sectors += rdev->sectors;
L
Linus Torvalds 已提交
7832
		}
7833
		rcu_read_unlock();
L
Linus Torvalds 已提交
7834 7835 7836 7837

		if (!list_empty(&mddev->disks)) {
			if (mddev->pers)
				seq_printf(seq, "\n      %llu blocks",
7838 7839
					   (unsigned long long)
					   mddev->array_sectors / 2);
L
Linus Torvalds 已提交
7840 7841
			else
				seq_printf(seq, "\n      %llu blocks",
7842
					   (unsigned long long)sectors / 2);
L
Linus Torvalds 已提交
7843
		}
7844 7845 7846 7847 7848 7849 7850
		if (mddev->persistent) {
			if (mddev->major_version != 0 ||
			    mddev->minor_version != 90) {
				seq_printf(seq," super %d.%d",
					   mddev->major_version,
					   mddev->minor_version);
			}
7851 7852 7853 7854
		} else if (mddev->external)
			seq_printf(seq, " super external:%s",
				   mddev->metadata_type);
		else
7855
			seq_printf(seq, " super non-persistent");
L
Linus Torvalds 已提交
7856 7857

		if (mddev->pers) {
7858
			mddev->pers->status(seq, mddev);
7859
			seq_printf(seq, "\n      ");
7860
			if (mddev->pers->sync_request) {
7861
				if (status_resync(seq, mddev))
7862 7863
					seq_printf(seq, "\n      ");
			}
7864 7865 7866
		} else
			seq_printf(seq, "\n       ");

7867
		bitmap_status(seq, mddev->bitmap);
L
Linus Torvalds 已提交
7868 7869 7870

		seq_printf(seq, "\n");
	}
7871
	spin_unlock(&mddev->lock);
7872

L
Linus Torvalds 已提交
7873 7874 7875
	return 0;
}

J
Jan Engelhardt 已提交
7876
static const struct seq_operations md_seq_ops = {
L
Linus Torvalds 已提交
7877 7878 7879 7880 7881 7882 7883 7884
	.start  = md_seq_start,
	.next   = md_seq_next,
	.stop   = md_seq_stop,
	.show   = md_seq_show,
};

static int md_seq_open(struct inode *inode, struct file *file)
{
7885
	struct seq_file *seq;
L
Linus Torvalds 已提交
7886 7887 7888
	int error;

	error = seq_open(file, &md_seq_ops);
7889
	if (error)
7890 7891 7892 7893
		return error;

	seq = file->private_data;
	seq->poll_event = atomic_read(&md_event_count);
L
Linus Torvalds 已提交
7894 7895 7896
	return error;
}

7897
static int md_unloading;
7898
static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
7899
{
7900
	struct seq_file *seq = filp->private_data;
7901
	__poll_t mask;
7902

7903
	if (md_unloading)
7904
		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
7905 7906 7907
	poll_wait(filp, &md_event_waiters, wait);

	/* always allow read */
7908
	mask = EPOLLIN | EPOLLRDNORM;
7909

7910
	if (seq->poll_event != atomic_read(&md_event_count))
7911
		mask |= EPOLLERR | EPOLLPRI;
7912 7913 7914
	return mask;
}

7915
static const struct file_operations md_seq_fops = {
7916
	.owner		= THIS_MODULE,
L
Linus Torvalds 已提交
7917 7918 7919
	.open           = md_seq_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
7920
	.release	= seq_release,
7921
	.poll		= mdstat_poll,
L
Linus Torvalds 已提交
7922 7923
};

7924
int register_md_personality(struct md_personality *p)
L
Linus Torvalds 已提交
7925
{
7926 7927
	pr_debug("md: %s personality registered for level %d\n",
		 p->name, p->level);
L
Linus Torvalds 已提交
7928
	spin_lock(&pers_lock);
7929
	list_add_tail(&p->list, &pers_list);
L
Linus Torvalds 已提交
7930 7931 7932
	spin_unlock(&pers_lock);
	return 0;
}
7933
EXPORT_SYMBOL(register_md_personality);
L
Linus Torvalds 已提交
7934

7935
int unregister_md_personality(struct md_personality *p)
L
Linus Torvalds 已提交
7936
{
7937
	pr_debug("md: %s personality unregistered\n", p->name);
L
Linus Torvalds 已提交
7938
	spin_lock(&pers_lock);
7939
	list_del_init(&p->list);
L
Linus Torvalds 已提交
7940 7941 7942
	spin_unlock(&pers_lock);
	return 0;
}
7943
EXPORT_SYMBOL(unregister_md_personality);
L
Linus Torvalds 已提交
7944

7945 7946
int register_md_cluster_operations(struct md_cluster_operations *ops,
				   struct module *module)
7947
{
7948
	int ret = 0;
7949
	spin_lock(&pers_lock);
7950 7951 7952 7953 7954 7955
	if (md_cluster_ops != NULL)
		ret = -EALREADY;
	else {
		md_cluster_ops = ops;
		md_cluster_mod = module;
	}
7956
	spin_unlock(&pers_lock);
7957
	return ret;
7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971
}
EXPORT_SYMBOL(register_md_cluster_operations);

int unregister_md_cluster_operations(void)
{
	spin_lock(&pers_lock);
	md_cluster_ops = NULL;
	spin_unlock(&pers_lock);
	return 0;
}
EXPORT_SYMBOL(unregister_md_cluster_operations);

int md_setup_cluster(struct mddev *mddev, int nodes)
{
7972 7973
	if (!md_cluster_ops)
		request_module("md-cluster");
7974
	spin_lock(&pers_lock);
7975
	/* ensure module won't be unloaded */
7976
	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
7977
		pr_warn("can't find md-cluster module or get it's reference.\n");
7978 7979 7980 7981 7982
		spin_unlock(&pers_lock);
		return -ENOENT;
	}
	spin_unlock(&pers_lock);

G
Goldwyn Rodrigues 已提交
7983
	return md_cluster_ops->join(mddev, nodes);
7984 7985 7986 7987
}

void md_cluster_stop(struct mddev *mddev)
{
G
Goldwyn Rodrigues 已提交
7988 7989
	if (!md_cluster_ops)
		return;
7990 7991 7992 7993
	md_cluster_ops->leave(mddev);
	module_put(md_cluster_mod);
}

7994
static int is_mddev_idle(struct mddev *mddev, int init)
L
Linus Torvalds 已提交
7995
{
7996
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
7997
	int idle;
N
NeilBrown 已提交
7998
	int curr_events;
L
Linus Torvalds 已提交
7999 8000

	idle = 1;
8001 8002
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
L
Linus Torvalds 已提交
8003
		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
N
NeilBrown 已提交
8004 8005 8006
		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
			      (int)part_stat_read(&disk->part0, sectors[1]) -
			      atomic_read(&disk->sync_io);
8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026
		/* sync IO will cause sync_io to increase before the disk_stats
		 * as sync_io is counted when a request starts, and
		 * disk_stats is counted when it completes.
		 * So resync activity will cause curr_events to be smaller than
		 * when there was no such activity.
		 * non-sync IO will cause disk_stat to increase without
		 * increasing sync_io so curr_events will (eventually)
		 * be larger than it was before.  Once it becomes
		 * substantially larger, the test below will cause
		 * the array to appear non-idle, and resync will slow
		 * down.
		 * If there is a lot of outstanding resync activity when
		 * we set last_event to curr_events, then all that activity
		 * completing might cause the array to appear non-idle
		 * and resync will be slowed down even though there might
		 * not have been non-resync activity.  This will only
		 * happen once though.  'last_events' will soon reflect
		 * the state where there is little or no outstanding
		 * resync requests, and further resync activity will
		 * always make curr_events less than last_events.
8027
		 *
L
Linus Torvalds 已提交
8028
		 */
N
NeilBrown 已提交
8029
		if (init || curr_events - rdev->last_events > 64) {
L
Linus Torvalds 已提交
8030 8031 8032 8033
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
8034
	rcu_read_unlock();
L
Linus Torvalds 已提交
8035 8036 8037
	return idle;
}

8038
void md_done_sync(struct mddev *mddev, int blocks, int ok)
L
Linus Torvalds 已提交
8039 8040 8041 8042 8043
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
8044
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8045
		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
L
Linus Torvalds 已提交
8046 8047 8048 8049
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}
8050
EXPORT_SYMBOL(md_done_sync);
L
Linus Torvalds 已提交
8051

8052 8053
/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
8054 8055
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
8056 8057
 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed as the array is being suspend.
8058
 */
8059
bool md_write_start(struct mddev *mddev, struct bio *bi)
L
Linus Torvalds 已提交
8060
{
8061
	int did_change = 0;
8062

8063
	if (bio_data_dir(bi) != WRITE)
8064
		return true;
8065

8066 8067 8068 8069 8070 8071
	BUG_ON(mddev->ro == 1);
	if (mddev->ro == 2) {
		/* need to switch to read/write */
		mddev->ro = 0;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
8072
		md_wakeup_thread(mddev->sync_thread);
8073
		did_change = 1;
8074
	}
8075 8076
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
8077
	smp_mb(); /* Match smp_mb in set_in_sync() */
8078 8079
	if (mddev->safemode == 1)
		mddev->safemode = 0;
8080
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
N
NeilBrown 已提交
8081
	if (mddev->in_sync || mddev->sync_checkers) {
8082
		spin_lock(&mddev->lock);
8083 8084
		if (mddev->in_sync) {
			mddev->in_sync = 0;
8085 8086
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8087
			md_wakeup_thread(mddev->thread);
8088
			did_change = 1;
8089
		}
8090
		spin_unlock(&mddev->lock);
8091
	}
8092
	rcu_read_unlock();
8093
	if (did_change)
N
NeilBrown 已提交
8094
		sysfs_notify_dirent_safe(mddev->sysfs_state);
8095 8096
	if (!mddev->has_superblocks)
		return true;
8097
	wait_event(mddev->sb_wait,
8098 8099
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
		   mddev->suspended);
8100 8101 8102 8103 8104
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		percpu_ref_put(&mddev->writes_pending);
		return false;
	}
	return true;
L
Linus Torvalds 已提交
8105
}
8106
EXPORT_SYMBOL(md_write_start);
L
Linus Torvalds 已提交
8107

8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120
/* md_write_inc can only be called when md_write_start() has
 * already been called at least once of the current request.
 * It increments the counter and is useful when a single request
 * is split into several parts.  Each part causes an increment and
 * so needs a matching md_write_end().
 * Unlike md_write_start(), it is safe to call md_write_inc() inside
 * a spinlocked region.
 */
void md_write_inc(struct mddev *mddev, struct bio *bi)
{
	if (bio_data_dir(bi) != WRITE)
		return;
	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8121
	percpu_ref_get(&mddev->writes_pending);
8122 8123 8124
}
EXPORT_SYMBOL(md_write_inc);

8125
void md_write_end(struct mddev *mddev)
L
Linus Torvalds 已提交
8126
{
8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137
	percpu_ref_put(&mddev->writes_pending);

	if (mddev->safemode == 2)
		md_wakeup_thread(mddev->thread);
	else if (mddev->safemode_delay)
		/* The roundup() ensures this only performs locking once
		 * every ->safemode_delay jiffies
		 */
		mod_timer(&mddev->safemode_timer,
			  roundup(jiffies, mddev->safemode_delay) +
			  mddev->safemode_delay);
L
Linus Torvalds 已提交
8138
}
8139

8140
EXPORT_SYMBOL(md_write_end);
L
Linus Torvalds 已提交
8141

8142 8143 8144 8145 8146 8147
/* md_allow_write(mddev)
 * Calling this ensures that the array is marked 'active' so that writes
 * may proceed without blocking.  It is important to call this before
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 * Must be called with mddev_lock held.
 */
8148
void md_allow_write(struct mddev *mddev)
8149 8150
{
	if (!mddev->pers)
8151
		return;
8152
	if (mddev->ro)
8153
		return;
8154
	if (!mddev->pers->sync_request)
8155
		return;
8156

8157
	spin_lock(&mddev->lock);
8158 8159
	if (mddev->in_sync) {
		mddev->in_sync = 0;
8160 8161
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8162 8163 8164
		if (mddev->safemode_delay &&
		    mddev->safemode == 0)
			mddev->safemode = 1;
8165
		spin_unlock(&mddev->lock);
8166
		md_update_sb(mddev, 0);
N
NeilBrown 已提交
8167
		sysfs_notify_dirent_safe(mddev->sysfs_state);
8168 8169 8170
		/* wait for the dirty state to be recorded in the metadata */
		wait_event(mddev->sb_wait,
			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8171
	} else
8172
		spin_unlock(&mddev->lock);
8173 8174 8175
}
EXPORT_SYMBOL_GPL(md_allow_write);

L
Linus Torvalds 已提交
8176 8177
#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
8178
#define UPDATE_FREQUENCY (5*60*HZ)
S
Shaohua Li 已提交
8179
void md_do_sync(struct md_thread *thread)
L
Linus Torvalds 已提交
8180
{
S
Shaohua Li 已提交
8181
	struct mddev *mddev = thread->mddev;
8182
	struct mddev *mddev2;
L
Linus Torvalds 已提交
8183 8184
	unsigned int currspeed = 0,
		 window;
X
Xiao Ni 已提交
8185
	sector_t max_sectors,j, io_sectors, recovery_done;
L
Linus Torvalds 已提交
8186
	unsigned long mark[SYNC_MARKS];
8187
	unsigned long update_time;
L
Linus Torvalds 已提交
8188 8189 8190 8191
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
8192
	int skipped = 0;
8193
	struct md_rdev *rdev;
8194
	char *desc, *action = NULL;
M
majianpeng 已提交
8195
	struct blk_plug plug;
8196
	int ret;
L
Linus Torvalds 已提交
8197 8198

	/* just incase thread restarts... */
8199 8200
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
L
Linus Torvalds 已提交
8201
		return;
8202 8203
	if (mddev->ro) {/* never try to sync a read-only array */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8204
		return;
8205
	}
L
Linus Torvalds 已提交
8206

8207 8208 8209 8210 8211
	if (mddev_is_clustered(mddev)) {
		ret = md_cluster_ops->resync_start(mddev);
		if (ret)
			goto skip;

8212
		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8213 8214 8215 8216 8217 8218 8219 8220
		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
		     && ((unsigned long long)mddev->curr_resync_completed
			 < (unsigned long long)mddev->resync_max_sectors))
			goto skip;
	}

8221
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8222
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8223
			desc = "data-check";
8224 8225
			action = "check";
		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8226
			desc = "requested-resync";
8227 8228
			action = "repair";
		} else
8229 8230 8231 8232 8233 8234
			desc = "resync";
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		desc = "reshape";
	else
		desc = "recovery";

8235 8236
	mddev->last_sync_action = action ?: desc;

L
Linus Torvalds 已提交
8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253
	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commense
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */

	do {
8254
		int mddev2_minor = -1;
L
Linus Torvalds 已提交
8255 8256 8257
		mddev->curr_resync = 2;

	try_again:
8258
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
L
Linus Torvalds 已提交
8259
			goto skip;
8260
		for_each_mddev(mddev2, tmp) {
L
Linus Torvalds 已提交
8261 8262
			if (mddev2 == mddev)
				continue;
8263 8264 8265
			if (!mddev->parallel_resync
			&&  mddev2->curr_resync
			&&  match_mddev_units(mddev, mddev2)) {
L
Linus Torvalds 已提交
8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
8277 8278 8279 8280 8281
				/* We need to wait 'interruptible' so as not to
				 * contribute to the load average, and not to
				 * be caught by 'softlockup'
				 */
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8282
				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8283
				    mddev2->curr_resync >= mddev->curr_resync) {
8284 8285
					if (mddev2_minor != mddev2->md_minor) {
						mddev2_minor = mddev2->md_minor;
8286 8287 8288
						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
							desc, mdname(mddev),
							mdname(mddev2));
8289
					}
L
Linus Torvalds 已提交
8290
					mddev_put(mddev2);
8291 8292
					if (signal_pending(current))
						flush_signals(current);
L
Linus Torvalds 已提交
8293 8294 8295 8296 8297 8298 8299 8300 8301
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

8302
	j = 0;
8303
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
L
Linus Torvalds 已提交
8304
		/* resync follows the size requested by the personality,
8305
		 * which defaults to physical size, but can be virtual size
L
Linus Torvalds 已提交
8306 8307
		 */
		max_sectors = mddev->resync_max_sectors;
8308
		atomic64_set(&mddev->resync_mismatches, 0);
8309
		/* we don't use the checkpoint if there's a bitmap */
8310 8311 8312
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			j = mddev->resync_min;
		else if (!mddev->bitmap)
8313
			j = mddev->recovery_cp;
8314

8315
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8316
		max_sectors = mddev->resync_max_sectors;
8317
	else {
L
Linus Torvalds 已提交
8318
		/* recovery follows the physical size of devices */
A
Andre Noll 已提交
8319
		max_sectors = mddev->dev_sectors;
8320
		j = MaxSector;
8321
		rcu_read_lock();
N
NeilBrown 已提交
8322
		rdev_for_each_rcu(rdev, mddev)
8323
			if (rdev->raid_disk >= 0 &&
S
Shaohua Li 已提交
8324
			    !test_bit(Journal, &rdev->flags) &&
8325 8326 8327 8328
			    !test_bit(Faulty, &rdev->flags) &&
			    !test_bit(In_sync, &rdev->flags) &&
			    rdev->recovery_offset < j)
				j = rdev->recovery_offset;
8329
		rcu_read_unlock();
8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342

		/* If there is a bitmap, we need to make sure all
		 * writes that started before we added a spare
		 * complete before we start doing a recovery.
		 * Otherwise the write might complete and (via
		 * bitmap_endwrite) set a bit in the bitmap after the
		 * recovery has checked that bit and skipped that
		 * region.
		 */
		if (mddev->bitmap) {
			mddev->pers->quiesce(mddev, 1);
			mddev->pers->quiesce(mddev, 0);
		}
8343
	}
L
Linus Torvalds 已提交
8344

8345 8346 8347 8348
	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
		 speed_max(mddev), desc);
L
Linus Torvalds 已提交
8349

N
NeilBrown 已提交
8350
	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
8351

8352
	io_sectors = 0;
L
Linus Torvalds 已提交
8353 8354
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
8355
		mark_cnt[m] = io_sectors;
L
Linus Torvalds 已提交
8356 8357 8358 8359 8360 8361 8362 8363 8364
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32*(PAGE_SIZE/512);
8365 8366
	pr_debug("md: using %dk window, over a total of %lluk.\n",
		 window/2, (unsigned long long)max_sectors/2);
L
Linus Torvalds 已提交
8367 8368 8369 8370 8371

	atomic_set(&mddev->recovery_active, 0);
	last_check = 0;

	if (j>2) {
8372 8373
		pr_debug("md: resuming %s of %s from checkpoint.\n",
			 desc, mdname(mddev));
L
Linus Torvalds 已提交
8374
		mddev->curr_resync = j;
8375 8376
	} else
		mddev->curr_resync = 3; /* no longer delayed */
8377
	mddev->curr_resync_completed = j;
8378 8379
	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	md_new_event(mddev);
8380
	update_time = jiffies;
L
Linus Torvalds 已提交
8381

M
majianpeng 已提交
8382
	blk_start_plug(&plug);
L
Linus Torvalds 已提交
8383
	while (j < max_sectors) {
8384
		sector_t sectors;
L
Linus Torvalds 已提交
8385

8386
		skipped = 0;
8387

8388 8389 8390 8391
		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
8392
		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
8393
		     (j - mddev->curr_resync_completed)*2
8394 8395
		     >= mddev->resync_max - mddev->curr_resync_completed ||
		     mddev->curr_resync_completed > mddev->resync_max
8396
			    )) {
8397 8398 8399
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
8400
			mddev->curr_resync_completed = j;
K
kernelmail 已提交
8401 8402 8403
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
			    j > mddev->recovery_cp)
				mddev->recovery_cp = j;
8404
			update_time = jiffies;
8405
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8406
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8407
		}
8408

8409 8410
		while (j >= mddev->resync_max &&
		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8411 8412 8413 8414 8415 8416 8417
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
8418 8419
						 || test_bit(MD_RECOVERY_INTR,
							     &mddev->recovery));
8420
		}
8421

8422 8423
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;
8424

8425
		sectors = mddev->pers->sync_request(mddev, j, &skipped);
8426
		if (sectors == 0) {
8427
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8428
			break;
L
Linus Torvalds 已提交
8429
		}
8430 8431 8432 8433 8434 8435

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

8436 8437 8438
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

L
Linus Torvalds 已提交
8439
		j += sectors;
8440 8441 8442
		if (j > max_sectors)
			/* when skipping, extra large numbers can be returned. */
			j = max_sectors;
8443 8444
		if (j > 2)
			mddev->curr_resync = j;
8445
		mddev->curr_mark_cnt = io_sectors;
8446
		if (last_check == 0)
8447
			/* this is the earliest that rebuild will be
8448 8449 8450
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);
8451 8452

		if (last_check + window > io_sectors || j == max_sectors)
L
Linus Torvalds 已提交
8453 8454
			continue;

8455
		last_check = io_sectors;
L
Linus Torvalds 已提交
8456 8457 8458 8459 8460 8461 8462 8463
	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
8464
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
L
Linus Torvalds 已提交
8465 8466 8467
			last_mark = next;
		}

8468 8469
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;
L
Linus Torvalds 已提交
8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480

		/*
		 * this loop exits only if either when we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();

X
Xiao Ni 已提交
8481 8482
		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
8483
			/((jiffies-mddev->resync_mark)/HZ +1) +1;
L
Linus Torvalds 已提交
8484

8485
		if (currspeed > speed_min(mddev)) {
8486
			if (currspeed > speed_max(mddev)) {
8487
				msleep(500);
L
Linus Torvalds 已提交
8488 8489
				goto repeat;
			}
8490 8491 8492 8493 8494 8495 8496 8497
			if (!is_mddev_idle(mddev, 0)) {
				/*
				 * Give other IO more of a chance.
				 * The faster the devices, the less we wait.
				 */
				wait_event(mddev->recovery_wait,
					   !atomic_read(&mddev->recovery_active));
			}
L
Linus Torvalds 已提交
8498 8499
		}
	}
8500 8501 8502
	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
		? "interrupted" : "done");
L
Linus Torvalds 已提交
8503 8504 8505
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
M
majianpeng 已提交
8506
	blk_finish_plug(&plug);
L
Linus Torvalds 已提交
8507 8508
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

8509 8510
	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8511
	    mddev->curr_resync > 3) {
8512 8513 8514
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	}
8515
	mddev->pers->sync_request(mddev, max_sectors, &skipped);
L
Linus Torvalds 已提交
8516

8517
	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
8518
	    mddev->curr_resync > 3) {
8519 8520 8521
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->recovery_cp) {
8522 8523
					pr_debug("md: checkpointing %s of %s.\n",
						 desc, mdname(mddev));
8524 8525 8526 8527 8528 8529 8530
					if (test_bit(MD_RECOVERY_ERROR,
						&mddev->recovery))
						mddev->recovery_cp =
							mddev->curr_resync_completed;
					else
						mddev->recovery_cp =
							mddev->curr_resync;
8531 8532 8533 8534 8535 8536
				}
			} else
				mddev->recovery_cp = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549
			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
				rcu_read_lock();
				rdev_for_each_rcu(rdev, mddev)
					if (rdev->raid_disk >= 0 &&
					    mddev->delta_disks >= 0 &&
					    !test_bit(Journal, &rdev->flags) &&
					    !test_bit(Faulty, &rdev->flags) &&
					    !test_bit(In_sync, &rdev->flags) &&
					    rdev->recovery_offset < mddev->curr_resync)
						rdev->recovery_offset = mddev->curr_resync;
				rcu_read_unlock();
			}
8550
		}
L
Linus Torvalds 已提交
8551
	}
8552
 skip:
8553 8554 8555
	/* set CHANGE_PENDING here since maybe another update is needed,
	 * so other nodes are informed. It should be harmless for normal
	 * raid */
8556 8557
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
8558

8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
			mddev->delta_disks > 0 &&
			mddev->pers->finish_reshape &&
			mddev->pers->size &&
			mddev->queue) {
		mddev_lock_nointr(mddev);
		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
		mddev_unlock(mddev);
		set_capacity(mddev->gendisk, mddev->array_sectors);
		revalidate_disk(mddev->gendisk);
	}

8572
	spin_lock(&mddev->lock);
8573 8574 8575 8576 8577 8578 8579
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		mddev->resync_min = mddev->curr_resync_completed;
8580
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
L
Linus Torvalds 已提交
8581
	mddev->curr_resync = 0;
8582 8583
	spin_unlock(&mddev->lock);

L
Linus Torvalds 已提交
8584 8585
	wake_up(&resync_wait);
	md_wakeup_thread(mddev->thread);
8586
	return;
L
Linus Torvalds 已提交
8587
}
8588
EXPORT_SYMBOL_GPL(md_do_sync);
L
Linus Torvalds 已提交
8589

8590 8591
static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this)
8592
{
8593
	struct md_rdev *rdev;
8594
	int spares = 0;
8595
	int removed = 0;
8596
	bool remove_some = false;
8597

8598 8599 8600 8601
	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		/* Mustn't remove devices when resync thread is running */
		return 0;

8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620
	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    test_bit(Faulty, &rdev->flags) &&
		    atomic_read(&rdev->nr_pending)==0) {
			/* Faulty non-Blocked devices with nr_pending == 0
			 * never get nr_pending incremented,
			 * never get Faulty cleared, and never get Blocked set.
			 * So we can synchronize_rcu now rather than once per device
			 */
			remove_some = true;
			set_bit(RemoveSynchronized, &rdev->flags);
		}
	}

	if (remove_some)
		synchronize_rcu();
	rdev_for_each(rdev, mddev) {
8621 8622
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
8623
		    !test_bit(Blocked, &rdev->flags) &&
8624
		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
S
Shaohua Li 已提交
8625 8626
		     (!test_bit(In_sync, &rdev->flags) &&
		      !test_bit(Journal, &rdev->flags))) &&
8627
		    atomic_read(&rdev->nr_pending)==0)) {
8628
			if (mddev->pers->hot_remove_disk(
8629
				    mddev, rdev) == 0) {
8630
				sysfs_unlink_rdev(mddev, rdev);
8631
				rdev->raid_disk = -1;
8632
				removed++;
8633 8634
			}
		}
8635 8636 8637 8638
		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
			clear_bit(RemoveSynchronized, &rdev->flags);
	}

8639 8640
	if (removed && mddev->kobj.sd)
		sysfs_notify(&mddev->kobj, NULL, "degraded");
8641

8642
	if (this && removed)
8643 8644
		goto no_add;

N
NeilBrown 已提交
8645
	rdev_for_each(rdev, mddev) {
8646 8647
		if (this && this != rdev)
			continue;
8648 8649
		if (test_bit(Candidate, &rdev->flags))
			continue;
8650 8651
		if (rdev->raid_disk >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
S
Shaohua Li 已提交
8652
		    !test_bit(Journal, &rdev->flags) &&
8653 8654
		    !test_bit(Faulty, &rdev->flags))
			spares++;
8655 8656 8657 8658
		if (rdev->raid_disk >= 0)
			continue;
		if (test_bit(Faulty, &rdev->flags))
			continue;
8659 8660 8661 8662 8663
		if (!test_bit(Journal, &rdev->flags)) {
			if (mddev->ro &&
			    ! (rdev->saved_raid_disk >= 0 &&
			       !test_bit(Bitmap_sync, &rdev->flags)))
				continue;
8664

8665 8666
			rdev->recovery_offset = 0;
		}
8667 8668 8669 8670
		if (mddev->pers->
		    hot_add_disk(mddev, rdev) == 0) {
			if (sysfs_link_rdev(mddev, rdev))
				/* failure here is OK */;
8671 8672
			if (!test_bit(Journal, &rdev->flags))
				spares++;
8673
			md_new_event(mddev);
8674
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8675
		}
8676
	}
8677
no_add:
8678
	if (removed)
8679
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8680 8681
	return spares;
}
8682

8683 8684 8685
static void md_start_sync(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);
8686

8687 8688 8689 8690
	mddev->sync_thread = md_register_thread(md_do_sync,
						mddev,
						"resync");
	if (!mddev->sync_thread) {
8691 8692
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
8693 8694 8695 8696 8697 8698
		/* leave the spares where they are, it shouldn't hurt */
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8699
		wake_up(&resync_wait);
8700 8701 8702 8703 8704 8705 8706 8707 8708 8709
		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
				       &mddev->recovery))
			if (mddev->sysfs_action)
				sysfs_notify_dirent_safe(mddev->sysfs_action);
	} else
		md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
}

L
Linus Torvalds 已提交
8710 8711 8712 8713 8714 8715 8716 8717 8718 8719
/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
8720
 * When the thread finishes it sets MD_RECOVERY_DONE
L
Linus Torvalds 已提交
8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731
 * and wakeups up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spares devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
8732
void md_check_recovery(struct mddev *mddev)
L
Linus Torvalds 已提交
8733
{
8734 8735 8736
	if (mddev->suspended)
		return;

8737
	if (mddev->bitmap)
8738
		bitmap_daemon_work(mddev);
L
Linus Torvalds 已提交
8739

8740
	if (signal_pending(current)) {
8741
		if (mddev->pers->sync_request && !mddev->external) {
8742 8743
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
8744 8745 8746 8747 8748
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

8749 8750
	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
L
Linus Torvalds 已提交
8751
	if ( ! (
8752
		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
L
Linus Torvalds 已提交
8753
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8754
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8755
		(mddev->external == 0 && mddev->safemode == 1) ||
8756
		(mddev->safemode == 2
8757
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
L
Linus Torvalds 已提交
8758 8759
		))
		return;
8760

8761
	if (mddev_trylock(mddev)) {
8762
		int spares = 0;
8763

8764
		if (!mddev->external && mddev->safemode == 1)
8765 8766
			mddev->safemode = 0;

8767
		if (mddev->ro) {
8768 8769 8770 8771 8772 8773 8774 8775 8776
			struct md_rdev *rdev;
			if (!mddev->external && mddev->in_sync)
				/* 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);
8777 8778 8779 8780 8781 8782
			/* On a read-only array we can:
			 * - remove failed devices
			 * - add already-in_sync devices if the array itself
			 *   is in-sync.
			 * As we only add devices that are already in-sync,
			 * we can activate the spares immediately.
8783
			 */
8784
			remove_and_add_spares(mddev, NULL);
8785 8786 8787
			/* There is no thread, but we need to call
			 * ->spare_active and clear saved_raid_disk
			 */
8788
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8789
			md_reap_sync_thread(mddev);
8790
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8791
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8792
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8793 8794 8795
			goto unlock;
		}

8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807
		if (mddev_is_clustered(mddev)) {
			struct md_rdev *rdev;
			/* kick the device if another node issued a
			 * remove disk.
			 */
			rdev_for_each(rdev, mddev) {
				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
						rdev->raid_disk < 0)
					md_kick_rdev_from_array(rdev);
			}
		}

N
NeilBrown 已提交
8808
		if (!mddev->external && !mddev->in_sync) {
8809
			spin_lock(&mddev->lock);
N
NeilBrown 已提交
8810
			set_in_sync(mddev);
8811
			spin_unlock(&mddev->lock);
8812 8813
		}

8814
		if (mddev->sb_flags)
8815
			md_update_sb(mddev, 0);
8816

L
Linus Torvalds 已提交
8817 8818 8819 8820 8821 8822 8823
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
8824
			md_reap_sync_thread(mddev);
L
Linus Torvalds 已提交
8825 8826
			goto unlock;
		}
8827 8828 8829
		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
8830
		mddev->curr_resync_completed = 0;
8831
		spin_lock(&mddev->lock);
8832
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8833
		spin_unlock(&mddev->lock);
8834 8835 8836 8837 8838
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
L
Linus Torvalds 已提交
8839

8840 8841
		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
8842
			goto not_running;
L
Linus Torvalds 已提交
8843 8844 8845
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
8846
		 * Spares are also removed and re-added, to allow
L
Linus Torvalds 已提交
8847 8848 8849
		 * the personality to fail the re-add.
		 */

8850
		if (mddev->reshape_position != MaxSector) {
8851 8852
			if (mddev->pers->check_reshape == NULL ||
			    mddev->pers->check_reshape(mddev) != 0)
8853
				/* Cannot proceed */
8854
				goto not_running;
8855
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8856
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8857
		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
8858 8859
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8860
			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8861
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8862 8863
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8864
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8865 8866
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
8867
			goto not_running;
8868

L
Linus Torvalds 已提交
8869
		if (mddev->pers->sync_request) {
8870
			if (spares) {
8871 8872 8873 8874 8875 8876
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				bitmap_write_all(mddev->bitmap);
			}
8877 8878 8879
			INIT_WORK(&mddev->del_work, md_start_sync);
			queue_work(md_misc_wq, &mddev->del_work);
			goto unlock;
L
Linus Torvalds 已提交
8880
		}
8881
	not_running:
8882 8883
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8884
			wake_up(&resync_wait);
8885 8886
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
8887
				if (mddev->sysfs_action)
N
NeilBrown 已提交
8888
					sysfs_notify_dirent_safe(mddev->sysfs_action);
8889
		}
8890 8891
	unlock:
		wake_up(&mddev->sb_wait);
L
Linus Torvalds 已提交
8892
		mddev_unlock(mddev);
8893 8894 8895 8896 8897 8898 8899 8900 8901 8902
	} else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
		/* Write superblock - thread that called mddev_suspend()
		 * holds reconfig_mutex for us.
		 */
		set_bit(MD_UPDATING_SB, &mddev->flags);
		smp_mb__after_atomic();
		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
			md_update_sb(mddev, 0);
		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
		wake_up(&mddev->sb_wait);
L
Linus Torvalds 已提交
8903 8904
	}
}
8905
EXPORT_SYMBOL(md_check_recovery);
L
Linus Torvalds 已提交
8906

8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919
void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;

	/* resync has finished, collect result */
	md_unregister_thread(&mddev->sync_thread);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify(&mddev->kobj, NULL,
				     "degraded");
8920
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8921 8922 8923 8924 8925 8926 8927
		}
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape)
		mddev->pers->finish_reshape(mddev);

	/* If array is no-longer degraded, then any saved_raid_disk
8928
	 * information must be scrapped.
8929
	 */
8930 8931
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
8932 8933 8934
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
8935
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
8936 8937 8938 8939
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);
8940
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8941
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8942 8943 8944 8945
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8946
	wake_up(&resync_wait);
8947 8948 8949 8950 8951 8952 8953
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
}
8954
EXPORT_SYMBOL(md_reap_sync_thread);
8955

8956
void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
8957
{
N
NeilBrown 已提交
8958
	sysfs_notify_dirent_safe(rdev->sysfs_state);
8959
	wait_event_timeout(rdev->blocked_wait,
8960 8961
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
8962 8963 8964 8965 8966
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980
void md_finish_reshape(struct mddev *mddev)
{
	/* called be personality module when reshape completes. */
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);
8981

8982
/* Bad block management */
8983

8984
/* Returns 1 on success, 0 on failure */
8985
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8986
		       int is_new)
8987
{
8988
	struct mddev *mddev = rdev->mddev;
8989 8990 8991 8992 8993
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
8994 8995
	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
	if (rv == 0) {
8996
		/* Make sure they get written out promptly */
8997 8998 8999
		if (test_bit(ExternalBbl, &rdev->flags))
			sysfs_notify(&rdev->kobj, NULL,
				     "unacknowledged_bad_blocks");
9000
		sysfs_notify_dirent_safe(rdev->sysfs_state);
9001 9002
		set_mask_bits(&mddev->sb_flags, 0,
			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9003
		md_wakeup_thread(rdev->mddev->thread);
9004 9005 9006
		return 1;
	} else
		return 0;
9007 9008 9009
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

9010 9011
int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			 int is_new)
9012
{
9013
	int rv;
9014 9015 9016 9017
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
9018 9019 9020 9021
	rv = badblocks_clear(&rdev->badblocks, s, sectors);
	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
	return rv;
9022 9023 9024
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);

A
Adrian Bunk 已提交
9025 9026
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
L
Linus Torvalds 已提交
9027 9028
{
	struct list_head *tmp;
9029
	struct mddev *mddev;
9030
	int need_delay = 0;
L
Linus Torvalds 已提交
9031

9032 9033
	for_each_mddev(mddev, tmp) {
		if (mddev_trylock(mddev)) {
9034 9035
			if (mddev->pers)
				__md_stop_writes(mddev);
9036 9037
			if (mddev->persistent)
				mddev->safemode = 2;
9038
			mddev_unlock(mddev);
9039
		}
9040
		need_delay = 1;
L
Linus Torvalds 已提交
9041
	}
9042 9043 9044 9045 9046 9047 9048 9049 9050
	/*
	 * certain more exotic SCSI devices are known to be
	 * volatile wrt too early system reboots. While the
	 * right place to handle this issue is the given
	 * driver, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		mdelay(1000*1);

L
Linus Torvalds 已提交
9051 9052 9053
	return NOTIFY_DONE;
}

A
Adrian Bunk 已提交
9054
static struct notifier_block md_notifier = {
L
Linus Torvalds 已提交
9055 9056 9057 9058 9059 9060 9061
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
9062
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
L
Linus Torvalds 已提交
9063

9064
	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
L
Linus Torvalds 已提交
9065 9066
}

A
Adrian Bunk 已提交
9067
static int __init md_init(void)
L
Linus Torvalds 已提交
9068
{
T
Tejun Heo 已提交
9069 9070
	int ret = -ENOMEM;

9071
	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
T
Tejun Heo 已提交
9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
		goto err_md;

	if ((ret = register_blkdev(0, "mdp")) < 0)
		goto err_mdp;
	mdp_major = ret;

9086
	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
9087 9088
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
L
Linus Torvalds 已提交
9089 9090 9091
			    md_probe, NULL, NULL);

	register_reboot_notifier(&md_notifier);
9092
	raid_table_header = register_sysctl_table(raid_root_table);
L
Linus Torvalds 已提交
9093 9094

	md_geninit();
9095
	return 0;
L
Linus Torvalds 已提交
9096

T
Tejun Heo 已提交
9097 9098 9099 9100 9101 9102 9103 9104 9105
err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}
L
Linus Torvalds 已提交
9106

9107
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9108
{
9109 9110 9111 9112
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2;
	int role, ret;
	char b[BDEVNAME_SIZE];
9113

9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125
	/*
	 * If size is changed in another node then we need to
	 * do resize as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			bitmap_update_sb(mddev->bitmap);
	}

9126 9127 9128 9129 9130 9131 9132
	/* Check for change of roles in the active devices */
	rdev_for_each(rdev2, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == 0xfffe) {
				pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
				md_kick_rdev_from_array(rdev2);
				continue;
			}
			else
				clear_bit(Candidate, &rdev2->flags);
		}

9144 9145 9146 9147 9148 9149
		if (role != rdev2->raid_disk) {
			/* got activated */
			if (rdev2->raid_disk == -1 && role != 0xffff) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %s\n",
9150
					bdevname(rdev2->bdev,b));
9151 9152 9153 9154 9155
				/* wakeup mddev->thread here, so array could
				 * perform resync with the new activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);

9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166
			}
			/* device faulty
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * one who initiated the error.
			 */
			if ((role == 0xfffe) || (role == 0xfffd)) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
9167
	}
9168

9169 9170
	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
		update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185

	/* Finally set the event to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
9186 9187 9188 9189 9190 9191 9192
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
9193 9194 9195
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
				__func__, __LINE__, rdev->desc_nr, err);
9196 9197
		if (rdev->sb_page)
			put_page(rdev->sb_page);
9198 9199 9200
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
9201 9202
	}

9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245
	sb = page_address(rdev->sb_page);
	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set
	 */

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * device In_sync and mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(rdev, mddev) {
		if (rdev->desc_nr == nr)
			break;
	}

	if (!rdev || rdev->desc_nr != nr) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdev's to update recovery_offset */
9246 9247 9248 9249
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
9250 9251 9252
}
EXPORT_SYMBOL(md_reload_sb);

L
Linus Torvalds 已提交
9253 9254 9255 9256 9257 9258
#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */
9259

9260
static DEFINE_MUTEX(detected_devices_mutex);
9261 9262 9263 9264 9265
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};
L
Linus Torvalds 已提交
9266 9267 9268

void md_autodetect_dev(dev_t dev)
{
9269 9270 9271 9272 9273
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
9274
		mutex_lock(&detected_devices_mutex);
9275
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
9276
		mutex_unlock(&detected_devices_mutex);
9277
	}
L
Linus Torvalds 已提交
9278 9279 9280 9281
}

static void autostart_arrays(int part)
{
9282
	struct md_rdev *rdev;
9283 9284 9285
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;
L
Linus Torvalds 已提交
9286

9287 9288
	i_scanned = 0;
	i_passed = 0;
L
Linus Torvalds 已提交
9289

9290
	pr_info("md: Autodetecting RAID arrays.\n");
L
Linus Torvalds 已提交
9291

9292
	mutex_lock(&detected_devices_mutex);
9293 9294 9295 9296 9297 9298 9299
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
S
Shaohua Li 已提交
9300
		mutex_unlock(&detected_devices_mutex);
9301
		rdev = md_import_device(dev,0, 90);
S
Shaohua Li 已提交
9302
		mutex_lock(&detected_devices_mutex);
L
Linus Torvalds 已提交
9303 9304 9305
		if (IS_ERR(rdev))
			continue;

N
NeilBrown 已提交
9306
		if (test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
9307
			continue;
N
NeilBrown 已提交
9308

9309
		set_bit(AutoDetected, &rdev->flags);
L
Linus Torvalds 已提交
9310
		list_add(&rdev->same_set, &pending_raid_disks);
9311
		i_passed++;
L
Linus Torvalds 已提交
9312
	}
9313
	mutex_unlock(&detected_devices_mutex);
9314

9315
	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
L
Linus Torvalds 已提交
9316 9317 9318 9319

	autorun_devices(part);
}

J
Jeff Garzik 已提交
9320
#endif /* !MODULE */
L
Linus Torvalds 已提交
9321 9322 9323

static __exit void md_exit(void)
{
9324
	struct mddev *mddev;
L
Linus Torvalds 已提交
9325
	struct list_head *tmp;
9326
	int delay = 1;
9327

9328
	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
9329
	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
L
Linus Torvalds 已提交
9330

C
Christoph Hellwig 已提交
9331
	unregister_blkdev(MD_MAJOR,"md");
L
Linus Torvalds 已提交
9332 9333 9334
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);
9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
L
Linus Torvalds 已提交
9346
	remove_proc_entry("mdstat", NULL);
9347

9348
	for_each_mddev(mddev, tmp) {
L
Linus Torvalds 已提交
9349
		export_array(mddev);
9350
		mddev->ctime = 0;
9351
		mddev->hold_active = 0;
9352 9353 9354 9355 9356 9357
		/*
		 * for_each_mddev() will call mddev_put() at the end of each
		 * iteration.  As the mddev is now fully clear, this will
		 * schedule the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
L
Linus Torvalds 已提交
9358
	}
T
Tejun Heo 已提交
9359 9360
	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_wq);
L
Linus Torvalds 已提交
9361 9362
}

9363
subsys_initcall(md_init);
L
Linus Torvalds 已提交
9364 9365
module_exit(md_exit)

9366
static int get_ro(char *buffer, const struct kernel_param *kp)
9367 9368 9369
{
	return sprintf(buffer, "%d", start_readonly);
}
9370
static int set_ro(const char *val, const struct kernel_param *kp)
9371
{
A
Alexey Dobriyan 已提交
9372
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
9373 9374
}

9375 9376
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
9377
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
9378
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
9379

L
Linus Torvalds 已提交
9380
MODULE_LICENSE("GPL");
9381
MODULE_DESCRIPTION("MD RAID framework");
9382
MODULE_ALIAS("md");
9383
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);