// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
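/*
 * Illustrative example (not from the original source, values are made up):
 * with the default limit of 20, an rdev that accumulated 12 corrected read
 * errors and then ran for two error-free hours is treated as having
 * 12 / 2 / 2 = 3 outstanding errors, so it stays well below the ejection
 * threshold.
 */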
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
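/*
 * Illustrative usage sketch (paths from the comment above, values and the
 * array name "md0" are examples only, not taken from this file):
 *
 *   # raise the guaranteed resync floor to 5 MB/s for all arrays
 *   echo 5000 > /proc/sys/dev/raid/speed_limit_min
 *
 *   # cap resync for md0 only at 100 MB/s
 *   echo 100000 > /sys/block/md0/md/sync_speed_max
 */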
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable serialization if it meets the conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}

/*
 * Init resources for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device which returns true from rdev_need_serial.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			      bool is_suspend)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!is_suspend)
		mddev_suspend(mddev);

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		goto abort;

	if (mddev->serial_info_pool == NULL) {
		unsigned int noio_flag;

		noio_flag = memalloc_noio_save();
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
		memalloc_noio_restore(noio_flag);
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}

abort:
	if (!is_suspend)
		mddev_resume(mddev);
}

/*
 * Free resources from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when bitmap is destroyed while policy is not enabled.
 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			       bool is_suspend)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		if (!is_suspend)
			mddev_suspend(mddev);
		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
		if (!is_suspend)
			mddev_resume(mddev);
	}
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{  }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it.  This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;
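/*
 * Illustrative example (assumed module parameter path, hypothetical array
 * name): with create_on_open disabled, an array can still be created
 * race-free from userspace by writing its name to the module parameter
 * mentioned above, roughly:
 *
 *   echo md_test > /sys/module/md_mod/parameters/new_array
 */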

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
			    struct mddev *mddev)
{
	if (!mddev || !bioset_initialized(&mddev->bio_set))
		return bio_alloc(gfp_mask, nr_iovecs);

	return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

static struct bio *md_bio_alloc_sync(struct mddev *mddev)
{
	if (!mddev || !bioset_initialized(&mddev->sync_set))
		return bio_alloc(GFP_NOIO, 1);

	return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
}

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
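/*
 * Illustrative userspace sketch (not part of this file, error handling
 * omitted): a monitor can wait for the event count to change by polling
 * /proc/mdstat for exceptional events after reading it once, roughly:
 *
 *   int fd = open("/proc/mdstat", O_RDONLY);
 *   read(fd, buf, sizeof(buf));
 *   struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *   poll(&pfd, 1, -1);   // returns when md_new_event() bumps the count
 */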
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
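/*
 * Illustrative use of the iterator above (hypothetical caller, not quoted
 * from this file):
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp) {
 *		pr_debug("visiting %s\n", mdname(mddev));
 *	}
 *
 * The macro takes and drops all_mddevs_lock around each step and holds a
 * reference on the current mddev while the loop body runs.
 */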

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (mddev->suspended)
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (mddev->suspend_lo >= mddev->suspend_hi)
		return false;
	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
		return false;
	if (bio_end_sector(bio) < mddev->suspend_lo)
		return false;
	return true;
}

void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	rcu_read_lock();
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	if (!mddev->pers->make_request(mddev, bio)) {
		atomic_dec(&mddev->active_io);
		wake_up(&mddev->sb_wait);
		goto check_suspended;
	}

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}
EXPORT_SYMBOL(md_handle_request);

static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	const int sgrp = op_stat_group(bio_op(bio));
	struct mddev *mddev = q->queuedata;
	unsigned int sectors;

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}

	blk_queue_split(q, &bio);

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	/* bio could be mergeable after passing to the underlying layer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);

	part_stat_lock();
	part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
	part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
	part_stat_unlock();

	return BLK_QC_T_NONE;
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (mddev->suspended++)
		return;
	synchronize_rcu();
	wake_up(&mddev->sb_wait);
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	smp_mb__after_atomic();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (--mddev->suspended)
		return;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

int mddev_congested(struct mddev *mddev, int bits)
{
	struct md_personality *pers = mddev->pers;
	int ret = 0;

	rcu_read_lock();
	if (mddev->suspended)
		ret = 1;
	else if (pers && pers->congested)
		ret = pers->congested(mddev, bits);
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	return mddev_congested(mddev, bits);
}

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bio_set_dev(bi, rdev->bdev);
			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock, because other bios passed md_handle_request suspend check
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
	mddev->last_flush = mddev->start_flush;
	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}

/*
 * Manages consolidation of flushes and submitting any flushes needed for
 * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
 * being finished in another context.  Returns false if the flushing is
 * complete but still needs the I/O portion of the bio to be processed.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_after(mddev->last_flush, start),
			    mddev->lock);
	if (!ktime_after(mddev->last_flush, start)) {
		WARN_ON(mddev->flush_bio);
		mddev->flush_bio = bio;
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			return false;
		}
	}
	return true;
}
EXPORT_SYMBOL(md_flush_request);
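/*
 * Illustrative caller pattern (a sketch, not a quote of any personality):
 * a personality's make_request method typically handles PREFLUSH first,
 * roughly:
 *
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
 *	    && md_flush_request(mddev, bio))
 *		return true;
 *
 * i.e. a true return means the bio has already been taken care of here,
 * while false means the data portion of the bio still needs to be issued.
 */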

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void mddev_put(struct mddev *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);

		/*
		 * Call queue_work inside the spinlock so that
		 * flush_workqueue() after mddev_find will succeed in waiting
		 * for the work to be done.
		 */
		INIT_WORK(&mddev->del_work, mddev_delayed_delete);
		queue_work(md_misc_wq, &mddev->del_work);
	}
	spin_unlock(&all_mddevs_lock);
}

static void md_safemode_timeout(struct timer_list *t);

void mddev_init(struct mddev *mddev)
{
	kobject_init(&mddev->kobj, &md_ktype);
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);

static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}
			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}

static struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So we set sysfs_active while the remove is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);
	spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
	return MD_NEW_SIZE_SECTORS(num_sectors);
}

static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page)
		return -ENOMEM;
	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_status) {
		pr_err("md: super_written gets error=%d\n", bio->bi_status);
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	rdev_dec_pending(rdev, mddev);
	bio_put(bio);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		   sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;
	int ff = 0;

	if (!page)
		return;

	if (test_bit(Faulty, &rdev->flags))
		return;

	bio = md_bio_alloc_sync(mddev);

	atomic_inc(&rdev->nr_pending);

	bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		ff = MD_FAILFAST;
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;

	atomic_inc(&mddev->pending_writes);
	submit_bio(bio);
}

int md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
		return -EAGAIN;
	return 0;
}

int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int op, int op_flags, bool metadata_op)
{
	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
	int ret;

	if (metadata_op && rdev->meta_bdev)
		bio_set_dev(bio, rdev->meta_bdev);
	else
		bio_set_dev(bio, rdev->bdev);
	bio_set_op_attrs(bio, op, op_flags);
	if (metadata_op)
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);

	submit_bio_wait(bio);

	ret = !bio->bi_status;
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	char b[BDEVNAME_SIZE];

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	pr_err("md: disabled device %s, could not read superblock.\n",
	       bdevname(rdev->bdev,b));
	return -EINVAL;
}

static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	pr_warn("%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret)
		return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
1216
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1217 1218

	if (sb->md_magic != MD_SB_MAGIC) {
1219
		pr_warn("md: invalid raid superblock magic on %s\n", b);
L
Linus Torvalds 已提交
1220 1221 1222 1223
		goto abort;
	}

	if (sb->major_version != 0 ||
1224 1225
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
1226 1227
		pr_warn("Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version, b);
L
Linus Torvalds 已提交
1228 1229 1230 1231 1232 1233
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

1234
	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1235
		pr_warn("md: invalid superblock checksum on %s\n", b);
L
Linus Torvalds 已提交
1236 1237 1238 1239 1240
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
1241
	rdev->new_data_offset = 0;
1242
	rdev->sb_size = MD_SB_BYTES;
1243
	rdev->badblocks.shift = -1;
L
Linus Torvalds 已提交
1244 1245 1246 1247 1248 1249

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

1250 1251 1252
	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == LEVEL_MULTIPATH ||
		(rdev->desc_nr >= 0 &&
1253
		 rdev->desc_nr < MD_SB_DISKS &&
1254 1255 1256 1257
		 sb->disks[rdev->desc_nr].state &
		 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
		spare_disk = false;

1258
	if (!refdev) {
1259
		if (!spare_disk)
1260 1261 1262
			ret = 1;
		else
			ret = 0;
1263
	} else {
L
Linus Torvalds 已提交
1264
		__u64 ev1, ev2;
1265
		mdp_super_t *refsb = page_address(refdev->sb_page);
1266
		if (!md_uuid_equal(refsb, sb)) {
1267
			pr_warn("md: %s has different UUID to %s\n",
L
Linus Torvalds 已提交
1268 1269 1270
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
1271
		if (!md_sb_equal(refsb, sb)) {
1272 1273
			pr_warn("md: %s has same UUID but different superblock to %s\n",
				b, bdevname(refdev->bdev, b2));
L
Linus Torvalds 已提交
1274 1275 1276 1277
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
1278

1279
		if (!spare_disk && ev1 > ev2)
L
Linus Torvalds 已提交
1280
			ret = 1;
1281
		else
L
Linus Torvalds 已提交
1282 1283
			ret = 0;
	}
1284
	rdev->sectors = rdev->sb_start;
1285 1286 1287 1288
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
C
Christoph Hellwig 已提交
1289
	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1290
		rdev->sectors = (sector_t)(2ULL << 32) - 2;
L
Linus Torvalds 已提交
1291

1292
	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1293 1294 1295
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

L
Linus Torvalds 已提交
1296 1297 1298 1299 1300 1301 1302
 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
1303
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
1304 1305
{
	mdp_disk_t *desc;
1306
	mdp_super_t *sb = page_address(rdev->sb_page);
1307
	__u64 ev1 = md_event(sb);
L
Linus Torvalds 已提交
1308

1309
	rdev->raid_disk = -1;
1310 1311
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
1312
	clear_bit(Bitmap_sync, &rdev->flags);
1313 1314
	clear_bit(WriteMostly, &rdev->flags);

L
Linus Torvalds 已提交
1315 1316 1317 1318
	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
1319
		mddev->external = 0;
1320
		mddev->chunk_sectors = sb->chunk_size >> 9;
L
Linus Torvalds 已提交
1321 1322 1323
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
1324
		mddev->clevel[0] = 0;
L
Linus Torvalds 已提交
1325 1326
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
1327
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1328
		mddev->events = ev1;
1329
		mddev->bitmap_info.offset = 0;
1330 1331
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
1332
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1333
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1334
		mddev->reshape_backwards = 0;
L
Linus Torvalds 已提交
1335

1336 1337 1338 1339 1340
		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
1341
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1342 1343
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
1344 1345 1346 1347 1348
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
1349
			mddev->new_chunk_sectors = mddev->chunk_sectors;
1350
		}
1351 1352
		if (mddev->level == 0)
			mddev->layout = -1;
1353

L
Linus Torvalds 已提交
1354 1355 1356
		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
1357
			if (sb->events_hi == sb->cp_events_hi &&
L
Linus Torvalds 已提交
1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369
				sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;
1370 1371

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1372
		    mddev->bitmap_info.file == NULL) {
1373 1374
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
1375
			mddev->bitmap_info.space =
1376
				mddev->bitmap_info.default_space;
1377
		}
1378

1379
	} else if (mddev->pers == NULL) {
1380 1381
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
L
Linus Torvalds 已提交
1382
		++ev1;
1383 1384
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1385
			if (ev1 < mddev->events)
1386
				return -EINVAL;
1387 1388 1389 1390 1391 1392
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
1393 1394
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
1395 1396 1397 1398 1399
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
1400

L
Linus Torvalds 已提交
1401 1402 1403 1404
	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
1405
			set_bit(Faulty, &rdev->flags);
1406 1407
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
1408
			set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
1409
			rdev->raid_disk = desc->raid_disk;
1410
			rdev->saved_raid_disk = desc->raid_disk;
1411 1412 1413 1414 1415 1416 1417 1418
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
L
Linus Torvalds 已提交
1419
		}
1420 1421
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
1422 1423
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
1424
	} else /* MULTIPATH are always insync */
1425
		set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
1426 1427 1428 1429 1430 1431
	return 0;
}

/*
 * sync_super for 0.90.0
 */
1432
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
1433 1434
{
	mdp_super_t *sb;
1435
	struct md_rdev *rdev2;
L
Linus Torvalds 已提交
1436
	int next_spare = mddev->raid_disks;
1437

L
Linus Torvalds 已提交
1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450
	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

1451 1452
	rdev->sb_size = MD_SB_BYTES;

1453
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

1466
	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
L
Linus Torvalds 已提交
1467
	sb->level = mddev->level;
A
Andre Noll 已提交
1468
	sb->size = mddev->dev_sectors / 2;
L
Linus Torvalds 已提交
1469 1470
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
1471
	sb->not_persistent = 0;
1472
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
L
Linus Torvalds 已提交
1473 1474 1475 1476
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

1477 1478 1479 1480 1481 1482 1483 1484
	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
1485
		sb->new_chunk = mddev->new_chunk_sectors << 9;
1486 1487
	}
	mddev->minor_version = sb->minor_version;
L
Linus Torvalds 已提交
1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
1499
	sb->chunk_size = mddev->chunk_sectors << 9;
L
Linus Torvalds 已提交
1500

1501
	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1502 1503
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

L
Linus Torvalds 已提交
1504
	sb->disks[0].state = (1<<MD_DISK_REMOVED);
N
NeilBrown 已提交
1505
	rdev_for_each(rdev2, mddev) {
L
Linus Torvalds 已提交
1506
		mdp_disk_t *d;
1507
		int desc_nr;
1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
1521
			desc_nr = rdev2->raid_disk;
L
Linus Torvalds 已提交
1522
		else
1523
			desc_nr = next_spare++;
1524
		rdev2->desc_nr = desc_nr;
L
Linus Torvalds 已提交
1525 1526 1527 1528 1529
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
1530
		if (is_active)
L
Linus Torvalds 已提交
1531 1532 1533
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
1534
		if (test_bit(Faulty, &rdev2->flags))
L
Linus Torvalds 已提交
1535
			d->state = (1<<MD_DISK_FAULTY);
1536
		else if (is_active) {
L
Linus Torvalds 已提交
1537
			d->state = (1<<MD_DISK_ACTIVE);
1538 1539
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
L
Linus Torvalds 已提交
1540 1541 1542 1543 1544 1545 1546
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
1547 1548
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1549 1550
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
L
Linus Torvalds 已提交
1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

1573 1574 1575 1576
/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
1577
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1578
{
A
Andre Noll 已提交
1579
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1580
		return 0; /* component must fit device */
1581
	if (rdev->mddev->bitmap_info.offset)
1582
		return 0; /* can't move bitmap */
1583
	rdev->sb_start = calc_dev_sboffset(rdev);
1584 1585
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
1586 1587 1588
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
C
Christoph Hellwig 已提交
1589
	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1590
		num_sectors = (sector_t)(2ULL << 32) - 2;
1591 1592
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1593
		       rdev->sb_page);
1594
	} while (md_super_wait(rdev->mddev) < 0);
1595
	return num_sectors;
1596 1597
}

1598 1599 1600 1601 1602 1603
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}
1604

L
Linus Torvalds 已提交
1605 1606 1607 1608
/*
 * version 1 superblock
 */

1609
static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
L
Linus Torvalds 已提交
1610
{
1611 1612
	__le32 disk_csum;
	u32 csum;
L
Linus Torvalds 已提交
1613 1614
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1615
	__le32 *isuper = (__le32*)sb;
L
Linus Torvalds 已提交
1616 1617 1618 1619

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
1620
	for (; size >= 4; size -= 4)
L
Linus Torvalds 已提交
1621 1622 1623
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
1624
		newcsum += le16_to_cpu(*(__le16*) isuper);
L
Linus Torvalds 已提交
1625 1626 1627 1628 1629 1630

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

1631
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
L
Linus Torvalds 已提交
1632 1633 1634
{
	struct mdp_superblock_1 *sb;
	int ret;
1635
	sector_t sb_start;
1636
	sector_t sectors;
L
Linus Torvalds 已提交
1637
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1638
	int bmask;
1639
	bool spare_disk = true;
L
Linus Torvalds 已提交
1640 1641

	/*
1642
	 * Calculate the position of the superblock in 512byte sectors.
L
Linus Torvalds 已提交
1643 1644 1645 1646 1647 1648 1649 1650
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
1651
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1652 1653
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
L
Linus Torvalds 已提交
1654 1655
		break;
	case 1:
1656
		sb_start = 0;
L
Linus Torvalds 已提交
1657 1658
		break;
	case 2:
1659
		sb_start = 8;
L
Linus Torvalds 已提交
1660 1661 1662 1663
		break;
	default:
		return -EINVAL;
	}
1664
	rdev->sb_start = sb_start;
L
Linus Torvalds 已提交
1665

1666 1667 1668 1669
	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
L
Linus Torvalds 已提交
1670 1671
	if (ret) return ret;

1672
	sb = page_address(rdev->sb_page);
L
Linus Torvalds 已提交
1673 1674 1675 1676

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1677
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1678
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
L
Linus Torvalds 已提交
1679 1680 1681
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1682
		pr_warn("md: invalid superblock checksum on %s\n",
L
Linus Torvalds 已提交
1683 1684 1685 1686
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
1687 1688
		pr_warn("md: data_size too small on %s\n",
			bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
1689 1690
		return -EINVAL;
	}
1691 1692 1693 1694 1695
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;
1696

L
Linus Torvalds 已提交
1697 1698
	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
1699 1700 1701 1702
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1703
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
L
Linus Torvalds 已提交
1704

1705
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1706
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1707
	if (rdev->sb_size & bmask)
1708 1709 1710
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
1711
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1712
		return -EINVAL;
1713 1714 1715
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
1716

1717 1718 1719 1720 1721
	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

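	/*
	 * Note on the bad-block log format read below: each on-disk entry is
	 * a little-endian 64-bit word holding the start sector in the upper
	 * 54 bits and a sector count (at most 0x3ff) in the lower 10 bits,
	 * both scaled up by bblog_shift.  An all-ones entry terminates the
	 * list.
	 */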
	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		__le64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, 0, true))
			return -EIO;
		bbp = (__le64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
	    sb->level != 0)
		return -EINVAL;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
		(rdev->desc_nr >= 0 &&
		rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %s has strangely different superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

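/*
 * super_1_validate() applies an already-loaded v1.x superblock to the
 * array: for the first device of a not-yet-assembled array
 * (mddev->raid_disks == 0) it copies the array-wide geometry and feature
 * flags into the mddev, and for every device it checks the event count
 * and derives the per-device role and flags from sb->dev_roles[] and
 * sb->devflags.
 */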
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (mddev->level == 0 &&
		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
			mddev->layout = -1;

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);

		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
					    MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case MD_DISK_ROLE_SPARE: /* spare */
			break;
		case MD_DISK_ROLE_FAULTY: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
			rdev->raid_disk = 0;
			break;
		default:
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else {
				/*
				 * If the array is FROZEN, then the device can't
				 * be in_sync with rest of array.
				 */
				if (!test_bit(MD_RECOVERY_FROZEN,
					      &mddev->recovery))
					set_bit(In_sync, &rdev->flags);
			}
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (sb->devflags & FailFast1)
			set_bit(FailFast, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

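/*
 * super_1_sync() regenerates the on-disk superblock image in
 * rdev->sb_page from the current in-memory mddev/rdev state.  All
 * dev_roles[] slots are first reset to "spare" and then filled in from
 * each member device; the checksum is recomputed last, so the page is
 * ready to be written out by md_update_sb().
 */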
static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
	if (test_bit(FailFast, &rdev->flags))
		sb->devflags |= FailFast1;
	else
		sb->devflags &= ~FailFast1;

	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);

	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	/* Note: recovery_offset and journal_tail share space */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
	}

	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

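	/*
	 * The bad-block list is encoded into bb_page under a seqlock read
	 * loop below; if the list changes while it is being copied, the
	 * copy is retried so a consistent snapshot goes out with the
	 * superblock.
	 */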
	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		__le64 *bbp = (__le64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

retry:
			seq = read_seqbegin(&bb->lock);

			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
				u64 internal_bb = p[i];
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}

	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	} else
		max_dev = le32_to_cpu(sb->max_dev);

	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);

	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
			sb->feature_map |=
			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
		else
			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
	}

	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (test_bit(Journal, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

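/*
 * super_1_rdev_size_change() works out how many data sectors the device
 * may provide after a resize and rewrites the superblock to match.  For
 * 1.1/1.2 metadata (superblock before the data) only the data size
 * changes; for 1.0 metadata (superblock at the end of the device) the
 * superblock itself must first be relocated to the new end of the
 * device, which is refused when an internal bitmap sits behind it.
 */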
static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = cpu_to_le64(rdev->sb_start);
	sb->sb_csum = calc_sb_1_csum(sb);
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;

}

static int
super_1_allow_new_offset(struct md_rdev *rdev,
			 unsigned long long new_offset)
{
	/* All necessary checks on new >= old have been done */
	struct bitmap *bitmap;
	if (new_offset >= rdev->data_offset)
		return 1;

	/* with 1.0 metadata, there is no metadata to tread on
	 * so we can always move back */
	if (rdev->mddev->minor_version == 0)
		return 1;

	/* otherwise we must be sure not to step on
	 * any metadata, so stay:
	 * 36K beyond start of superblock
	 * beyond end of badblocks
	 * beyond write-intent bitmap
	 */
	if (rdev->sb_start + (32+4)*2 > new_offset)
		return 0;
	bitmap = rdev->mddev->bitmap;
	if (bitmap && !rdev->mddev->bitmap_info.file &&
	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
		return 0;
	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
		return 0;

	return 1;
}
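/*
 * super_types[] is indexed by mddev->major_version and supplies the
 * metadata handlers (load, validate, sync, resize and offset checks)
 * for the 0.90 and 1.x superblock formats.
 */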
static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
		.allow_new_offset   = super_1_allow_new_offset,
	},
};

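/*
 * sync_super() lets arrays with externally managed metadata
 * (mddev->sync_super set) override the default handler; otherwise it
 * dispatches to the sync_super method for the array's metadata version.
 */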
static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
{
	if (mddev->sync_super) {
		mddev->sync_super(mddev, rdev);
		return;
	}

	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));

	super_types[mddev->major_version].sync_super(mddev, rdev);
}

static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
{
	struct md_rdev *rdev, *rdev2;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev1) {
		if (test_bit(Faulty, &rdev->flags) ||
		    test_bit(Journal, &rdev->flags) ||
		    rdev->raid_disk == -1)
			continue;
		rdev_for_each_rcu(rdev2, mddev2) {
			if (test_bit(Faulty, &rdev2->flags) ||
			    test_bit(Journal, &rdev2->flags) ||
			    rdev2->raid_disk == -1)
				continue;
			if (rdev->bdev->bd_contains ==
			    rdev2->bdev->bd_contains) {
				rcu_read_unlock();
				return 1;
			}
		}
	}
	rcu_read_unlock();
	return 0;
}

static LIST_HEAD(pending_raid_disks);

/*
 * Try to register data integrity profile for an mddev
 *
 * This is called when an array is started and after a disk has been kicked
 * from the array. It only succeeds if all working and active component devices
 * are integrity capable with matching profiles.
 */
int md_integrity_register(struct mddev *mddev)
{
	struct md_rdev *rdev, *reference = NULL;

	if (list_empty(&mddev->disks))
		return 0; /* nothing to do */
	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
		return 0; /* shouldn't register, or already is */
	rdev_for_each(rdev, mddev) {
		/* skip spares and non-functional disks */
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (rdev->raid_disk < 0)
			continue;
		if (!reference) {
			/* Use the first rdev as the reference */
			reference = rdev;
			continue;
		}
		/* does this rdev's profile match the reference profile? */
		if (blk_integrity_compare(reference->bdev->bd_disk,
				rdev->bdev->bd_disk) < 0)
			return -EINVAL;
	}
	if (!reference || !bdev_get_integrity(reference->bdev))
		return 0;
	/*
	 * All component devices are integrity capable and have matching
	 * profiles, register the common profile for the md device.
	 */
	blk_integrity_register(mddev->gendisk,
			       bdev_get_integrity(reference->bdev));

	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
		pr_err("md: failed to create integrity pool for %s\n",
		       mdname(mddev));
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(md_integrity_register);

/*
 * Attempt to add an rdev, but only if it is consistent with the current
 * integrity profile
 */
int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	struct blk_integrity *bi_mddev;
	char name[BDEVNAME_SIZE];

	if (!mddev->gendisk)
		return 0;

	bi_mddev = blk_get_integrity(mddev->gendisk);

	if (!bi_mddev) /* nothing to do */
		return 0;

	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
		pr_err("%s: incompatible integrity profile for %s\n",
		       mdname(mddev), bdevname(rdev->bdev, name));
		return -ENXIO;
	}

	return 0;
}
EXPORT_SYMBOL(md_integrity_add_rdev);

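/*
 * bind_rdev_to_array() attaches an rdev to an mddev: it rejects
 * duplicates, read-only devices on a running array and devices that are
 * too small, assigns a unique desc_nr, creates the sysfs "dev-xxx"
 * object and "block" link, and finally adds the rdev to the array's
 * device list under RCU.
 */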
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
	char b[BDEVNAME_SIZE];
	struct kobject *ko;
	int err;

	/* prevent duplicates */
	if (find_rdev(mddev, rdev->bdev->bd_dev))
		return -EEXIST;

	if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
	    mddev->pers)
		return -EROFS;

	/* make sure rdev->sectors exceeds mddev->dev_sectors */
	if (!test_bit(Journal, &rdev->flags) &&
	    rdev->sectors &&
	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear)
			 */
			if (mddev->level > 0)
				return -ENOSPC;
		} else
			mddev->dev_sectors = rdev->sectors;
	}

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	rcu_read_lock();
	if (rdev->desc_nr < 0) {
		int choice = 0;
		if (mddev->pers)
			choice = mddev->raid_disks;
		while (md_find_rdev_nr_rcu(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
			rcu_read_unlock();
			return -EBUSY;
		}
	}
	rcu_read_unlock();
	if (!test_bit(Journal, &rdev->flags) &&
	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
		pr_warn("md: %s: array is limited to %d devices\n",
			mdname(mddev), mddev->max_disks);
		return -EBUSY;
	}
	bdevname(rdev->bdev,b);
	strreplace(b, '/', '!');

	rdev->mddev = mddev;
	pr_debug("md: bind<%s>\n", b);

	if (mddev->raid_disks)
		mddev_create_serial_pool(mddev, rdev, false);

	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
		goto fail;

	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
	if (sysfs_create_link(&rdev->kobj, ko, "block"))
		/* failure here is OK */;
	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");

	list_add_rcu(&rdev->same_set, &mddev->disks);
	bd_link_disk_holder(rdev->bdev, mddev->gendisk);

	/* May as well allow recovery to be retried once */
	mddev->recovery_disabled++;

	return 0;

 fail:
	pr_warn("md: failed to register dev-%s for %s\n",
		b, mdname(mddev));
	return err;
}

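/*
 * Deleting the rdev kobject is deferred to a workqueue (see
 * unbind_rdev_from_array() below): the removal may be triggered by a
 * write to the rdev's own sysfs "state" file, and RCU readers may still
 * be traversing the device list, so the kobject must not be torn down
 * synchronously.
 */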
static void md_delayed_delete(struct work_struct *ws)
{
	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
	kobject_del(&rdev->kobj);
	kobject_put(&rdev->kobj);
}

static void unbind_rdev_from_array(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
	mddev_destroy_serial_pool(rdev->mddev, rdev, false);
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	rdev->sysfs_state = NULL;
	rdev->badblocks.count = 0;
	/* We need to delay this, otherwise we can deadlock when
	 * writing to 'remove' to "dev/state".  We also need
	 * to delay it due to rcu usage.
	 */
	synchronize_rcu();
	INIT_WORK(&rdev->del_work, md_delayed_delete);
	kobject_get(&rdev->kobj);
	queue_work(md_misc_wq, &rdev->del_work);
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
{
	int err = 0;
	struct block_device *bdev;
	char b[BDEVNAME_SIZE];

	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				 shared ? (struct md_rdev *)lock_rdev : rdev);
	if (IS_ERR(bdev)) {
		pr_warn("md: could not open %s.\n", __bdevname(dev, b));
		return PTR_ERR(bdev);
	}
	rdev->bdev = bdev;
	return err;
}

static void unlock_rdev(struct md_rdev *rdev)
{
	struct block_device *bdev = rdev->bdev;
	rdev->bdev = NULL;
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
	md_rdev_clear(rdev);
#ifndef MODULE
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	unlock_rdev(rdev);
	kobject_put(&rdev->kobj);
}

void md_kick_rdev_from_array(struct md_rdev *rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}
EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);

static void export_array(struct mddev *mddev)
{
	struct md_rdev *rdev;

	while (!list_empty(&mddev->disks)) {
		rdev = list_first_entry(&mddev->disks, struct md_rdev,
					same_set);
		md_kick_rdev_from_array(rdev);
	}
	mddev->raid_disks = 0;
	mddev->major_version = 0;
}

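/*
 * set_in_sync() must be called with mddev->lock held.  It briefly drops
 * the lock while switching writes_pending to atomic mode so the counter
 * can be read reliably; only if no writes are pending is the array
 * marked clean and a superblock update flagged.
 */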
static bool set_in_sync(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->lock);
	if (!mddev->in_sync) {
		mddev->sync_checkers++;
		spin_unlock(&mddev->lock);
		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
		spin_lock(&mddev->lock);
		if (!mddev->in_sync &&
		    percpu_ref_is_zero(&mddev->writes_pending)) {
			mddev->in_sync = 1;
			/*
			 * Ensure ->in_sync is visible before we clear
			 * ->sync_checkers.
			 */
			smp_mb();
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_state);
		}
		if (--mddev->sync_checkers == 0)
			percpu_ref_switch_to_percpu(&mddev->writes_pending);
	}
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	return mddev->in_sync;
}

static void sync_sbs(struct mddev *mddev, int nospares)
{
	/* Update each superblock (in-memory image), but
	 * if we are allowed to, skip spares which already
	 * have the right event counter, or have one earlier
	 * (which would mean they aren't being marked as dirty
	 * with the rest of the array)
	 */
	struct md_rdev *rdev;
	rdev_for_each(rdev, mddev) {
		if (rdev->sb_events == mddev->events ||
		    (nospares &&
		     rdev->raid_disk < 0 &&
		     rdev->sb_events+1 == mddev->events)) {
			/* Don't update this superblock */
			rdev->sb_loaded = 2;
		} else {
			sync_super(mddev, rdev);
			rdev->sb_loaded = 1;
		}
	}
}

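/*
 * For clustered arrays another node may already have written an updated
 * superblock.  This helper compares the on-disk image of one healthy
 * member against the in-memory state (device roles and array geometry)
 * and returns false when nothing would change, letting md_update_sb()
 * skip a redundant metadata update.
 */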
static bool does_sb_need_changing(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct mdp_superblock_1 *sb;
	int role;

	/* Find a good rdev */
	rdev_for_each(rdev, mddev)
		if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
			break;

	/* No good device found. */
	if (!rdev)
		return false;

	sb = page_address(rdev->sb_page);
	/* Check if a device has become faulty or a spare become active */
	rdev_for_each(rdev, mddev) {
		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		/* Device activated? */
		if (role == 0xffff && rdev->raid_disk >=0 &&
		    !test_bit(Faulty, &rdev->flags))
			return true;
		/* Device turned faulty? */
		if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
			return true;
	}

	/* Check if any mddev parameters have changed */
	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
	    (mddev->layout != le32_to_cpu(sb->layout)) ||
	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
		return true;

	return false;
}

void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;
	int ret = -1;

	if (mddev->ro) {
		if (force_change)
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		return;
	}

repeat:
	if (mddev_is_clustered(mddev)) {
		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
			force_change = 1;
		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
			nospares = 1;
		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else updated the sb? */
		if (!does_sb_need_changing(mddev)) {
			if (ret == 0)
				md_cluster_ops->metadata_update_cancel(mddev);
			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
							 BIT(MD_SB_CHANGE_DEVS) |
							 BIT(MD_SB_CHANGE_CLEAN));
			return;
		}
	}

	/*
	 * First make sure individual recovery_offsets are correct
	 * curr_resync_completed can only be used during recovery.
	 * During reshape/resync it might use array-addresses rather
	 * that device addresses.
	 */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
				rdev->recovery_offset = mddev->curr_resync_completed;

	}
	if (!mddev->persistent) {
		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		if (!mddev->external) {
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			rdev_for_each(rdev, mddev) {
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock(&mddev->lock);

	mddev->utime = ktime_get_real_seconds();

	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
		force_change = 1;
	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean <-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have an event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
		 */
		nospares = 0;

	sync_req = mddev->in_sync;

	/* If this is just a dirty<->clean transition, and the array is clean
	 * and 'events' is odd, we can roll back to the previous clean state */
	if (nospares
	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
	    && mddev->can_decrease_events
	    && mddev->events != 1) {
		mddev->events--;
		mddev->can_decrease_events = 0;
	} else {
		/* otherwise we have to go forward and ... */
		mddev->events ++;
		mddev->can_decrease_events = nospares;
	}

	/*
	 * This 64-bit counter should never wrap.
	 * Either we are in around ~1 trillion A.C., assuming
	 * 1 reboot per second, or we have a bug...
	 */
	WARN_ON(mddev->events == 0);

	rdev_for_each(rdev, mddev) {
		if (rdev->badblocks.changed)
			any_badblocks_changed++;
		if (test_bit(Faulty, &rdev->flags))
			set_bit(FaultRecorded, &rdev->flags);
	}

	sync_sbs(mddev, nospares);
	spin_unlock(&mddev->lock);

	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
		 mdname(mddev), mddev->in_sync);

	if (mddev->queue)
		blk_add_trace_msg(mddev->queue, "md md_update_sb");
rewrite:
	md_bitmap_update_sb(mddev->bitmap);
	rdev_for_each(rdev, mddev) {
		char b[BDEVNAME_SIZE];

		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */

		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev,rdev,
				       rdev->sb_start, rdev->sb_size,
				       rdev->sb_page);
			pr_debug("md: (write) %s's sb offset: %llu\n",
				 bdevname(rdev->bdev, b),
				 (unsigned long long)rdev->sb_start);
			rdev->sb_events = mddev->events;
			if (rdev->badblocks.size) {
				md_super_write(mddev, rdev,
					       rdev->badblocks.sector,
					       rdev->badblocks.size << 9,
					       rdev->bb_page);
				rdev->badblocks.size = 0;
			}

		} else
			pr_debug("md: %s (skipping faulty)\n",
				 bdevname(rdev->bdev, b));

		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	if (md_super_wait(mddev) < 0)
		goto rewrite;
	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */

	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->metadata_update_finish(mddev);

	if (mddev->in_sync != sync_req ||
	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
		/* have to write it out again */
		goto repeat;
	wake_up(&mddev->sb_wait);
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");

	rdev_for_each(rdev, mddev) {
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
			ack_all_badblocks(&rdev->badblocks);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
	}
}
2830
EXPORT_SYMBOL(md_update_sb);

G
Goldwyn Rodrigues 已提交
2832 2833 2834 2835
static int add_bound_rdev(struct md_rdev *rdev)
{
	struct mddev *mddev = rdev->mddev;
	int err = 0;
	bool add_journal = test_bit(Journal, &rdev->flags);

	if (!mddev->pers->hot_remove_disk || add_journal) {
		/* If there is hot_add_disk but no hot_remove_disk
		 * then added disks for geometry changes,
		 * and should be added immediately.
		 */
		super_types[mddev->major_version].
			validate_super(mddev, rdev);
		if (add_journal)
			mddev_suspend(mddev);
		err = mddev->pers->hot_add_disk(mddev, rdev);
		if (add_journal)
			mddev_resume(mddev);
		if (err) {
			md_kick_rdev_from_array(rdev);
			return err;
		}
	}
	sysfs_notify_dirent_safe(rdev->sysfs_state);

	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_new_event(mddev);
	md_wakeup_thread(mddev->thread);
	return 0;
}
L
Linus Torvalds 已提交
2865

/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* See if cmd, written into a sysfs file, matches
	 * str.  They must either be the same, or cmd can
	 * have a trailing newline
	 */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	if (*str || *cmd)
		return 0;
	return 1;
}

2886 2887
struct rdev_sysfs_entry {
	struct attribute attr;
2888 2889
	ssize_t (*show)(struct md_rdev *, char *);
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2890 2891 2892
};

static ssize_t
2893
state_show(struct md_rdev *rdev, char *page)
2894
{
2895
	char *sep = ",";
2896
	size_t len = 0;
2897
	unsigned long flags = READ_ONCE(rdev->flags);
2898

2899
	if (test_bit(Faulty, &flags) ||
2900 2901
	    (!test_bit(ExternalBbl, &flags) &&
	    rdev->badblocks.unacked_exist))
2902 2903 2904 2905 2906 2907 2908
		len += sprintf(page+len, "faulty%s", sep);
	if (test_bit(In_sync, &flags))
		len += sprintf(page+len, "in_sync%s", sep);
	if (test_bit(Journal, &flags))
		len += sprintf(page+len, "journal%s", sep);
	if (test_bit(WriteMostly, &flags))
		len += sprintf(page+len, "write_mostly%s", sep);
2909
	if (test_bit(Blocked, &flags) ||
2910
	    (rdev->badblocks.unacked_exist
2911 2912
	     && !test_bit(Faulty, &flags)))
		len += sprintf(page+len, "blocked%s", sep);
2913
	if (!test_bit(Faulty, &flags) &&
S
Shaohua Li 已提交
2914
	    !test_bit(Journal, &flags) &&
2915 2916 2917 2918 2919 2920 2921 2922 2923 2924
	    !test_bit(In_sync, &flags))
		len += sprintf(page+len, "spare%s", sep);
	if (test_bit(WriteErrorSeen, &flags))
		len += sprintf(page+len, "write_error%s", sep);
	if (test_bit(WantReplacement, &flags))
		len += sprintf(page+len, "want_replacement%s", sep);
	if (test_bit(Replacement, &flags))
		len += sprintf(page+len, "replacement%s", sep);
	if (test_bit(ExternalBbl, &flags))
		len += sprintf(page+len, "external_bbl%s", sep);
2925 2926
	if (test_bit(FailFast, &flags))
		len += sprintf(page+len, "failfast%s", sep);
2927 2928 2929

	if (len)
		len -= strlen(sep);
2930

2931 2932 2933
	return len+sprintf(page+len, "\n");
}

2934
static ssize_t
2935
state_store(struct md_rdev *rdev, const char *buf, size_t len)
2936 2937
{
	/* can write
2938
	 *  faulty  - simulates an error
2939
	 *  remove  - disconnects the device
2940 2941
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
2942 2943
	 *  blocked - sets the Blocked flags
	 *  -blocked - clears the Blocked and possibly simulates an error
2944
	 *  insync - sets Insync providing device isn't active
2945 2946
	 *  -insync - clear Insync for a device with a slot assigned,
	 *            so that it gets rebuilt based on bitmap
2947 2948
	 *  write_error - sets WriteErrorSeen
	 *  -write_error - clears WriteErrorSeen
2949
	 *  {,-}failfast - set/clear FailFast
2950 2951 2952 2953
	 */
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
2954 2955 2956 2957
		if (test_bit(Faulty, &rdev->flags))
			err = 0;
		else
			err = -EBUSY;
2958
	} else if (cmd_match(buf, "remove")) {
S
Shaohua Li 已提交
2959 2960 2961 2962
		if (rdev->mddev->pers) {
			clear_bit(Blocked, &rdev->flags);
			remove_and_add_spares(rdev->mddev, rdev);
		}
2963 2964 2965
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
2966
			struct mddev *mddev = rdev->mddev;
2967
			err = 0;
2968 2969 2970 2971 2972
			if (mddev_is_clustered(mddev))
				err = md_cluster_ops->remove_disk(mddev, rdev);

			if (err == 0) {
				md_kick_rdev_from_array(rdev);
2973
				if (mddev->pers) {
2974
					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2975 2976
					md_wakeup_thread(mddev->thread);
				}
2977 2978
				md_new_event(mddev);
			}
2979
		}
2980 2981
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
G
Guoqing Jiang 已提交
2982
		mddev_create_serial_pool(rdev->mddev, rdev, false);
2983 2984
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
2985
		mddev_destroy_serial_pool(rdev->mddev, rdev, false);
2986
		clear_bit(WriteMostly, &rdev->flags);
2987 2988 2989 2990 2991
		err = 0;
	} else if (cmd_match(buf, "blocked")) {
		set_bit(Blocked, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-blocked")) {
2992
		if (!test_bit(Faulty, &rdev->flags) &&
2993
		    !test_bit(ExternalBbl, &rdev->flags) &&
2994
		    rdev->badblocks.unacked_exist) {
2995 2996 2997 2998 2999
			/* metadata handler doesn't understand badblocks,
			 * so we need to fail the device
			 */
			md_error(rdev->mddev, rdev);
		}
3000
		clear_bit(Blocked, &rdev->flags);
3001
		clear_bit(BlockedBadBlocks, &rdev->flags);
3002 3003 3004 3005
		wake_up(&rdev->blocked_wait);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);

3006 3007 3008
		err = 0;
	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
		set_bit(In_sync, &rdev->flags);
3009
		err = 0;
3010 3011 3012 3013 3014 3015
	} else if (cmd_match(buf, "failfast")) {
		set_bit(FailFast, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-failfast")) {
		clear_bit(FailFast, &rdev->flags);
		err = 0;
S
Shaohua Li 已提交
3016 3017
	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
		   !test_bit(Journal, &rdev->flags)) {
3018 3019 3020 3021 3022 3023
		if (rdev->mddev->pers == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->saved_raid_disk = rdev->raid_disk;
			rdev->raid_disk = -1;
			err = 0;
		}
3024 3025 3026 3027 3028 3029
	} else if (cmd_match(buf, "write_error")) {
		set_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-write_error")) {
		clear_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
3030 3031 3032 3033 3034 3035
	} else if (cmd_match(buf, "want_replacement")) {
		/* Any non-spare device that is not a replacement can
		 * become want_replacement at any time, but we then need to
		 * check if recovery is needed.
		 */
		if (rdev->raid_disk >= 0 &&
S
Shaohua Li 已提交
3036
		    !test_bit(Journal, &rdev->flags) &&
3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066
		    !test_bit(Replacement, &rdev->flags))
			set_bit(WantReplacement, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
		err = 0;
	} else if (cmd_match(buf, "-want_replacement")) {
		/* Clearing 'want_replacement' is always allowed.
		 * Once replacements starts it is too late though.
		 */
		err = 0;
		clear_bit(WantReplacement, &rdev->flags);
	} else if (cmd_match(buf, "replacement")) {
		/* Can only set a device as a replacement when array has not
		 * yet been started.  Once running, replacement is automatic
		 * from spares, or by assigning 'slot'.
		 */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			set_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "-replacement")) {
		/* Similarly, can only clear Replacement before start */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			clear_bit(Replacement, &rdev->flags);
			err = 0;
		}
G
Goldwyn Rodrigues 已提交
3067
	} else if (cmd_match(buf, "re-add")) {
3068 3069 3070 3071
		if (!rdev->mddev->pers)
			err = -EINVAL;
		else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
				rdev->saved_raid_disk >= 0) {
3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082
			/* clear_bit is performed _after_ all the devices
			 * have their local Faulty bit cleared. If any writes
			 * happen in the meantime in the local node, they
			 * will land in the local bitmap, which will be synced
			 * by this node eventually
			 */
			if (!mddev_is_clustered(rdev->mddev) ||
			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
				clear_bit(Faulty, &rdev->flags);
				err = add_bound_rdev(rdev);
			}
G
Goldwyn Rodrigues 已提交
3083 3084
		} else
			err = -EBUSY;
3085 3086 3087 3088 3089 3090 3091
	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
		set_bit(ExternalBbl, &rdev->flags);
		rdev->badblocks.shift = 0;
		err = 0;
	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
		clear_bit(ExternalBbl, &rdev->flags);
		err = 0;
3092
	}
N
NeilBrown 已提交
3093 3094
	if (!err)
		sysfs_notify_dirent_safe(rdev->sysfs_state);
3095 3096
	return err ? err : len;
}
3097
static struct rdev_sysfs_entry rdev_state =
3098
__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3099

3100
static ssize_t
3101
errors_show(struct md_rdev *rdev, char *page)
3102 3103 3104 3105 3106
{
	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
}

static ssize_t
3107
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3108
{
A
Alexey Dobriyan 已提交
3109 3110 3111 3112 3113 3114 3115 3116
	unsigned int n;
	int rv;

	rv = kstrtouint(buf, 10, &n);
	if (rv < 0)
		return rv;
	atomic_set(&rdev->corrected_errors, n);
	return len;
3117 3118
}
static struct rdev_sysfs_entry rdev_errors =
3119
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3120

3121
static ssize_t
3122
slot_show(struct md_rdev *rdev, char *page)
3123
{
S
Shaohua Li 已提交
3124 3125 3126
	if (test_bit(Journal, &rdev->flags))
		return sprintf(page, "journal\n");
	else if (rdev->raid_disk < 0)
3127 3128 3129 3130 3131 3132
		return sprintf(page, "none\n");
	else
		return sprintf(page, "%d\n", rdev->raid_disk);
}

static ssize_t
3133
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3134
{
A
Alexey Dobriyan 已提交
3135
	int slot;
3136
	int err;
A
Alexey Dobriyan 已提交
3137

S
Shaohua Li 已提交
3138 3139
	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
3140 3141
	if (strncmp(buf, "none", 4)==0)
		slot = -1;
A
Alexey Dobriyan 已提交
3142 3143 3144 3145 3146
	else {
		err = kstrtouint(buf, 10, (unsigned int *)&slot);
		if (err < 0)
			return err;
	}
3147
	if (rdev->mddev->pers && slot == -1) {
3148 3149 3150 3151 3152 3153 3154 3155 3156 3157
		/* Setting 'slot' on an active array requires also
		 * updating the 'rd%d' link, and communicating
		 * with the personality with ->hot_*_disk.
		 * For now we only support removing
		 * failed/spare devices.  This normally happens automatically,
		 * but not when the metadata is externally managed.
		 */
		if (rdev->raid_disk == -1)
			return -EEXIST;
		/* personality does all needed checks */
3158
		if (rdev->mddev->pers->hot_remove_disk == NULL)
3159
			return -EINVAL;
3160 3161 3162 3163
		clear_bit(Blocked, &rdev->flags);
		remove_and_add_spares(rdev->mddev, rdev);
		if (rdev->raid_disk >= 0)
			return -EBUSY;
3164 3165
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
3166 3167
	} else if (rdev->mddev->pers) {
		/* Activating a spare .. or possibly reactivating
3168
		 * if we ever get bitmaps working here.
3169
		 */
3170
		int err;
3171 3172 3173 3174

		if (rdev->raid_disk != -1)
			return -EBUSY;

3175 3176 3177
		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
			return -EBUSY;

3178 3179 3180
		if (rdev->mddev->pers->hot_add_disk == NULL)
			return -EINVAL;

3181 3182 3183 3184
		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
			return -ENOSPC;

3185 3186 3187 3188 3189
		rdev->raid_disk = slot;
		if (test_bit(In_sync, &rdev->flags))
			rdev->saved_raid_disk = slot;
		else
			rdev->saved_raid_disk = -1;
3190
		clear_bit(In_sync, &rdev->flags);
3191
		clear_bit(Bitmap_sync, &rdev->flags);
3192 3193 3194 3195 3196 3197 3198 3199 3200
		err = rdev->mddev->pers->
			hot_add_disk(rdev->mddev, rdev);
		if (err) {
			rdev->raid_disk = -1;
			return err;
		} else
			sysfs_notify_dirent_safe(rdev->sysfs_state);
		if (sysfs_link_rdev(rdev->mddev, rdev))
			/* failure here is OK */;
3201
		/* don't wakeup anyone, leave that to userspace. */
3202
	} else {
3203 3204
		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3205 3206 3207
			return -ENOSPC;
		rdev->raid_disk = slot;
		/* assume it is working */
3208 3209
		clear_bit(Faulty, &rdev->flags);
		clear_bit(WriteMostly, &rdev->flags);
3210
		set_bit(In_sync, &rdev->flags);
N
NeilBrown 已提交
3211
		sysfs_notify_dirent_safe(rdev->sysfs_state);
3212
	}
3213 3214 3215 3216
	return len;
}

static struct rdev_sysfs_entry rdev_slot =
3217
__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3218

3219
static ssize_t
3220
offset_show(struct md_rdev *rdev, char *page)
3221
{
3222
	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3223 3224 3225
}

static ssize_t
3226
offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3227
{
3228
	unsigned long long offset;
3229
	if (kstrtoull(buf, 10, &offset) < 0)
3230
		return -EINVAL;
3231
	if (rdev->mddev->pers && rdev->raid_disk >= 0)
3232
		return -EBUSY;
3233
	if (rdev->sectors && rdev->mddev->external)
3234 3235 3236
		/* Must set offset before size, so overlap checks
		 * can be sane */
		return -EBUSY;
3237
	rdev->data_offset = offset;
3238
	rdev->new_data_offset = offset;
3239 3240 3241 3242
	return len;
}

static struct rdev_sysfs_entry rdev_offset =
3243
__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3244

3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256
static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)rdev->new_data_offset);
}

static ssize_t new_offset_store(struct md_rdev *rdev,
				const char *buf, size_t len)
{
	unsigned long long new_offset;
	struct mddev *mddev = rdev->mddev;

3257
	if (kstrtoull(buf, 10, &new_offset) < 0)
3258 3259
		return -EINVAL;

3260 3261
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302
		return -EBUSY;
	if (new_offset == rdev->data_offset)
		/* reset is always permitted */
		;
	else if (new_offset > rdev->data_offset) {
		/* must not push array size beyond rdev_sectors */
		if (new_offset - rdev->data_offset
		    + mddev->dev_sectors > rdev->sectors)
				return -E2BIG;
	}
	/* Metadata worries about other space details. */

	/* decreasing the offset is inconsistent with a backwards
	 * reshape.
	 */
	if (new_offset < rdev->data_offset &&
	    mddev->reshape_backwards)
		return -EINVAL;
	/* Increasing offset is inconsistent with forwards
	 * reshape.  reshape_direction should be set to
	 * 'backwards' first.
	 */
	if (new_offset > rdev->data_offset &&
	    !mddev->reshape_backwards)
		return -EINVAL;

	if (mddev->pers && mddev->persistent &&
	    !super_types[mddev->major_version]
	    .allow_new_offset(rdev, new_offset))
		return -E2BIG;
	rdev->new_data_offset = new_offset;
	if (new_offset > rdev->data_offset)
		mddev->reshape_backwards = 1;
	else if (new_offset < rdev->data_offset)
		mddev->reshape_backwards = 0;

	return len;
}
static struct rdev_sysfs_entry rdev_new_offset =
__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);

static ssize_t
rdev_size_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
}

static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
{
	/* check if two start/length pairs overlap */
	if (s1+l1 <= s2)
		return 0;
	if (s2+l2 <= s1)
		return 0;
	return 1;
}

static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
{
	unsigned long long blocks;
	sector_t new;

	if (kstrtoull(buf, 10, &blocks) < 0)
		return -EINVAL;

	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
		return -EINVAL; /* sector conversion overflow */

	new = blocks * 2;
	if (new != blocks * 2)
		return -EINVAL; /* unsigned long long to sector_t overflow */

	*sectors = new;
	return 0;
}
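/* Illustrative example (not from the original source): writing "1024" to a
 * size attribute parsed with strict_blocks_to_sectors() means 1024 KiB
 * blocks, i.e. 1024 * 2 = 2048 sectors of 512 bytes, provided neither the
 * multiplication nor the sector_t assignment overflows.
 */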

static ssize_t
rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	struct mddev *my_mddev = rdev->mddev;
	sector_t oldsectors = rdev->sectors;
	sector_t sectors;

	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
	if (strict_blocks_to_sectors(buf, &sectors) < 0)
		return -EINVAL;
	if (rdev->data_offset != rdev->new_data_offset)
		return -EINVAL; /* too confusing */
	if (my_mddev->pers && rdev->raid_disk >= 0) {
		if (my_mddev->persistent) {
			sectors = super_types[my_mddev->major_version].
				rdev_size_change(rdev, sectors);
			if (!sectors)
				return -EBUSY;
		} else if (!sectors)
			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
				rdev->data_offset;
		if (!my_mddev->pers->resize)
			/* Cannot change size for RAID0 or Linear etc */
			return -EINVAL;
	}
	if (sectors < my_mddev->dev_sectors)
		return -EINVAL; /* component must fit device */

	rdev->sectors = sectors;
	if (sectors > oldsectors && my_mddev->external) {
		/* Need to check that all other rdevs with the same
		 * ->bdev do not overlap.  'rcu' is sufficient to walk
		 * the rdev lists safely.
		 * This check does not provide a hard guarantee, it
		 * just helps avoid dangerous mistakes.
		 */
		struct mddev *mddev;
		int overlap = 0;
		struct list_head *tmp;

		rcu_read_lock();
		for_each_mddev(mddev, tmp) {
			struct md_rdev *rdev2;

			rdev_for_each(rdev2, mddev)
				if (rdev->bdev == rdev2->bdev &&
				    rdev != rdev2 &&
				    overlaps(rdev->data_offset, rdev->sectors,
					     rdev2->data_offset,
					     rdev2->sectors)) {
					overlap = 1;
					break;
				}
			if (overlap) {
				mddev_put(mddev);
				break;
			}
		}
		rcu_read_unlock();
		if (overlap) {
			/* Someone else could have slipped in a size
			 * change here, but doing so is just silly.
			 * We put oldsectors back because we *know* it is
			 * safe, and trust userspace not to race with
			 * itself
			 */
			rdev->sectors = oldsectors;
			return -EBUSY;
		}
	}
	return len;
}

static struct rdev_sysfs_entry rdev_size =
__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);

static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
{
	unsigned long long recovery_start = rdev->recovery_offset;

	if (test_bit(In_sync, &rdev->flags) ||
	    recovery_start == MaxSector)
		return sprintf(page, "none\n");

	return sprintf(page, "%llu\n", recovery_start);
}

static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned long long recovery_start;

	if (cmd_match(buf, "none"))
		recovery_start = MaxSector;
	else if (kstrtoull(buf, 10, &recovery_start))
		return -EINVAL;

	if (rdev->mddev->pers &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	rdev->recovery_offset = recovery_start;
	if (recovery_start == MaxSector)
		set_bit(In_sync, &rdev->flags);
	else
		clear_bit(In_sync, &rdev->flags);
	return len;
}

static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);

/* sysfs access to bad-blocks list.
 * We present two files.
 * 'bad-blocks' lists sector numbers and lengths of ranges that
 *    are recorded as bad.  The list is truncated to fit within
 *    the one-page limit of sysfs.
 *    Writing "sector length" to this file adds an acknowledged
 *    bad block.
 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
 *    been acknowledged.  Writing to this file adds bad blocks
 *    without acknowledging them.  This is largely for testing.
 */
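/* Example (illustrative only; the paths depend on the array and member
 * device):
 *   echo "2048 64" > /sys/block/md0/md/dev-sdb1/bad_blocks
 * records a 64-sector bad range starting at sector 2048 as acknowledged,
 * while the same write to unacknowledged_bad_blocks records it without
 * the acknowledged flag.
 */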
static ssize_t bb_show(struct md_rdev *rdev, char *page)
{
	return badblocks_show(&rdev->badblocks, page, 0);
}
static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
{
	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
	/* Maybe that ack was all we needed */
	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
		wake_up(&rdev->blocked_wait);
	return rv;
}
static struct rdev_sysfs_entry rdev_bad_blocks =
__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);

static ssize_t ubb_show(struct md_rdev *rdev, char *page)
{
	return badblocks_show(&rdev->badblocks, page, 1);
}
static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
{
	return badblocks_store(&rdev->badblocks, page, len, 1);
}
static struct rdev_sysfs_entry rdev_unack_bad_blocks =
__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);

static ssize_t
ppl_sector_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
}

static ssize_t
ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned long long sector;

	if (kstrtoull(buf, 10, &sector) < 0)
		return -EINVAL;
	if (sector != (sector_t)sector)
		return -EINVAL;

	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	if (rdev->mddev->persistent) {
		if (rdev->mddev->major_version == 0)
			return -EINVAL;
		if ((sector > rdev->sb_start &&
		     sector - rdev->sb_start > S16_MAX) ||
		    (sector < rdev->sb_start &&
		     rdev->sb_start - sector > -S16_MIN))
			return -EINVAL;
		rdev->ppl.offset = sector - rdev->sb_start;
	} else if (!rdev->mddev->external) {
		return -EBUSY;
	}
	rdev->ppl.sector = sector;
	return len;
}

static struct rdev_sysfs_entry rdev_ppl_sector =
__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);

static ssize_t
ppl_size_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%u\n", rdev->ppl.size);
}

static ssize_t
ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned int size;

	if (kstrtouint(buf, 10, &size) < 0)
		return -EINVAL;

	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	if (rdev->mddev->persistent) {
		if (rdev->mddev->major_version == 0)
			return -EINVAL;
		if (size > U16_MAX)
			return -EINVAL;
	} else if (!rdev->mddev->external) {
		return -EBUSY;
	}
	rdev->ppl.size = size;
	return len;
}

static struct rdev_sysfs_entry rdev_ppl_size =
__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);

static struct attribute *rdev_default_attrs[] = {
	&rdev_state.attr,
	&rdev_errors.attr,
	&rdev_slot.attr,
	&rdev_offset.attr,
	&rdev_new_offset.attr,
	&rdev_size.attr,
	&rdev_recovery_start.attr,
	&rdev_bad_blocks.attr,
	&rdev_unack_bad_blocks.attr,
	&rdev_ppl_sector.attr,
	&rdev_ppl_size.attr,
	NULL,
};
static ssize_t
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);

	if (!entry->show)
		return -EIO;
	if (!rdev->mddev)
		return -ENODEV;
	return entry->show(rdev, page);
}

static ssize_t
rdev_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
	ssize_t rv;
	struct mddev *mddev = rdev->mddev;

	if (!entry->store)
		return -EIO;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	rv = mddev ? mddev_lock(mddev) : -ENODEV;
	if (!rv) {
		if (rdev->mddev == NULL)
			rv = -ENODEV;
		else
			rv = entry->store(rdev, page, length);
		mddev_unlock(mddev);
	}
	return rv;
}

static void rdev_free(struct kobject *ko)
{
	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
	kfree(rdev);
}
static const struct sysfs_ops rdev_sysfs_ops = {
	.show		= rdev_attr_show,
	.store		= rdev_attr_store,
};
static struct kobj_type rdev_ktype = {
	.release	= rdev_free,
	.sysfs_ops	= &rdev_sysfs_ops,
	.default_attrs	= rdev_default_attrs,
};

int md_rdev_init(struct md_rdev *rdev)
{
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	rdev->raid_disk = -1;
	rdev->flags = 0;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_events = 0;
	rdev->last_read_error = 0;
	rdev->sb_loaded = 0;
	rdev->bb_page = NULL;
	atomic_set(&rdev->nr_pending, 0);
	atomic_set(&rdev->read_errors, 0);
	atomic_set(&rdev->corrected_errors, 0);

	INIT_LIST_HEAD(&rdev->same_set);
	init_waitqueue_head(&rdev->blocked_wait);

	/* Add space to store bad block list.
	 * This reserves the space even on arrays where it cannot
	 * be used - I wonder if that matters
	 */
	return badblocks_init(&rdev->badblocks, 0);
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{
	char b[BDEVNAME_SIZE];
	int err;
	struct md_rdev *rdev;
	sector_t size;

	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev)
		return ERR_PTR(-ENOMEM);

	err = md_rdev_init(rdev);
	if (err)
		goto abort_free;
	err = alloc_disk_sb(rdev);
	if (err)
		goto abort_free;

	err = lock_rdev(rdev, newdev, super_format == -2);
	if (err)
		goto abort_free;

	kobject_init(&rdev->kobj, &rdev_ktype);

	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
	if (!size) {
		pr_warn("md: %s has zero or unknown size, marking faulty!\n",
			bdevname(rdev->bdev,b));
		err = -EINVAL;
		goto abort_free;
	}

	if (super_format >= 0) {
		err = super_types[super_format].
			load_super(rdev, NULL, super_minor);
		if (err == -EINVAL) {
			pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
				bdevname(rdev->bdev,b),
				super_format, super_minor);
			goto abort_free;
		}
		if (err < 0) {
			pr_warn("md: could not read %s's sb, not importing!\n",
				bdevname(rdev->bdev,b));
			goto abort_free;
		}
	}

	return rdev;

abort_free:
	if (rdev->bdev)
		unlock_rdev(rdev);
	md_rdev_clear(rdev);
	kfree(rdev);
	return ERR_PTR(err);
}

/*
 * Check a full RAID array for plausibility
 */

3723
static int analyze_sbs(struct mddev *mddev)
L
Linus Torvalds 已提交
3724 3725
{
	int i;
3726
	struct md_rdev *rdev, *freshest, *tmp;
L
Linus Torvalds 已提交
3727 3728 3729
	char b[BDEVNAME_SIZE];

	freshest = NULL;
N
NeilBrown 已提交
3730
	rdev_for_each_safe(rdev, tmp, mddev)
L
Linus Torvalds 已提交
3731 3732 3733 3734 3735 3736 3737 3738
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
3739
			pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
L
Linus Torvalds 已提交
3740
				bdevname(rdev->bdev,b));
3741
			md_kick_rdev_from_array(rdev);
L
Linus Torvalds 已提交
3742 3743
		}

3744 3745 3746 3747 3748 3749
	/* Cannot find a valid fresh disk */
	if (!freshest) {
		pr_warn("md: cannot find a valid disk\n");
		return -EINVAL;
	}

L
Linus Torvalds 已提交
3750 3751 3752 3753
	super_types[mddev->major_version].
		validate_super(mddev, freshest);

	i = 0;
N
NeilBrown 已提交
3754
	rdev_for_each_safe(rdev, tmp, mddev) {
3755 3756 3757
		if (mddev->max_disks &&
		    (rdev->desc_nr >= mddev->max_disks ||
		     i > mddev->max_disks)) {
3758 3759 3760
			pr_warn("md: %s: %s: only %d devices permitted\n",
				mdname(mddev), bdevname(rdev->bdev, b),
				mddev->max_disks);
3761
			md_kick_rdev_from_array(rdev);
3762 3763
			continue;
		}
3764
		if (rdev != freshest) {
L
Linus Torvalds 已提交
3765 3766
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
3767
				pr_warn("md: kicking non-fresh %s from array!\n",
L
Linus Torvalds 已提交
3768
					bdevname(rdev->bdev,b));
3769
				md_kick_rdev_from_array(rdev);
L
Linus Torvalds 已提交
3770 3771
				continue;
			}
3772
		}
L
Linus Torvalds 已提交
3773 3774 3775
		if (mddev->level == LEVEL_MULTIPATH) {
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
3776
			set_bit(In_sync, &rdev->flags);
S
Shaohua Li 已提交
3777 3778 3779
		} else if (rdev->raid_disk >=
			    (mddev->raid_disks - min(0, mddev->delta_disks)) &&
			   !test_bit(Journal, &rdev->flags)) {
3780 3781
			rdev->raid_disk = -1;
			clear_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
3782 3783
		}
	}
3784 3785

	return 0;
L
Linus Torvalds 已提交
3786 3787
}

/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale', all without any
 * floating-point arithmetic.
 */
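/* For instance (illustrative only): strict_strtoul_scaled("1.53", &res, 3)
 * stores 1530 in res, and safe_delay_store() below uses scale 3 to turn a
 * "seconds.milliseconds" string into whole milliseconds.
 */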
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;
	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {
			unsigned int value;
			value = *cp - '0';
			result = result * 10 + value;
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;
	if (decimals < 0)
		decimals = 0;
	*res = result * int_pow(10, scale - decimals);
	return 0;
}

3824
static ssize_t
3825
safe_delay_show(struct mddev *mddev, char *page)
3826 3827 3828 3829 3830
{
	int msec = (mddev->safemode_delay*1000)/HZ;
	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
}
static ssize_t
3831
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3832 3833
{
	unsigned long msec;
3834

3835
	if (mddev_is_clustered(mddev)) {
3836
		pr_warn("md: Safemode is disabled for clustered mode\n");
3837 3838 3839
		return -EINVAL;
	}

3840
	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3841 3842 3843 3844
		return -EINVAL;
	if (msec == 0)
		mddev->safemode_delay = 0;
	else {
3845
		unsigned long old_delay = mddev->safemode_delay;
3846 3847 3848 3849 3850 3851 3852
		unsigned long new_delay = (msec*HZ)/1000;

		if (new_delay == 0)
			new_delay = 1;
		mddev->safemode_delay = new_delay;
		if (new_delay < old_delay || old_delay == 0)
			mod_timer(&mddev->safemode_timer, jiffies+1);
3853 3854 3855 3856
	}
	return len;
}
static struct md_sysfs_entry md_safe_delay =
3857
__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3858

3859
static ssize_t
3860
level_show(struct mddev *mddev, char *page)
3861
{
3862 3863 3864 3865
	struct md_personality *p;
	int ret;
	spin_lock(&mddev->lock);
	p = mddev->pers;
3866
	if (p)
3867
		ret = sprintf(page, "%s\n", p->name);
3868
	else if (mddev->clevel[0])
3869
		ret = sprintf(page, "%s\n", mddev->clevel);
3870
	else if (mddev->level != LEVEL_NONE)
3871
		ret = sprintf(page, "%d\n", mddev->level);
3872
	else
3873 3874 3875
		ret = 0;
	spin_unlock(&mddev->lock);
	return ret;
3876 3877
}

3878
static ssize_t
3879
level_store(struct mddev *mddev, const char *buf, size_t len)
3880
{
3881
	char clevel[16];
3882 3883
	ssize_t rv;
	size_t slen = len;
3884
	struct md_personality *pers, *oldpers;
3885
	long level;
3886
	void *priv, *oldpriv;
3887
	struct md_rdev *rdev;
3888

3889 3890 3891 3892 3893 3894 3895
	if (slen == 0 || slen >= sizeof(clevel))
		return -EINVAL;

	rv = mddev_lock(mddev);
	if (rv)
		return rv;

3896
	if (mddev->pers == NULL) {
3897 3898 3899 3900
		strncpy(mddev->clevel, buf, slen);
		if (mddev->clevel[slen-1] == '\n')
			slen--;
		mddev->clevel[slen] = 0;
3901
		mddev->level = LEVEL_NONE;
3902 3903
		rv = len;
		goto out_unlock;
3904
	}
3905
	rv = -EROFS;
3906
	if (mddev->ro)
3907
		goto out_unlock;
3908 3909 3910 3911 3912 3913 3914

	/* request to change the personality.  Need to ensure:
	 *  - array is not engaged in resync/recovery/reshape
	 *  - old personality can be suspended
	 *  - new personality will access other array.
	 */

3915
	rv = -EBUSY;
3916
	if (mddev->sync_thread ||
3917
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3918 3919
	    mddev->reshape_position != MaxSector ||
	    mddev->sysfs_active)
3920
		goto out_unlock;
3921

3922
	rv = -EINVAL;
3923
	if (!mddev->pers->quiesce) {
3924 3925
		pr_warn("md: %s: %s does not support online personality change\n",
			mdname(mddev), mddev->pers->name);
3926
		goto out_unlock;
3927 3928 3929
	}

	/* Now find the new personality */
3930 3931 3932 3933
	strncpy(clevel, buf, slen);
	if (clevel[slen-1] == '\n')
		slen--;
	clevel[slen] = 0;
3934
	if (kstrtol(clevel, 10, &level))
3935
		level = LEVEL_NONE;
3936

3937 3938
	if (request_module("md-%s", clevel) != 0)
		request_module("md-level-%s", clevel);
3939
	spin_lock(&pers_lock);
3940
	pers = find_pers(level, clevel);
3941 3942
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
3943
		pr_warn("md: personality %s not loaded\n", clevel);
3944 3945
		rv = -EINVAL;
		goto out_unlock;
3946 3947 3948 3949 3950 3951
	}
	spin_unlock(&pers_lock);

	if (pers == mddev->pers) {
		/* Nothing to do! */
		module_put(pers->owner);
3952 3953
		rv = len;
		goto out_unlock;
3954 3955 3956
	}
	if (!pers->takeover) {
		module_put(pers->owner);
3957 3958
		pr_warn("md: %s: %s does not support personality takeover\n",
			mdname(mddev), clevel);
3959 3960
		rv = -EINVAL;
		goto out_unlock;
3961 3962
	}

N
NeilBrown 已提交
3963
	rdev_for_each(rdev, mddev)
3964 3965
		rdev->new_raid_disk = rdev->raid_disk;

3966 3967 3968 3969 3970 3971 3972
	/* ->takeover must set new_* and/or delta_disks
	 * if it succeeds, and may set them when it fails.
	 */
	priv = pers->takeover(mddev);
	if (IS_ERR(priv)) {
		mddev->new_level = mddev->level;
		mddev->new_layout = mddev->layout;
3973
		mddev->new_chunk_sectors = mddev->chunk_sectors;
3974 3975
		mddev->raid_disks -= mddev->delta_disks;
		mddev->delta_disks = 0;
3976
		mddev->reshape_backwards = 0;
3977
		module_put(pers->owner);
3978 3979
		pr_warn("md: %s: %s would not accept array\n",
			mdname(mddev), clevel);
3980 3981
		rv = PTR_ERR(priv);
		goto out_unlock;
3982 3983 3984 3985
	}

	/* Looks like we have a winner */
	mddev_suspend(mddev);
3986
	mddev_detach(mddev);
3987 3988

	spin_lock(&mddev->lock);
3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999
	oldpers = mddev->pers;
	oldpriv = mddev->private;
	mddev->pers = pers;
	mddev->private = priv;
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->degraded = 0;
4000
	spin_unlock(&mddev->lock);
4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014

	if (oldpers->sync_request == NULL &&
	    mddev->external) {
		/* We are converting from a no-redundancy array
		 * to a redundancy array and metadata is managed
		 * externally so we need to be sure that writes
		 * won't block due to a need to transition
		 *      clean->dirty
		 * until external management is started.
		 */
		mddev->in_sync = 0;
		mddev->safemode_delay = 0;
		mddev->safemode = 0;
	}
4015

4016 4017 4018
	oldpers->free(mddev, oldpriv);

	if (oldpers->sync_request == NULL &&
4019 4020 4021
	    pers->sync_request != NULL) {
		/* need to add the md_redundancy_group */
		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4022 4023
			pr_warn("md: cannot register extra attributes for %s\n",
				mdname(mddev));
T
Tejun Heo 已提交
4024
		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4025
	}
4026
	if (oldpers->sync_request != NULL &&
4027 4028 4029 4030 4031 4032
	    pers->sync_request == NULL) {
		/* need to remove the md_redundancy_group */
		if (mddev->to_remove == NULL)
			mddev->to_remove = &md_redundancy_group;
	}

4033 4034
	module_put(oldpers->owner);

N
NeilBrown 已提交
4035
	rdev_for_each(rdev, mddev) {
4036 4037
		if (rdev->raid_disk < 0)
			continue;
4038
		if (rdev->new_raid_disk >= mddev->raid_disks)
4039 4040 4041
			rdev->new_raid_disk = -1;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
4042
		sysfs_unlink_rdev(mddev, rdev);
4043
	}
N
NeilBrown 已提交
4044
	rdev_for_each(rdev, mddev) {
4045 4046 4047 4048 4049 4050
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		rdev->raid_disk = rdev->new_raid_disk;
		if (rdev->raid_disk < 0)
4051
			clear_bit(In_sync, &rdev->flags);
4052
		else {
4053
			if (sysfs_link_rdev(mddev, rdev))
4054 4055
				pr_warn("md: cannot register rd%d for %s after level change\n",
					rdev->raid_disk, mdname(mddev));
4056
		}
4057 4058
	}

4059
	if (pers->sync_request == NULL) {
4060 4061 4062 4063 4064 4065
		/* this is now an array without redundancy, so
		 * it must always be in_sync
		 */
		mddev->in_sync = 1;
		del_timer_sync(&mddev->safemode_timer);
	}
4066
	blk_set_stacking_limits(&mddev->queue->limits);
4067
	pers->run(mddev);
4068
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4069
	mddev_resume(mddev);
4070 4071
	if (!mddev->thread)
		md_update_sb(mddev, 1);
4072
	sysfs_notify(&mddev->kobj, NULL, "level");
4073
	md_new_event(mddev);
4074 4075 4076
	rv = len;
out_unlock:
	mddev_unlock(mddev);
4077 4078 4079 4080
	return rv;
}

static struct md_sysfs_entry md_level =
4081
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
4082

4083
static ssize_t
4084
layout_show(struct mddev *mddev, char *page)
4085 4086
{
	/* just a number, not meaningful for all levels */
4087 4088 4089 4090
	if (mddev->reshape_position != MaxSector &&
	    mddev->layout != mddev->new_layout)
		return sprintf(page, "%d (%d)\n",
			       mddev->new_layout, mddev->layout);
4091 4092 4093 4094
	return sprintf(page, "%d\n", mddev->layout);
}

static ssize_t
4095
layout_store(struct mddev *mddev, const char *buf, size_t len)
4096
{
A
Alexey Dobriyan 已提交
4097
	unsigned int n;
4098
	int err;
4099

A
Alexey Dobriyan 已提交
4100 4101 4102
	err = kstrtouint(buf, 10, &n);
	if (err < 0)
		return err;
4103 4104 4105
	err = mddev_lock(mddev);
	if (err)
		return err;
4106

4107
	if (mddev->pers) {
4108
		if (mddev->pers->check_reshape == NULL)
4109 4110 4111 4112 4113 4114 4115 4116
			err = -EBUSY;
		else if (mddev->ro)
			err = -EROFS;
		else {
			mddev->new_layout = n;
			err = mddev->pers->check_reshape(mddev);
			if (err)
				mddev->new_layout = mddev->layout;
4117
		}
4118
	} else {
4119
		mddev->new_layout = n;
4120 4121 4122
		if (mddev->reshape_position == MaxSector)
			mddev->layout = n;
	}
4123 4124
	mddev_unlock(mddev);
	return err ?: len;
4125 4126
}
static struct md_sysfs_entry md_layout =
4127
__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4128

4129
static ssize_t
4130
raid_disks_show(struct mddev *mddev, char *page)
4131
{
4132 4133
	if (mddev->raid_disks == 0)
		return 0;
4134 4135 4136 4137
	if (mddev->reshape_position != MaxSector &&
	    mddev->delta_disks != 0)
		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
			       mddev->raid_disks - mddev->delta_disks);
4138 4139 4140
	return sprintf(page, "%d\n", mddev->raid_disks);
}

4141
static int update_raid_disks(struct mddev *mddev, int raid_disks);
4142 4143

static ssize_t
4144
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4145
{
A
Alexey Dobriyan 已提交
4146
	unsigned int n;
4147
	int err;
4148

A
Alexey Dobriyan 已提交
4149 4150 4151
	err = kstrtouint(buf, 10, &n);
	if (err < 0)
		return err;
4152

4153 4154 4155
	err = mddev_lock(mddev);
	if (err)
		return err;
4156
	if (mddev->pers)
4157
		err = update_raid_disks(mddev, n);
4158
	else if (mddev->reshape_position != MaxSector) {
4159
		struct md_rdev *rdev;
4160
		int olddisks = mddev->raid_disks - mddev->delta_disks;
4161

4162
		err = -EINVAL;
4163 4164 4165
		rdev_for_each(rdev, mddev) {
			if (olddisks < n &&
			    rdev->data_offset < rdev->new_data_offset)
4166
				goto out_unlock;
4167 4168
			if (olddisks > n &&
			    rdev->data_offset > rdev->new_data_offset)
4169
				goto out_unlock;
4170
		}
4171
		err = 0;
4172 4173
		mddev->delta_disks = n - olddisks;
		mddev->raid_disks = n;
4174
		mddev->reshape_backwards = (mddev->delta_disks < 0);
4175
	} else
4176
		mddev->raid_disks = n;
4177 4178 4179
out_unlock:
	mddev_unlock(mddev);
	return err ? err : len;
4180 4181
}
static struct md_sysfs_entry md_raid_disks =
4182
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4183

4184
static ssize_t
4185
chunk_size_show(struct mddev *mddev, char *page)
4186
{
4187
	if (mddev->reshape_position != MaxSector &&
4188 4189 4190
	    mddev->chunk_sectors != mddev->new_chunk_sectors)
		return sprintf(page, "%d (%d)\n",
			       mddev->new_chunk_sectors << 9,
4191 4192
			       mddev->chunk_sectors << 9);
	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4193 4194 4195
}

static ssize_t
4196
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4197
{
A
Alexey Dobriyan 已提交
4198
	unsigned long n;
4199
	int err;
4200

A
Alexey Dobriyan 已提交
4201 4202 4203
	err = kstrtoul(buf, 10, &n);
	if (err < 0)
		return err;
4204

4205 4206 4207
	err = mddev_lock(mddev);
	if (err)
		return err;
4208
	if (mddev->pers) {
4209
		if (mddev->pers->check_reshape == NULL)
4210 4211 4212 4213 4214 4215 4216 4217
			err = -EBUSY;
		else if (mddev->ro)
			err = -EROFS;
		else {
			mddev->new_chunk_sectors = n >> 9;
			err = mddev->pers->check_reshape(mddev);
			if (err)
				mddev->new_chunk_sectors = mddev->chunk_sectors;
4218
		}
4219
	} else {
4220
		mddev->new_chunk_sectors = n >> 9;
4221
		if (mddev->reshape_position == MaxSector)
4222
			mddev->chunk_sectors = n >> 9;
4223
	}
4224 4225
	mddev_unlock(mddev);
	return err ?: len;
4226 4227
}
static struct md_sysfs_entry md_chunk_size =
4228
__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4229

4230
static ssize_t
4231
resync_start_show(struct mddev *mddev, char *page)
4232
{
4233 4234
	if (mddev->recovery_cp == MaxSector)
		return sprintf(page, "none\n");
4235 4236 4237 4238
	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
}

static ssize_t
4239
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4240
{
A
Alexey Dobriyan 已提交
4241
	unsigned long long n;
4242
	int err;
A
Alexey Dobriyan 已提交
4243 4244 4245 4246 4247 4248 4249 4250 4251 4252

	if (cmd_match(buf, "none"))
		n = MaxSector;
	else {
		err = kstrtoull(buf, 10, &n);
		if (err < 0)
			return err;
		if (n != (sector_t)n)
			return -EINVAL;
	}
4253

4254 4255 4256
	err = mddev_lock(mddev);
	if (err)
		return err;
4257
	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4258
		err = -EBUSY;
4259

4260 4261 4262
	if (!err) {
		mddev->recovery_cp = n;
		if (mddev->pers)
4263
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4264 4265 4266
	}
	mddev_unlock(mddev);
	return err ?: len;
4267 4268
}
static struct md_sysfs_entry md_resync_start =
4269 4270
__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
		resync_start_show, resync_start_store);
4271

/*
 * The array state can be:
 *
 * clear
 *     No devices, no size, no level
 *     Equivalent to STOP_ARRAY ioctl
 * inactive
 *     May have some settings, but array is not active
 *        all IO results in error
 *     When written, doesn't tear down array, but just stops it
 * suspended (not supported yet)
 *     All IO requests will block. The array can be reconfigured.
 *     Writing this, if accepted, will block until array is quiescent
 * readonly
 *     no resync can happen.  no superblocks get written.
 *     write requests fail
 * read-auto
 *     like readonly, but behaves like 'clean' on a write request.
 *
 * clean - no pending writes, but otherwise active.
 *     When written to inactive array, starts without resync
 *     If a write request arrives then
 *       if metadata is known, mark 'dirty' and switch to 'active'.
 *       if not known, block and switch to write-pending
 *     If written to an active array that has pending writes, then fails.
 * active
 *     fully active: IO and resync can be happening.
 *     When written to inactive array, starts with resync
 *
 * write-pending
 *     clean, but writes are blocked waiting for 'active' to be written.
 *
 * active-idle
 *     like active, but no writes have been seen for a while (100msec).
 *
 * broken
 *     RAID0/LINEAR-only: same as clean, but array is missing a member.
 *     It's useful because RAID0/LINEAR mounted-arrays aren't stopped
 *     when a member is gone, so this state will at least alert the
 *     user that something is wrong.
 */
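/* Illustrative sysfs usage (device names are examples only):
 *   cat /sys/block/md0/md/array_state      -> e.g. "clean" or "active"
 *   echo inactive > /sys/block/md0/md/array_state
 * stops I/O on the array without tearing it down, per the table above.
 */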
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
		   write_pending, active_idle, broken, bad_word};
static char *array_states[] = {
	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
	"write-pending", "active-idle", "broken", NULL };

static int match_word(const char *word, char **list)
{
	int n;
	for (n=0; list[n]; n++)
		if (cmd_match(word, list[n]))
			break;
	return n;
}

static ssize_t
4329
array_state_show(struct mddev *mddev, char *page)
4330 4331 4332
{
	enum array_state st = inactive;

4333
	if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4334 4335 4336 4337 4338 4339 4340 4341
		switch(mddev->ro) {
		case 1:
			st = readonly;
			break;
		case 2:
			st = read_auto;
			break;
		case 0:
4342
			spin_lock(&mddev->lock);
4343
			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4344
				st = write_pending;
4345 4346
			else if (mddev->in_sync)
				st = clean;
4347 4348 4349 4350
			else if (mddev->safemode)
				st = active_idle;
			else
				st = active;
4351
			spin_unlock(&mddev->lock);
4352
		}
4353 4354 4355 4356

		if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
			st = broken;
	} else {
4357 4358
		if (list_empty(&mddev->disks) &&
		    mddev->raid_disks == 0 &&
A
Andre Noll 已提交
4359
		    mddev->dev_sectors == 0)
4360 4361 4362 4363 4364 4365 4366
			st = clear;
		else
			st = inactive;
	}
	return sprintf(page, "%s\n", array_states[st]);
}

4367 4368 4369
static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
static int do_md_run(struct mddev *mddev);
4370
static int restart_array(struct mddev *mddev);
4371 4372

static ssize_t
4373
array_state_store(struct mddev *mddev, const char *buf, size_t len)
4374
{
N
NeilBrown 已提交
4375
	int err = 0;
4376
	enum array_state st = match_word(buf, array_states);
4377 4378 4379 4380 4381 4382 4383 4384

	if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
		/* don't take reconfig_mutex when toggling between
		 * clean and active
		 */
		spin_lock(&mddev->lock);
		if (st == active) {
			restart_array(mddev);
4385
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4386
			md_wakeup_thread(mddev->thread);
4387 4388 4389
			wake_up(&mddev->sb_wait);
		} else /* st == clean */ {
			restart_array(mddev);
N
NeilBrown 已提交
4390
			if (!set_in_sync(mddev))
4391 4392
				err = -EBUSY;
		}
4393 4394
		if (!err)
			sysfs_notify_dirent_safe(mddev->sysfs_state);
4395
		spin_unlock(&mddev->lock);
4396
		return err ?: len;
4397 4398 4399 4400 4401
	}
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
4402 4403 4404 4405 4406
	switch(st) {
	case bad_word:
		break;
	case clear:
		/* stopping an active array */
4407
		err = do_md_stop(mddev, 0, NULL);
4408 4409 4410
		break;
	case inactive:
		/* stopping an active array */
4411
		if (mddev->pers)
4412
			err = do_md_stop(mddev, 2, NULL);
4413
		else
4414
			err = 0; /* already inactive */
4415 4416 4417 4418 4419
		break;
	case suspended:
		break; /* not supported yet */
	case readonly:
		if (mddev->pers)
4420
			err = md_set_readonly(mddev, NULL);
4421 4422
		else {
			mddev->ro = 1;
4423
			set_disk_ro(mddev->gendisk, 1);
4424 4425 4426 4427 4428
			err = do_md_run(mddev);
		}
		break;
	case read_auto:
		if (mddev->pers) {
4429
			if (mddev->ro == 0)
4430
				err = md_set_readonly(mddev, NULL);
4431
			else if (mddev->ro == 1)
4432 4433 4434 4435 4436
				err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
			}
4437 4438 4439 4440 4441 4442 4443
		} else {
			mddev->ro = 2;
			err = do_md_run(mddev);
		}
		break;
	case clean:
		if (mddev->pers) {
4444 4445 4446
			err = restart_array(mddev);
			if (err)
				break;
4447
			spin_lock(&mddev->lock);
N
NeilBrown 已提交
4448
			if (!set_in_sync(mddev))
4449
				err = -EBUSY;
4450
			spin_unlock(&mddev->lock);
4451 4452
		} else
			err = -EINVAL;
4453 4454 4455
		break;
	case active:
		if (mddev->pers) {
4456 4457 4458
			err = restart_array(mddev);
			if (err)
				break;
4459
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4460 4461 4462 4463
			wake_up(&mddev->sb_wait);
			err = 0;
		} else {
			mddev->ro = 0;
4464
			set_disk_ro(mddev->gendisk, 0);
4465 4466 4467 4468 4469
			err = do_md_run(mddev);
		}
		break;
	case write_pending:
	case active_idle:
4470
	case broken:
4471 4472 4473
		/* these cannot be set */
		break;
	}
4474 4475

	if (!err) {
4476 4477
		if (mddev->hold_active == UNTIL_IOCTL)
			mddev->hold_active = 0;
N
NeilBrown 已提交
4478
		sysfs_notify_dirent_safe(mddev->sysfs_state);
4479
	}
4480 4481
	mddev_unlock(mddev);
	return err ?: len;
4482
}
4483
static struct md_sysfs_entry md_array_state =
4484
__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4485

4486
static ssize_t
4487
max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4488 4489 4490 4491 4492
	return sprintf(page, "%d\n",
		       atomic_read(&mddev->max_corr_read_errors));
}

static ssize_t
4493
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4494
{
A
Alexey Dobriyan 已提交
4495 4496
	unsigned int n;
	int rv;
4497

A
Alexey Dobriyan 已提交
4498 4499 4500 4501 4502
	rv = kstrtouint(buf, 10, &n);
	if (rv < 0)
		return rv;
	atomic_set(&mddev->max_corr_read_errors, n);
	return len;
4503 4504 4505 4506 4507 4508
}

static struct md_sysfs_entry max_corr_read_errors =
__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
	max_corrected_read_errors_store);

4509
static ssize_t
4510
null_show(struct mddev *mddev, char *page)
4511 4512 4513 4514 4515
{
	return -EINVAL;
}

static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
	/* buf must be %d:%d\n? giving major and minor numbers */
	/* The new device is added to the array.
	 * If the array has a persistent superblock, we read the
	 * superblock to initialise info and check validity.
	 * Otherwise, only checking done is that in bind_rdev_to_array,
	 * which mainly checks size.
	 */
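	/* Example (illustrative only): echo "8:17" > new_dev asks md to add
	 * the device with major 8, minor 17 (typically /dev/sdb1) to this
	 * array.
	 */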
	char *e;
	int major = simple_strtoul(buf, &e, 10);
	int minor;
	dev_t dev;
4529
	struct md_rdev *rdev;
4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541
	int err;

	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
		return -EINVAL;
	minor = simple_strtoul(e+1, &e, 10);
	if (*e && *e != '\n')
		return -EINVAL;
	dev = MKDEV(major, minor);
	if (major != MAJOR(dev) ||
	    minor != MINOR(dev))
		return -EOVERFLOW;

4542 4543 4544 4545 4546
	flush_workqueue(md_misc_wq);

	err = mddev_lock(mddev);
	if (err)
		return err;
4547 4548 4549 4550
	if (mddev->persistent) {
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4551 4552 4553
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
4554 4555 4556 4557 4558
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0)
				goto out;
		}
4559 4560 4561
	} else if (mddev->external)
		rdev = md_import_device(dev, -2, -1);
	else
4562 4563
		rdev = md_import_device(dev, -1, -1);

4564 4565
	if (IS_ERR(rdev)) {
		mddev_unlock(mddev);
4566
		return PTR_ERR(rdev);
4567
	}
4568 4569 4570 4571
	err = bind_rdev_to_array(rdev, mddev);
 out:
	if (err)
		export_rdev(rdev);
4572
	mddev_unlock(mddev);
4573 4574
	if (!err)
		md_new_event(mddev);
4575 4576 4577 4578
	return err ? err : len;
}

static struct md_sysfs_entry md_new_device =
4579
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4580

4581
static ssize_t
4582
bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4583 4584 4585
{
	char *end;
	unsigned long chunk, end_chunk;
4586
	int err;
4587

4588 4589 4590
	err = mddev_lock(mddev);
	if (err)
		return err;
4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602
	if (!mddev->bitmap)
		goto out;
	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
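	/* e.g. (illustrative only) writing "0-15 64" marks bitmap chunks
	 * 0 through 15 and chunk 64 dirty, so those regions are rewritten
	 * by the next resync.
	 */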
	while (*buf) {
		chunk = end_chunk = simple_strtoul(buf, &end, 0);
		if (buf == end) break;
		if (*end == '-') { /* range */
			buf = end + 1;
			end_chunk = simple_strtoul(buf, &end, 0);
			if (buf == end) break;
		}
		if (*end && !isspace(*end)) break;
4603
		md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4604
		buf = skip_spaces(end);
4605
	}
4606
	md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4607
out:
4608
	mddev_unlock(mddev);
4609 4610 4611 4612 4613 4614
	return len;
}

static struct md_sysfs_entry md_bitmap =
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);

4615
static ssize_t
4616
size_show(struct mddev *mddev, char *page)
4617
{
A
Andre Noll 已提交
4618 4619
	return sprintf(page, "%llu\n",
		(unsigned long long)mddev->dev_sectors / 2);
4620 4621
}

4622
static int update_size(struct mddev *mddev, sector_t num_sectors);
4623 4624

static ssize_t
4625
size_store(struct mddev *mddev, const char *buf, size_t len)
4626 4627 4628 4629 4630
{
	/* If array is inactive, we can reduce the component size, but
	 * not increase it (except from 0).
	 * If array is active, we can try an on-line resize
	 */
D
Dan Williams 已提交
4631 4632
	sector_t sectors;
	int err = strict_blocks_to_sectors(buf, &sectors);
4633

A
Andre Noll 已提交
4634 4635
	if (err < 0)
		return err;
4636 4637 4638
	err = mddev_lock(mddev);
	if (err)
		return err;
4639
	if (mddev->pers) {
A
Andre Noll 已提交
4640
		err = update_size(mddev, sectors);
4641 4642
		if (err == 0)
			md_update_sb(mddev, 1);
4643
	} else {
A
Andre Noll 已提交
4644 4645 4646
		if (mddev->dev_sectors == 0 ||
		    mddev->dev_sectors > sectors)
			mddev->dev_sectors = sectors;
4647 4648 4649
		else
			err = -ENOSPC;
	}
4650
	mddev_unlock(mddev);
4651 4652 4653 4654
	return err ? err : len;
}

static struct md_sysfs_entry md_size =
4655
__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4656

/* Metadata version.
 * This is one of
 *   'none' for arrays with no metadata (good luck...)
 *   'external' for arrays with externally managed metadata,
 * or N.M for internally known formats
 */
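/* Illustrative values (not exhaustive): "1.2" selects the in-kernel v1.2
 * superblock format, "external:imsm" marks externally managed metadata,
 * and "none" leaves the array without persistent metadata.
 */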
static ssize_t
4664
metadata_show(struct mddev *mddev, char *page)
4665 4666 4667 4668
{
	if (mddev->persistent)
		return sprintf(page, "%d.%d\n",
			       mddev->major_version, mddev->minor_version);
4669 4670
	else if (mddev->external)
		return sprintf(page, "external:%s\n", mddev->metadata_type);
4671 4672 4673 4674 4675
	else
		return sprintf(page, "none\n");
}

static ssize_t
4676
metadata_store(struct mddev *mddev, const char *buf, size_t len)
4677 4678 4679
{
	int major, minor;
	char *e;
4680
	int err;
4681 4682 4683 4684
	/* Changing the details of 'external' metadata is
	 * always permitted.  Otherwise there must be
	 * no devices attached to the array.
	 */
4685 4686 4687 4688 4689

	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
4690 4691 4692
	if (mddev->external && strncmp(buf, "external:", 9) == 0)
		;
	else if (!list_empty(&mddev->disks))
4693
		goto out_unlock;
4694

4695
	err = 0;
4696 4697
	if (cmd_match(buf, "none")) {
		mddev->persistent = 0;
4698 4699 4700
		mddev->external = 0;
		mddev->major_version = 0;
		mddev->minor_version = 90;
4701
		goto out_unlock;
4702 4703
	}
	if (strncmp(buf, "external:", 9) == 0) {
4704
		size_t namelen = len-9;
4705 4706 4707 4708 4709 4710 4711 4712
		if (namelen >= sizeof(mddev->metadata_type))
			namelen = sizeof(mddev->metadata_type)-1;
		strncpy(mddev->metadata_type, buf+9, namelen);
		mddev->metadata_type[namelen] = 0;
		if (namelen && mddev->metadata_type[namelen-1] == '\n')
			mddev->metadata_type[--namelen] = 0;
		mddev->persistent = 0;
		mddev->external = 1;
4713 4714
		mddev->major_version = 0;
		mddev->minor_version = 90;
4715
		goto out_unlock;
4716 4717
	}
	major = simple_strtoul(buf, &e, 10);
4718
	err = -EINVAL;
4719
	if (e==buf || *e != '.')
4720
		goto out_unlock;
4721 4722
	buf = e+1;
	minor = simple_strtoul(buf, &e, 10);
4723
	if (e==buf || (*e && *e != '\n') )
4724 4725
		goto out_unlock;
	err = -ENOENT;
4726
	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4727
		goto out_unlock;
4728 4729 4730
	mddev->major_version = major;
	mddev->minor_version = minor;
	mddev->persistent = 1;
4731
	mddev->external = 0;
4732 4733 4734 4735
	err = 0;
out_unlock:
	mddev_unlock(mddev);
	return err ?: len;
4736 4737 4738
}

static struct md_sysfs_entry md_metadata =
4739
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4740

4741
static ssize_t
4742
action_show(struct mddev *mddev, char *page)
4743
{
4744
	char *type = "idle";
4745 4746
	unsigned long recovery = mddev->recovery;
	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4747
		type = "frozen";
4748 4749 4750
	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4751
			type = "reshape";
4752 4753
		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4754
				type = "resync";
4755
			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4756 4757 4758
				type = "check";
			else
				type = "repair";
4759
		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4760
			type = "recover";
4761 4762
		else if (mddev->reshape_position != MaxSector)
			type = "reshape";
4763 4764 4765 4766 4767
	}
	return sprintf(page, "%s\n", type);
}

static ssize_t
4768
action_store(struct mddev *mddev, const char *page, size_t len)
4769
{
4770 4771 4772
	if (!mddev->pers || !mddev->pers->sync_request)
		return -EINVAL;

4773 4774

	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4775 4776 4777 4778
		if (cmd_match(page, "frozen"))
			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		else
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4779 4780 4781 4782 4783
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    mddev_lock(mddev) == 0) {
			flush_workqueue(md_misc_wq);
			if (mddev->sync_thread) {
				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4784 4785
				md_reap_sync_thread(mddev);
			}
4786
			mddev_unlock(mddev);
4787
		}
4788
	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4789
		return -EBUSY;
4790
	else if (cmd_match(page, "resync"))
4791
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4792
	else if (cmd_match(page, "recover")) {
4793
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4794 4795
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	} else if (cmd_match(page, "reshape")) {
4796 4797 4798
		int err;
		if (mddev->pers->start_reshape == NULL)
			return -EINVAL;
4799 4800
		err = mddev_lock(mddev);
		if (!err) {
4801 4802 4803 4804 4805 4806
			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
				err =  -EBUSY;
			else {
				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
				err = mddev->pers->start_reshape(mddev);
			}
4807 4808
			mddev_unlock(mddev);
		}
4809 4810
		if (err)
			return err;
4811
		sysfs_notify(&mddev->kobj, NULL, "degraded");
4812
	} else {
4813
		if (cmd_match(page, "check"))
4814
			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4815
		else if (!cmd_match(page, "repair"))
4816
			return -EINVAL;
4817
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4818 4819 4820
		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	}
4821 4822 4823 4824 4825 4826 4827
	if (mddev->ro == 2) {
		/* A write to sync_action is enough to justify
		 * canceling read-auto mode
		 */
		mddev->ro = 0;
		md_wakeup_thread(mddev->sync_thread);
	}
4828
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4829
	md_wakeup_thread(mddev->thread);
N
NeilBrown 已提交
4830
	sysfs_notify_dirent_safe(mddev->sysfs_action);
4831 4832 4833
	return len;
}

4834
static struct md_sysfs_entry md_scan_mode =
4835
__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4836 4837 4838 4839 4840 4841 4842 4843 4844

static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n", mddev->last_sync_action);
}

static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);

4845
static ssize_t
4846
mismatch_cnt_show(struct mddev *mddev, char *page)
4847 4848
{
	return sprintf(page, "%llu\n",
4849 4850
		       (unsigned long long)
		       atomic64_read(&mddev->resync_mismatches));
4851 4852
}

4853
static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4854

4855
static ssize_t
4856
sync_min_show(struct mddev *mddev, char *page)
4857 4858 4859 4860 4861 4862
{
	return sprintf(page, "%d (%s)\n", speed_min(mddev),
		       mddev->sync_speed_min ? "local": "system");
}

static ssize_t
4863
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4864
{
A
Alexey Dobriyan 已提交
4865 4866 4867
	unsigned int min;
	int rv;

4868
	if (strncmp(buf, "system", 6)==0) {
A
Alexey Dobriyan 已提交
4869 4870 4871 4872 4873 4874 4875
		min = 0;
	} else {
		rv = kstrtouint(buf, 10, &min);
		if (rv < 0)
			return rv;
		if (min == 0)
			return -EINVAL;
4876 4877 4878 4879 4880 4881 4882 4883 4884
	}
	mddev->sync_speed_min = min;
	return len;
}

static struct md_sysfs_entry md_sync_min =
__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);

static ssize_t
4885
sync_max_show(struct mddev *mddev, char *page)
4886 4887 4888 4889 4890 4891
{
	return sprintf(page, "%d (%s)\n", speed_max(mddev),
		       mddev->sync_speed_max ? "local": "system");
}

static ssize_t
4892
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4893
{
A
Alexey Dobriyan 已提交
4894 4895 4896
	unsigned int max;
	int rv;

4897
	if (strncmp(buf, "system", 6)==0) {
A
Alexey Dobriyan 已提交
4898 4899 4900 4901 4902 4903 4904
		max = 0;
	} else {
		rv = kstrtouint(buf, 10, &max);
		if (rv < 0)
			return rv;
		if (max == 0)
			return -EINVAL;
4905 4906 4907 4908 4909 4910 4911 4912
	}
	mddev->sync_speed_max = max;
	return len;
}

static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);

4913
static ssize_t
4914
degraded_show(struct mddev *mddev, char *page)
4915 4916 4917 4918
{
	return sprintf(page, "%d\n", mddev->degraded);
}
static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4919

4920
static ssize_t
4921
sync_force_parallel_show(struct mddev *mddev, char *page)
4922 4923 4924 4925 4926
{
	return sprintf(page, "%d\n", mddev->parallel_resync);
}

static ssize_t
4927
sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4928 4929 4930
{
	long n;

4931
	if (kstrtol(buf, 10, &n))
4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949
		return -EINVAL;

	if (n != 0 && n != 1)
		return -EINVAL;

	mddev->parallel_resync = n;

	if (mddev->sync_thread)
		wake_up(&resync_wait);

	return len;
}

/* force parallel resync, even with shared block devices */
static struct md_sysfs_entry md_sync_force_parallel =
__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
       sync_force_parallel_show, sync_force_parallel_store);

4950
static ssize_t
4951
sync_speed_show(struct mddev *mddev, char *page)
4952 4953
{
	unsigned long resync, dt, db;
4954 4955
	if (mddev->curr_resync == 0)
		return sprintf(page, "none\n");
4956 4957
	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
	dt = (jiffies - mddev->resync_mark) / HZ;
4958
	if (!dt) dt++;
4959 4960
	db = resync - mddev->resync_mark_cnt;
	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4961 4962
}

4963
static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4964 4965

static ssize_t
4966
sync_completed_show(struct mddev *mddev, char *page)
4967
{
4968
	unsigned long long max_sectors, resync;
4969

4970 4971 4972
	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return sprintf(page, "none\n");

4973 4974 4975 4976
	if (mddev->curr_resync == 1 ||
	    mddev->curr_resync == 2)
		return sprintf(page, "delayed\n");

4977 4978
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
A
Andre Noll 已提交
4979
		max_sectors = mddev->resync_max_sectors;
4980
	else
A
Andre Noll 已提交
4981
		max_sectors = mddev->dev_sectors;
4982

4983
	resync = mddev->curr_resync_completed;
4984
	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4985 4986
}

4987 4988
static struct md_sysfs_entry md_sync_completed =
	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
4989

4990
static ssize_t
4991
min_sync_show(struct mddev *mddev, char *page)
4992 4993 4994 4995 4996
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)mddev->resync_min);
}
static ssize_t
4997
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4998 4999
{
	unsigned long long min;
5000 5001
	int err;

5002
	if (kstrtoull(buf, 10, &min))
5003
		return -EINVAL;
5004 5005 5006

	spin_lock(&mddev->lock);
	err = -EINVAL;
5007
	if (min > mddev->resync_max)
5008 5009 5010
		goto out_unlock;

	err = -EBUSY;
5011
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5012
		goto out_unlock;
5013

5014 5015
	/* Round down to multiple of 4K for safety */
	mddev->resync_min = round_down(min, 8);
5016
	err = 0;
5017

5018 5019 5020
out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
5021 5022 5023 5024 5025
}

static struct md_sysfs_entry md_min_sync =
__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);

5026
static ssize_t
5027
max_sync_show(struct mddev *mddev, char *page)
5028 5029 5030 5031 5032 5033 5034 5035
{
	if (mddev->resync_max == MaxSector)
		return sprintf(page, "max\n");
	else
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->resync_max);
}
static ssize_t
5036
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5037
{
5038 5039
	int err;
	spin_lock(&mddev->lock);
5040 5041 5042
	if (strncmp(buf, "max", 3) == 0)
		mddev->resync_max = MaxSector;
	else {
5043
		unsigned long long max;
5044 5045 5046
		int chunk;

		err = -EINVAL;
5047
		if (kstrtoull(buf, 10, &max))
5048
			goto out_unlock;
5049
		if (max < mddev->resync_min)
5050 5051 5052
			goto out_unlock;

		err = -EBUSY;
5053
		if (max < mddev->resync_max &&
5054
		    mddev->ro == 0 &&
5055
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5056
			goto out_unlock;
5057 5058

		/* Must be a multiple of chunk_size */
5059 5060
		chunk = mddev->chunk_sectors;
		if (chunk) {
5061
			sector_t temp = max;
5062 5063 5064 5065

			err = -EINVAL;
			if (sector_div(temp, chunk))
				goto out_unlock;
5066 5067 5068 5069
		}
		mddev->resync_max = max;
	}
	wake_up(&mddev->recovery_wait);
5070 5071 5072 5073
	err = 0;
out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
5074 5075 5076 5077 5078
}

static struct md_sysfs_entry md_max_sync =
__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);

5079
static ssize_t
5080
suspend_lo_show(struct mddev *mddev, char *page)
5081 5082 5083 5084 5085
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
}

static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;

	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
	if (mddev->pers == NULL ||
	    mddev->pers->quiesce == NULL)
		goto unlock;
	mddev_suspend(mddev);
	mddev->suspend_lo = new;
	mddev_resume(mddev);

	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);

static ssize_t
suspend_hi_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
}

static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;

	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
	if (mddev->pers == NULL)
		goto unlock;

	mddev_suspend(mddev);
	mddev->suspend_hi = new;
	mddev_resume(mddev);

	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
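/*
 * suspend_lo and suspend_hi describe a sector range in which writes are
 * held off; updating either value suspends and resumes the whole array
 * (see the store handlers above) so the new window takes effect
 * atomically.  Illustrative usage (not part of this file):
 *
 *     echo 0      > /sys/block/md0/md/suspend_lo
 *     echo 524288 > /sys/block/md0/md/suspend_hi
 */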

static ssize_t
reshape_position_show(struct mddev *mddev, char *page)
{
	if (mddev->reshape_position != MaxSector)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->reshape_position);
	strcpy(page, "none\n");
	return 5;
}

static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct md_rdev *rdev;
	unsigned long long new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
	if (mddev->pers)
		goto unlock;
	mddev->reshape_position = new;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_reshape_position =
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
       reshape_position_store);
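/*
 * reshape_position records the next sector to be reshaped.  It can only
 * be written while the array is inactive (enforced above), and writing
 * it also resets the "new_*" geometry fields to the current geometry so
 * a following level/layout/chunk change starts from a known state.
 */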

static ssize_t
reshape_direction_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n",
		       mddev->reshape_backwards ? "backwards" : "forwards");
}

static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
	int backwards = 0;
	int err;

	if (cmd_match(buf, "forwards"))
		backwards = 0;
	else if (cmd_match(buf, "backwards"))
		backwards = 1;
	else
		return -EINVAL;
	if (mddev->reshape_backwards == backwards)
		return len;

	err = mddev_lock(mddev);
	if (err)
		return err;
	/* check if we are allowed to change */
	if (mddev->delta_disks)
		err = -EBUSY;
	else if (mddev->persistent &&
	    mddev->major_version == 0)
		err =  -EINVAL;
	else
		mddev->reshape_backwards = backwards;
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_reshape_direction =
__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
       reshape_direction_store);

static ssize_t
array_size_show(struct mddev *mddev, char *page)
{
	if (mddev->external_size)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->array_sectors/2);
	else
		return sprintf(page, "default\n");
}

static ssize_t
array_size_store(struct mddev *mddev, const char *buf, size_t len)
{
	sector_t sectors;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;

	/* cluster raid doesn't support change array_sectors */
	if (mddev_is_clustered(mddev)) {
		mddev_unlock(mddev);
		return -EINVAL;
	}

	if (strncmp(buf, "default", 7) == 0) {
		if (mddev->pers)
			sectors = mddev->pers->size(mddev, 0, 0);
		else
			sectors = mddev->array_sectors;

		mddev->external_size = 0;
	} else {
		if (strict_blocks_to_sectors(buf, &sectors) < 0)
			err = -EINVAL;
		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
			err = -E2BIG;
		else
			mddev->external_size = 1;
	}

	if (!err) {
		mddev->array_sectors = sectors;
		if (mddev->pers) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
       array_size_store);
static ssize_t
consistency_policy_show(struct mddev *mddev, char *page)
{
	int ret;

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		ret = sprintf(page, "journal\n");
	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
		ret = sprintf(page, "ppl\n");
	} else if (mddev->bitmap) {
		ret = sprintf(page, "bitmap\n");
	} else if (mddev->pers) {
		if (mddev->pers->sync_request)
			ret = sprintf(page, "resync\n");
		else
			ret = sprintf(page, "none\n");
	} else {
		ret = sprintf(page, "unknown\n");
	}

	return ret;
}

static ssize_t
consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err = 0;

	if (mddev->pers) {
		if (mddev->pers->change_consistency_policy)
			err = mddev->pers->change_consistency_policy(mddev, buf);
		else
			err = -EBUSY;
	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
		set_bit(MD_HAS_PPL, &mddev->flags);
	} else {
		err = -EINVAL;
	}

	return err ? err : len;
}

static struct md_sysfs_entry md_consistency_policy =
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
       consistency_policy_store);
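/*
 * Illustrative usage (not part of this file): on a running array the
 * request is forwarded to the personality's change_consistency_policy
 * method (for example, raid5 accepts a switch to "ppl"):
 *
 *     cat /sys/block/md0/md/consistency_policy
 *     echo ppl > /sys/block/md0/md/consistency_policy
 *
 * For an inactive, externally managed array only "ppl" is accepted here.
 */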

static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d\n", mddev->fail_last_dev);
}

/*
 * Setting fail_last_dev to true to allow last device to be forcibly removed
 * from RAID1/RAID10.
 */
static ssize_t
fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
	int ret;
	bool value;

	ret = kstrtobool(buf, &value);
	if (ret)
		return ret;

	if (value != mddev->fail_last_dev)
		mddev->fail_last_dev = value;

	return len;
}
static struct md_sysfs_entry md_fail_last_dev =
__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
       fail_last_dev_store);

static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
{
	if (mddev->pers == NULL || (mddev->pers->level != 1))
		return sprintf(page, "n/a\n");
	else
		return sprintf(page, "%d\n", mddev->serialize_policy);
}

/*
 * Setting serialize_policy to true to enforce write IO is not reordered
 * for raid1.
 */
static ssize_t
serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err;
	bool value;

	err = kstrtobool(buf, &value);
	if (err)
		return err;

	if (value == mddev->serialize_policy)
		return len;

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->pers == NULL || (mddev->pers->level != 1)) {
		pr_err("md: serialize_policy is only effective for raid1\n");
		err = -EINVAL;
		goto unlock;
	}

	mddev_suspend(mddev);
	if (value)
		mddev_create_serial_pool(mddev, NULL, true);
	else
		mddev_destroy_serial_pool(mddev, NULL, true);
	mddev->serialize_policy = value;
	mddev_resume(mddev);
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_serialize_policy =
__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
       serialize_policy_store);
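/*
 * serialize_policy only applies to raid1 (checked above).  When it is
 * enabled a serial info pool is attached to every rdev so write requests
 * to overlapping regions are not reordered; disabling it tears the pool
 * down again.
 */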


static struct attribute *md_default_attrs[] = {
	&md_level.attr,
	&md_layout.attr,
	&md_raid_disks.attr,
	&md_chunk_size.attr,
	&md_size.attr,
	&md_resync_start.attr,
	&md_metadata.attr,
	&md_new_device.attr,
	&md_safe_delay.attr,
	&md_array_state.attr,
	&md_reshape_position.attr,
	&md_reshape_direction.attr,
	&md_array_size.attr,
	&max_corr_read_errors.attr,
	&md_consistency_policy.attr,
	&md_fail_last_dev.attr,
	&md_serialize_policy.attr,
	NULL,
};

static struct attribute *md_redundancy_attrs[] = {
	&md_scan_mode.attr,
	&md_last_scan_mode.attr,
	&md_mismatches.attr,
	&md_sync_min.attr,
	&md_sync_max.attr,
	&md_sync_speed.attr,
	&md_sync_force_parallel.attr,
	&md_sync_completed.attr,
	&md_min_sync.attr,
	&md_max_sync.attr,
	&md_suspend_lo.attr,
	&md_suspend_hi.attr,
	&md_bitmap.attr,
	&md_degraded.attr,
	NULL,
};
static struct attribute_group md_redundancy_group = {
	.name = NULL,
	.attrs = md_redundancy_attrs,
};

static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->show)
		return -EIO;
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);

	rv = entry->show(mddev, page);
	mddev_put(mddev);
	return rv;
}

static ssize_t
md_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->store)
		return -EIO;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);
	rv = entry->store(mddev, page, length);
	mddev_put(mddev);
	return rv;
}
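
/*
 * Both wrappers above pin the mddev (all_mddevs_lock + mddev_get) before
 * calling into the attribute handler, so a concurrent array removal
 * cannot free the mddev while a sysfs read or write is still running.
 */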

static void md_free(struct kobject *ko)
{
	struct mddev *mddev = container_of(ko, struct mddev, kobj);

	if (mddev->sysfs_state)
		sysfs_put(mddev->sysfs_state);

	if (mddev->gendisk)
		del_gendisk(mddev->gendisk);
	if (mddev->queue)
		blk_cleanup_queue(mddev->queue);
	if (mddev->gendisk)
		put_disk(mddev->gendisk);
	percpu_ref_exit(&mddev->writes_pending);

	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
	kfree(mddev);
}

static const struct sysfs_ops md_sysfs_ops = {
	.show	= md_attr_show,
	.store	= md_attr_store,
};
static struct kobj_type md_ktype = {
	.release	= md_free,
	.sysfs_ops	= &md_sysfs_ops,
	.default_attrs	= md_default_attrs,
};
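
/*
 * The mddev is embedded in this kobject; md_free() is its release
 * method, so the gendisk, request queue, percpu counter and bio sets are
 * only torn down once the last kobject/sysfs reference to the array has
 * been dropped.
 */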

int mdp_major = 0;

static void mddev_delayed_delete(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);

	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
	kobject_del(&mddev->kobj);
	kobject_put(&mddev->kobj);
}

static void no_op(struct percpu_ref *r) {}

int mddev_init_writes_pending(struct mddev *mddev)
{
	if (mddev->writes_pending.percpu_count_ptr)
		return 0;
	if (percpu_ref_init(&mddev->writes_pending, no_op,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
		return -ENOMEM;
	/* We want to start with the refcount at zero */
	percpu_ref_put(&mddev->writes_pending);
	return 0;
}
EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
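
/*
 * writes_pending is a percpu refcount used to track in-flight writes:
 * md_write_start() takes a reference before a write is issued and
 * md_write_end() drops it, so the array can only be marked clean
 * ("in_sync") again once the count has drained back to zero.
 */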

static int md_alloc(dev_t dev, char *name)
{
	/*
	 * If dev is zero, name is the name of a device to allocate with
	 * an arbitrary minor number.  It will be "md_???"
	 * If dev is non-zero it must be a device number with a MAJOR of
	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
	 * the device is being created by opening a node in /dev.
	 * If "name" is not NULL, the device is being created by
	 * writing to /sys/module/md_mod/parameters/new_array.
	 */
	static DEFINE_MUTEX(disks_mutex);
	struct mddev *mddev = mddev_find(dev);
	struct gendisk *disk;
	int partitioned;
	int shift;
	int unit;
	int error;

	if (!mddev)
		return -ENODEV;

	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
	shift = partitioned ? MdpMinorShift : 0;
	unit = MINOR(mddev->unit) >> shift;

	/* wait for any previous instance of this device to be
	 * completely removed (mddev_delayed_delete).
	 */
	flush_workqueue(md_misc_wq);

	mutex_lock(&disks_mutex);
	error = -EEXIST;
	if (mddev->gendisk)
		goto abort;

	if (name && !dev) {
		/* Need to ensure that 'name' is not a duplicate.
		 */
		struct mddev *mddev2;
		spin_lock(&all_mddevs_lock);

		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
			if (mddev2->gendisk &&
			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
				spin_unlock(&all_mddevs_lock);
				goto abort;
			}
		spin_unlock(&all_mddevs_lock);
	}
	if (name && dev)
		/*
		 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
		 */
		mddev->hold_active = UNTIL_STOP;

	error = -ENOMEM;
	mddev->queue = blk_alloc_queue(GFP_KERNEL);
	if (!mddev->queue)
		goto abort;
	mddev->queue->queuedata = mddev;

	blk_queue_make_request(mddev->queue, md_make_request);
	blk_set_stacking_limits(&mddev->queue->limits);

	disk = alloc_disk(1 << shift);
	if (!disk) {
		blk_cleanup_queue(mddev->queue);
		mddev->queue = NULL;
		goto abort;
	}
	disk->major = MAJOR(mddev->unit);
	disk->first_minor = unit << shift;
	if (name)
		strcpy(disk->disk_name, name);
	else if (partitioned)
		sprintf(disk->disk_name, "md_d%d", unit);
	else
		sprintf(disk->disk_name, "md%d", unit);
	disk->fops = &md_fops;
	disk->private_data = mddev;
	disk->queue = mddev->queue;
	blk_queue_write_cache(mddev->queue, true, true);
	/* Allow extended partitions.  This makes the
	 * 'mdp' device redundant, but we can't really
	 * remove it now.
	 */
	disk->flags |= GENHD_FL_EXT_DEVT;
	mddev->gendisk = disk;
	/* As soon as we call add_disk(), another thread could get
	 * through to md_open, so make sure it doesn't get too far
	 */
	mutex_lock(&mddev->open_mutex);
	add_disk(disk);

	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
	if (error) {
		/* This isn't possible, but as kobject_init_and_add is marked
		 * __must_check, we must do something with the result
		 */
		pr_debug("md: cannot register %s/md - name in use\n",
			 disk->disk_name);
		error = 0;
	}
	if (mddev->kobj.sd &&
	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
		pr_debug("pointless warning\n");
	mutex_unlock(&mddev->open_mutex);
 abort:
	mutex_unlock(&disks_mutex);
	if (!error && mddev->kobj.sd) {
		kobject_uevent(&mddev->kobj, KOBJ_ADD);
		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
	}
	mddev_put(mddev);
	return error;
}

static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
	if (create_on_open)
		md_alloc(dev, NULL);
	return NULL;
}

static int add_named_array(const char *val, const struct kernel_param *kp)
{
	/*
	 * val must be "md_*" or "mdNNN".
	 * For "md_*" we allocate an array with a large free minor number, and
	 * set the name to val.  val must not already be an active name.
	 * For "mdNNN" we allocate an array with the minor number NNN
	 * which must not already be in use.
	 */
	int len = strlen(val);
	char buf[DISK_NAME_LEN];
	unsigned long devnum;

	while (len && val[len-1] == '\n')
		len--;
	if (len >= DISK_NAME_LEN)
		return -E2BIG;
	strlcpy(buf, val, len+1);
	if (strncmp(buf, "md_", 3) == 0)
		return md_alloc(0, buf);
	if (strncmp(buf, "md", 2) == 0 &&
	    isdigit(buf[2]) &&
	    kstrtoul(buf+2, 10, &devnum) == 0 &&
	    devnum <= MINORMASK)
		return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);

	return -EINVAL;
}
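
/*
 * Illustrative usage (not part of this file): this handler backs the
 * "new_array" module parameter, so a named or numbered array can be
 * pre-created from userspace:
 *
 *     echo md_home > /sys/module/md_mod/parameters/new_array
 *     echo md127   > /sys/module/md_mod/parameters/new_array
 */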

static void md_safemode_timeout(struct timer_list *t)
{
	struct mddev *mddev = from_timer(mddev, t, safemode_timer);

	mddev->safemode = 1;
	if (mddev->external)
		sysfs_notify_dirent_safe(mddev->sysfs_state);

	md_wakeup_thread(mddev->thread);
}

static int start_dirty_degraded;

int md_run(struct mddev *mddev)
L
Linus Torvalds 已提交
5735
{
5736
	int err;
5737
	struct md_rdev *rdev;
5738
	struct md_personality *pers;
L
Linus Torvalds 已提交
5739

5740 5741
	if (list_empty(&mddev->disks))
		/* cannot run an array with no devices.. */
L
Linus Torvalds 已提交
5742 5743 5744 5745
		return -EINVAL;

	if (mddev->pers)
		return -EBUSY;
5746 5747 5748
	/* Cannot run until previous stop completes properly */
	if (mddev->sysfs_active)
		return -EBUSY;
5749

L
Linus Torvalds 已提交
5750 5751 5752
	/*
	 * Analyze all RAID superblock(s)
	 */
5753 5754 5755
	if (!mddev->raid_disks) {
		if (!mddev->persistent)
			return -EINVAL;
5756 5757 5758
		err = analyze_sbs(mddev);
		if (err)
			return -EINVAL;
5759
	}
L
Linus Torvalds 已提交
5760

5761 5762 5763 5764
	if (mddev->level != LEVEL_NONE)
		request_module("md-level-%d", mddev->level);
	else if (mddev->clevel[0])
		request_module("md-%s", mddev->clevel);
L
Linus Torvalds 已提交
5765 5766 5767 5768 5769 5770

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 */
5771
	mddev->has_superblocks = false;
N
NeilBrown 已提交
5772
	rdev_for_each(rdev, mddev) {
5773
		if (test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
5774 5775
			continue;
		sync_blockdev(rdev->bdev);
5776
		invalidate_bdev(rdev->bdev);
5777 5778 5779 5780 5781 5782 5783
		if (mddev->ro != 1 &&
		    (bdev_read_only(rdev->bdev) ||
		     bdev_read_only(rdev->meta_bdev))) {
			mddev->ro = 1;
			if (mddev->gendisk)
				set_disk_ro(mddev->gendisk, 1);
		}
5784

5785 5786 5787
		if (rdev->sb_page)
			mddev->has_superblocks = true;

5788 5789
		/* perform some consistency tests on the device.
		 * We don't want the data to overlap the metadata,
A
Andre Noll 已提交
5790
		 * Internal Bitmap issues have been handled elsewhere.
5791
		 */
5792 5793 5794
		if (rdev->meta_bdev) {
			/* Nothing to check */;
		} else if (rdev->data_offset < rdev->sb_start) {
A
Andre Noll 已提交
5795 5796
			if (mddev->dev_sectors &&
			    rdev->data_offset + mddev->dev_sectors
5797
			    > rdev->sb_start) {
5798 5799
				pr_warn("md: %s: data overlaps metadata\n",
					mdname(mddev));
5800 5801 5802
				return -EINVAL;
			}
		} else {
5803
			if (rdev->sb_start + rdev->sb_size/512
5804
			    > rdev->data_offset) {
5805 5806
				pr_warn("md: %s: metadata overlaps data\n",
					mdname(mddev));
5807 5808 5809
				return -EINVAL;
			}
		}
N
NeilBrown 已提交
5810
		sysfs_notify_dirent_safe(rdev->sysfs_state);
L
Linus Torvalds 已提交
5811 5812
	}

5813 5814 5815 5816
	if (!bioset_initialized(&mddev->bio_set)) {
		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
		if (err)
			return err;
5817
	}
5818 5819 5820
	if (!bioset_initialized(&mddev->sync_set)) {
		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
		if (err)
5821
			return err;
5822
	}
5823

L
Linus Torvalds 已提交
5824
	spin_lock(&pers_lock);
5825
	pers = find_pers(mddev->level, mddev->clevel);
5826
	if (!pers || !try_module_get(pers->owner)) {
L
Linus Torvalds 已提交
5827
		spin_unlock(&pers_lock);
5828
		if (mddev->level != LEVEL_NONE)
5829 5830
			pr_warn("md: personality for level %d is not loaded!\n",
				mddev->level);
5831
		else
5832 5833
			pr_warn("md: personality for level %s is not loaded!\n",
				mddev->clevel);
S
Shaohua Li 已提交
5834 5835
		err = -EINVAL;
		goto abort;
L
Linus Torvalds 已提交
5836 5837
	}
	spin_unlock(&pers_lock);
5838 5839 5840 5841
	if (mddev->level != pers->level) {
		mddev->level = pers->level;
		mddev->new_level = pers->level;
	}
5842
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
L
Linus Torvalds 已提交
5843

5844
	if (mddev->reshape_position != MaxSector &&
5845
	    pers->start_reshape == NULL) {
5846 5847
		/* This personality cannot handle reshaping... */
		module_put(pers->owner);
S
Shaohua Li 已提交
5848 5849
		err = -EINVAL;
		goto abort;
5850 5851
	}

5852 5853 5854 5855 5856
	if (pers->sync_request) {
		/* Warn if this is a potentially silly
		 * configuration.
		 */
		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5857
		struct md_rdev *rdev2;
5858
		int warned = 0;
5859

N
NeilBrown 已提交
5860 5861
		rdev_for_each(rdev, mddev)
			rdev_for_each(rdev2, mddev) {
5862 5863 5864
				if (rdev < rdev2 &&
				    rdev->bdev->bd_contains ==
				    rdev2->bdev->bd_contains) {
5865 5866 5867 5868
					pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
						mdname(mddev),
						bdevname(rdev->bdev,b),
						bdevname(rdev2->bdev,b2));
5869 5870 5871
					warned = 1;
				}
			}
5872

5873
		if (warned)
5874
			pr_warn("True protection against single-disk failure might be compromised.\n");
5875 5876
	}

5877
	mddev->recovery = 0;
A
Andre Noll 已提交
5878 5879 5880
	/* may be over-ridden by personality */
	mddev->resync_max_sectors = mddev->dev_sectors;

5881
	mddev->ok_start_degraded = start_dirty_degraded;
L
Linus Torvalds 已提交
5882

5883
	if (start_readonly && mddev->ro == 0)
5884 5885
		mddev->ro = 2; /* read-only, but switch on first write */

5886
	err = pers->run(mddev);
5887
	if (err)
5888
		pr_warn("md: pers->run() failed ...\n");
5889
	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5890 5891 5892 5893 5894 5895
		WARN_ONCE(!mddev->external_size,
			  "%s: default size too small, but 'external_size' not in effect?\n",
			  __func__);
		pr_warn("md: invalid array_size %llu > default size %llu\n",
			(unsigned long long)mddev->array_sectors / 2,
			(unsigned long long)pers->size(mddev, 0, 0) / 2);
D
Dan Williams 已提交
5896 5897
		err = -EINVAL;
	}
5898
	if (err == 0 && pers->sync_request &&
5899
	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5900 5901
		struct bitmap *bitmap;

5902
		bitmap = md_bitmap_create(mddev, -1);
5903 5904
		if (IS_ERR(bitmap)) {
			err = PTR_ERR(bitmap);
5905 5906
			pr_warn("%s: failed to create bitmap (%d)\n",
				mdname(mddev), err);
5907 5908 5909
		} else
			mddev->bitmap = bitmap;

5910
	}
5911 5912
	if (err)
		goto bitmap_abort;
5913 5914

	if (mddev->bitmap_info.max_write_behind > 0) {
G
Guoqing Jiang 已提交
5915
		bool create_pool = false;
5916 5917 5918

		rdev_for_each(rdev, mddev) {
			if (test_bit(WriteMostly, &rdev->flags) &&
G
Guoqing Jiang 已提交
5919
			    rdev_init_serial(rdev))
G
Guoqing Jiang 已提交
5920
				create_pool = true;
5921
		}
G
Guoqing Jiang 已提交
5922
		if (create_pool && mddev->serial_info_pool == NULL) {
G
Guoqing Jiang 已提交
5923 5924 5925 5926
			mddev->serial_info_pool =
				mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						    sizeof(struct serial_info));
			if (!mddev->serial_info_pool) {
5927
				err = -ENOMEM;
5928
				goto bitmap_abort;
5929 5930 5931 5932
			}
		}
	}

5933
	if (mddev->queue) {
S
Shaohua Li 已提交
5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945
		bool nonrot = true;

		rdev_for_each(rdev, mddev) {
			if (rdev->raid_disk >= 0 &&
			    !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
				nonrot = false;
				break;
			}
		}
		if (mddev->degraded)
			nonrot = false;
		if (nonrot)
5946
			blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
S
Shaohua Li 已提交
5947
		else
5948
			blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
5949 5950
		mddev->queue->backing_dev_info->congested_data = mddev;
		mddev->queue->backing_dev_info->congested_fn = md_congested;
5951
	}
5952
	if (pers->sync_request) {
N
NeilBrown 已提交
5953 5954
		if (mddev->kobj.sd &&
		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5955 5956
			pr_warn("md: cannot register extra attributes for %s\n",
				mdname(mddev));
N
NeilBrown 已提交
5957
		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5958
	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
5959 5960
		mddev->ro = 0;

5961 5962
	atomic_set(&mddev->max_corr_read_errors,
		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
L
Linus Torvalds 已提交
5963
	mddev->safemode = 0;
5964 5965 5966 5967
	if (mddev_is_clustered(mddev))
		mddev->safemode_delay = 0;
	else
		mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
L
Linus Torvalds 已提交
5968
	mddev->in_sync = 1;
5969
	smp_wmb();
5970 5971 5972
	spin_lock(&mddev->lock);
	mddev->pers = pers;
	spin_unlock(&mddev->lock);
N
NeilBrown 已提交
5973
	rdev_for_each(rdev, mddev)
5974
		if (rdev->raid_disk >= 0)
5975
			sysfs_link_rdev(mddev, rdev); /* failure here is OK */
5976

5977 5978 5979 5980 5981
	if (mddev->degraded && !mddev->ro)
		/* This ensures that recovering status is reported immediately
		 * via sysfs - until a lack of spares is confirmed.
		 */
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
L
Linus Torvalds 已提交
5982
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5983

5984
	if (mddev->sb_flags)
5985
		md_update_sb(mddev, 0);
L
Linus Torvalds 已提交
5986

5987
	md_new_event(mddev);
L
Linus Torvalds 已提交
5988
	return 0;
X
Xiao Ni 已提交
5989

5990 5991 5992 5993 5994 5995 5996
bitmap_abort:
	mddev_detach(mddev);
	if (mddev->private)
		pers->free(mddev, mddev->private);
	mddev->private = NULL;
	module_put(pers->owner);
	md_bitmap_destroy(mddev);
X
Xiao Ni 已提交
5997
abort:
5998 5999
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
X
Xiao Ni 已提交
6000
	return err;
L
Linus Torvalds 已提交
6001
}
6002
EXPORT_SYMBOL_GPL(md_run);
L
Linus Torvalds 已提交
6003

6004
static int do_md_run(struct mddev *mddev)
6005 6006 6007
{
	int err;

6008
	set_bit(MD_NOT_READY, &mddev->flags);
6009 6010 6011
	err = md_run(mddev);
	if (err)
		goto out;
6012
	err = md_bitmap_load(mddev);
6013
	if (err) {
6014
		md_bitmap_destroy(mddev);
6015 6016
		goto out;
	}
6017

6018 6019 6020
	if (mddev_is_clustered(mddev))
		md_allow_write(mddev);

6021 6022 6023
	/* run start up tasks that require md_thread */
	md_start(mddev);

6024 6025 6026
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

6027 6028
	set_capacity(mddev->gendisk, mddev->array_sectors);
	revalidate_disk(mddev->gendisk);
6029
	clear_bit(MD_NOT_READY, &mddev->flags);
6030
	mddev->changed = 1;
6031
	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6032 6033 6034
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	sysfs_notify(&mddev->kobj, NULL, "degraded");
6035
out:
6036
	clear_bit(MD_NOT_READY, &mddev->flags);
6037 6038 6039
	return err;
}

6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054
int md_start(struct mddev *mddev)
{
	int ret = 0;

	if (mddev->pers->start) {
		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		ret = mddev->pers->start(mddev);
		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
		md_wakeup_thread(mddev->sync_thread);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(md_start);
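
/*
 * md_start() runs any personality start-up work (for example journal
 * recovery in raid5 with a write journal) on the array's own thread;
 * MD_RECOVERY_WAIT keeps a resync from starting until ->start() has
 * finished.
 */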

6055
static int restart_array(struct mddev *mddev)
L
Linus Torvalds 已提交
6056 6057
{
	struct gendisk *disk = mddev->gendisk;
6058 6059 6060
	struct md_rdev *rdev;
	bool has_journal = false;
	bool has_readonly = false;
L
Linus Torvalds 已提交
6061

A
Andre Noll 已提交
6062
	/* Complain if it has no devices */
L
Linus Torvalds 已提交
6063
	if (list_empty(&mddev->disks))
A
Andre Noll 已提交
6064 6065 6066 6067 6068
		return -ENXIO;
	if (!mddev->pers)
		return -EINVAL;
	if (!mddev->ro)
		return -EBUSY;
6069

6070 6071 6072 6073 6074 6075 6076 6077 6078 6079
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		if (test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			has_journal = true;
		if (bdev_read_only(rdev->bdev))
			has_readonly = true;
	}
	rcu_read_unlock();
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6080 6081
		/* Don't restart rw with journal missing/faulty */
			return -EINVAL;
6082 6083
	if (has_readonly)
		return -EROFS;
6084

A
Andre Noll 已提交
6085 6086 6087
	mddev->safemode = 0;
	mddev->ro = 0;
	set_disk_ro(disk, 0);
6088
	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
A
Andre Noll 已提交
6089 6090 6091 6092
	/* Kick recovery or resync if necessary */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread);
N
NeilBrown 已提交
6093
	sysfs_notify_dirent_safe(mddev->sysfs_state);
A
Andre Noll 已提交
6094
	return 0;
L
Linus Torvalds 已提交
6095 6096
}

6097
static void md_clean(struct mddev *mddev)
N
NeilBrown 已提交
6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111
{
	mddev->array_sectors = 0;
	mddev->external_size = 0;
	mddev->dev_sectors = 0;
	mddev->raid_disks = 0;
	mddev->recovery_cp = 0;
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->reshape_position = MaxSector;
	mddev->external = 0;
	mddev->persistent = 0;
	mddev->level = LEVEL_NONE;
	mddev->clevel[0] = 0;
	mddev->flags = 0;
6112
	mddev->sb_flags = 0;
N
NeilBrown 已提交
6113 6114 6115 6116 6117 6118 6119
	mddev->ro = 0;
	mddev->metadata_type[0] = 0;
	mddev->chunk_sectors = 0;
	mddev->ctime = mddev->utime = 0;
	mddev->layout = 0;
	mddev->max_disks = 0;
	mddev->events = 0;
6120
	mddev->can_decrease_events = 0;
N
NeilBrown 已提交
6121
	mddev->delta_disks = 0;
6122
	mddev->reshape_backwards = 0;
N
NeilBrown 已提交
6123 6124 6125 6126
	mddev->new_level = LEVEL_NONE;
	mddev->new_layout = 0;
	mddev->new_chunk_sectors = 0;
	mddev->curr_resync = 0;
6127
	atomic64_set(&mddev->resync_mismatches, 0);
N
NeilBrown 已提交
6128 6129 6130 6131
	mddev->suspend_lo = mddev->suspend_hi = 0;
	mddev->sync_speed_min = mddev->sync_speed_max = 0;
	mddev->recovery = 0;
	mddev->in_sync = 0;
6132
	mddev->changed = 0;
N
NeilBrown 已提交
6133 6134
	mddev->degraded = 0;
	mddev->safemode = 0;
6135
	mddev->private = NULL;
6136
	mddev->cluster_info = NULL;
N
NeilBrown 已提交
6137 6138
	mddev->bitmap_info.offset = 0;
	mddev->bitmap_info.default_offset = 0;
6139
	mddev->bitmap_info.default_space = 0;
N
NeilBrown 已提交
6140 6141 6142
	mddev->bitmap_info.chunksize = 0;
	mddev->bitmap_info.daemon_sleep = 0;
	mddev->bitmap_info.max_write_behind = 0;
6143
	mddev->bitmap_info.nodes = 0;
N
NeilBrown 已提交
6144 6145
}

6146
static void __md_stop_writes(struct mddev *mddev)
6147
{
6148
	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6149
	flush_workqueue(md_misc_wq);
6150 6151
	if (mddev->sync_thread) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6152
		md_reap_sync_thread(mddev);
6153 6154 6155 6156
	}

	del_timer_sync(&mddev->safemode_timer);

6157 6158 6159 6160
	if (mddev->pers && mddev->pers->quiesce) {
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
6161
	md_bitmap_flush(mddev);
6162

6163
	if (mddev->ro == 0 &&
6164
	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6165
	     mddev->sb_flags)) {
6166
		/* mark array as shutdown cleanly */
6167 6168
		if (!mddev_is_clustered(mddev))
			mddev->in_sync = 1;
6169 6170
		md_update_sb(mddev, 1);
	}
6171 6172 6173
	/* disable policy to guarantee rdevs free resources for serialization */
	mddev->serialize_policy = 0;
	mddev_destroy_serial_pool(mddev, NULL, true);
6174
}
6175

6176
void md_stop_writes(struct mddev *mddev)
6177
{
6178
	mddev_lock_nointr(mddev);
6179 6180 6181
	__md_stop_writes(mddev);
	mddev_unlock(mddev);
}
6182
EXPORT_SYMBOL_GPL(md_stop_writes);
6183

6184 6185
static void mddev_detach(struct mddev *mddev)
{
6186
	md_bitmap_wait_behind_writes(mddev);
6187
	if (mddev->pers && mddev->pers->quiesce) {
6188 6189 6190 6191 6192 6193 6194 6195
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	md_unregister_thread(&mddev->thread);
	if (mddev->queue)
		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
}

6196
static void __md_stop(struct mddev *mddev)
N
NeilBrown 已提交
6197
{
6198
	struct md_personality *pers = mddev->pers;
6199
	md_bitmap_destroy(mddev);
6200
	mddev_detach(mddev);
6201 6202
	/* Ensure ->event_work is done */
	flush_workqueue(md_misc_wq);
6203
	spin_lock(&mddev->lock);
N
NeilBrown 已提交
6204
	mddev->pers = NULL;
6205 6206
	spin_unlock(&mddev->lock);
	pers->free(mddev, mddev->private);
6207
	mddev->private = NULL;
6208 6209 6210
	if (pers->sync_request && mddev->to_remove == NULL)
		mddev->to_remove = &md_redundancy_group;
	module_put(pers->owner);
N
NeilBrown 已提交
6211
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
J
Jack Wang 已提交
6212 6213 6214 6215 6216 6217 6218 6219
}

void md_stop(struct mddev *mddev)
{
	/* stop the array and free an attached data structures.
	 * This is called from dm-raid
	 */
	__md_stop(mddev);
6220 6221
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
6222 6223
}

6224
EXPORT_SYMBOL_GPL(md_stop);
N
NeilBrown 已提交
6225

6226
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
6227 6228
{
	int err = 0;
6229 6230 6231 6232 6233 6234 6235
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
6236
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6237
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6238
	if (mddev->sync_thread)
6239 6240 6241
		/* Thread might be blocked waiting for metadata update
		 * which will now never happen */
		wake_up_process(mddev->sync_thread->tsk);
6242

6243
	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6244
		return -EBUSY;
6245
	mddev_unlock(mddev);
6246 6247
	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
					  &mddev->recovery));
6248
	wait_event(mddev->sb_wait,
6249
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6250 6251
	mddev_lock_nointr(mddev);

6252
	mutex_lock(&mddev->open_mutex);
6253
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6254
	    mddev->sync_thread ||
6255
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6256
		pr_warn("md: %s still in use.\n",mdname(mddev));
6257 6258
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6259
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6260 6261
			md_wakeup_thread(mddev->thread);
		}
6262 6263 6264 6265
		err = -EBUSY;
		goto out;
	}
	if (mddev->pers) {
6266
		__md_stop_writes(mddev);
6267 6268 6269 6270 6271 6272 6273

		err  = -ENXIO;
		if (mddev->ro==1)
			goto out;
		mddev->ro = 1;
		set_disk_ro(mddev->gendisk, 1);
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6274 6275
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
N
NeilBrown 已提交
6276
		sysfs_notify_dirent_safe(mddev->sysfs_state);
6277
		err = 0;
6278 6279 6280 6281 6282 6283
	}
out:
	mutex_unlock(&mddev->open_mutex);
	return err;
}

6284 6285 6286 6287
/* mode:
 *   0 - completely stop and dis-assemble array
 *   2 - stop but do not disassemble array
 */
6288
static int do_md_stop(struct mddev *mddev, int mode,
6289
		      struct block_device *bdev)
L
Linus Torvalds 已提交
6290 6291
{
	struct gendisk *disk = mddev->gendisk;
6292
	struct md_rdev *rdev;
6293 6294 6295 6296 6297 6298 6299
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
6300
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6301
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6302
	if (mddev->sync_thread)
6303 6304 6305
		/* Thread might be blocked waiting for metadata update
		 * which will now never happen */
		wake_up_process(mddev->sync_thread->tsk);
6306

6307
	mddev_unlock(mddev);
6308 6309 6310
	wait_event(resync_wait, (mddev->sync_thread == NULL &&
				 !test_bit(MD_RECOVERY_RUNNING,
					   &mddev->recovery)));
6311
	mddev_lock_nointr(mddev);
L
Linus Torvalds 已提交
6312

N
NeilBrown 已提交
6313
	mutex_lock(&mddev->open_mutex);
6314
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6315 6316
	    mddev->sysfs_active ||
	    mddev->sync_thread ||
6317
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6318
		pr_warn("md: %s still in use.\n",mdname(mddev));
N
NeilBrown 已提交
6319
		mutex_unlock(&mddev->open_mutex);
6320 6321
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6322
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6323 6324
			md_wakeup_thread(mddev->thread);
		}
6325 6326
		return -EBUSY;
	}
N
NeilBrown 已提交
6327
	if (mddev->pers) {
6328 6329
		if (mddev->ro)
			set_disk_ro(disk, 0);
6330

6331
		__md_stop_writes(mddev);
6332
		__md_stop(mddev);
6333
		mddev->queue->backing_dev_info->congested_fn = NULL;
N
NeilBrown 已提交
6334

6335
		/* tell userspace to handle 'inactive' */
N
NeilBrown 已提交
6336
		sysfs_notify_dirent_safe(mddev->sysfs_state);
6337

N
NeilBrown 已提交
6338
		rdev_for_each(rdev, mddev)
6339 6340
			if (rdev->raid_disk >= 0)
				sysfs_unlink_rdev(mddev, rdev);
6341

6342
		set_capacity(disk, 0);
N
NeilBrown 已提交
6343
		mutex_unlock(&mddev->open_mutex);
6344
		mddev->changed = 1;
6345
		revalidate_disk(disk);
6346

6347 6348
		if (mddev->ro)
			mddev->ro = 0;
N
NeilBrown 已提交
6349 6350
	} else
		mutex_unlock(&mddev->open_mutex);
L
Linus Torvalds 已提交
6351 6352 6353
	/*
	 * Free resources if final stop
	 */
6354
	if (mode == 0) {
6355
		pr_info("md: %s stopped.\n", mdname(mddev));
L
Linus Torvalds 已提交
6356

6357
		if (mddev->bitmap_info.file) {
6358 6359
			struct file *f = mddev->bitmap_info.file;
			spin_lock(&mddev->lock);
6360
			mddev->bitmap_info.file = NULL;
6361 6362
			spin_unlock(&mddev->lock);
			fput(f);
6363
		}
6364
		mddev->bitmap_info.offset = 0;
6365

L
Linus Torvalds 已提交
6366 6367
		export_array(mddev);

N
NeilBrown 已提交
6368
		md_clean(mddev);
6369 6370
		if (mddev->hold_active == UNTIL_STOP)
			mddev->hold_active = 0;
6371
	}
6372
	md_new_event(mddev);
N
NeilBrown 已提交
6373
	sysfs_notify_dirent_safe(mddev->sysfs_state);
N
NeilBrown 已提交
6374
	return 0;
L
Linus Torvalds 已提交
6375 6376
}

J
Jeff Garzik 已提交
6377
#ifndef MODULE
6378
static void autorun_array(struct mddev *mddev)
L
Linus Torvalds 已提交
6379
{
6380
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6381 6382
	int err;

6383
	if (list_empty(&mddev->disks))
L
Linus Torvalds 已提交
6384 6385
		return;

6386
	pr_info("md: running: ");
L
Linus Torvalds 已提交
6387

N
NeilBrown 已提交
6388
	rdev_for_each(rdev, mddev) {
L
Linus Torvalds 已提交
6389
		char b[BDEVNAME_SIZE];
6390
		pr_cont("<%s>", bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
6391
	}
6392
	pr_cont("\n");
L
Linus Torvalds 已提交
6393

6394
	err = do_md_run(mddev);
L
Linus Torvalds 已提交
6395
	if (err) {
6396
		pr_warn("md: do_md_run() returned %d\n", err);
6397
		do_md_stop(mddev, 0, NULL);
L
Linus Torvalds 已提交
6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414
	}
}

/*
 * lets try to run arrays based on all disks that have arrived
 * until now. (those are in pending_raid_disks)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(int part)
{
6415
	struct md_rdev *rdev0, *rdev, *tmp;
6416
	struct mddev *mddev;
L
Linus Torvalds 已提交
6417 6418
	char b[BDEVNAME_SIZE];

6419
	pr_info("md: autorun ...\n");
L
Linus Torvalds 已提交
6420
	while (!list_empty(&pending_raid_disks)) {
6421
		int unit;
L
Linus Torvalds 已提交
6422
		dev_t dev;
6423
		LIST_HEAD(candidates);
L
Linus Torvalds 已提交
6424
		rdev0 = list_entry(pending_raid_disks.next,
6425
					 struct md_rdev, same_set);
L
Linus Torvalds 已提交
6426

6427
		pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
L
Linus Torvalds 已提交
6428
		INIT_LIST_HEAD(&candidates);
6429
		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
L
Linus Torvalds 已提交
6430
			if (super_90_load(rdev, rdev0, 0) >= 0) {
6431 6432
				pr_debug("md:  adding %s ...\n",
					 bdevname(rdev->bdev,b));
L
Linus Torvalds 已提交
6433 6434 6435 6436 6437 6438 6439
				list_move(&rdev->same_set, &candidates);
			}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
6440 6441 6442 6443 6444 6445 6446 6447 6448
		if (part) {
			dev = MKDEV(mdp_major,
				    rdev0->preferred_minor << MdpMinorShift);
			unit = MINOR(dev) >> MdpMinorShift;
		} else {
			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
			unit = MINOR(dev);
		}
		if (rdev0->preferred_minor != unit) {
6449 6450
			pr_warn("md: unit number in %s is bad: %d\n",
				bdevname(rdev0->bdev, b), rdev0->preferred_minor);
L
Linus Torvalds 已提交
6451 6452 6453 6454 6455
			break;
		}

		md_probe(dev, NULL, NULL);
		mddev = mddev_find(dev);
N
Neil Brown 已提交
6456 6457 6458
		if (!mddev || !mddev->gendisk) {
			if (mddev)
				mddev_put(mddev);
L
Linus Torvalds 已提交
6459 6460
			break;
		}
6461
		if (mddev_lock(mddev))
6462
			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
L
Linus Torvalds 已提交
6463 6464
		else if (mddev->raid_disks || mddev->major_version
			 || !list_empty(&mddev->disks)) {
6465
			pr_warn("md: %s already running, cannot run %s\n",
L
Linus Torvalds 已提交
6466 6467 6468
				mdname(mddev), bdevname(rdev0->bdev,b));
			mddev_unlock(mddev);
		} else {
6469
			pr_debug("md: created %s\n", mdname(mddev));
6470
			mddev->persistent = 1;
6471
			rdev_for_each_list(rdev, tmp, &candidates) {
L
Linus Torvalds 已提交
6472 6473 6474 6475 6476 6477 6478 6479 6480 6481
				list_del_init(&rdev->same_set);
				if (bind_rdev_to_array(rdev, mddev))
					export_rdev(rdev);
			}
			autorun_array(mddev);
			mddev_unlock(mddev);
		}
		/* on success, candidates will be empty, on error
		 * it won't...
		 */
6482
		rdev_for_each_list(rdev, tmp, &candidates) {
6483
			list_del_init(&rdev->same_set);
L
Linus Torvalds 已提交
6484
			export_rdev(rdev);
6485
		}
L
Linus Torvalds 已提交
6486 6487
		mddev_put(mddev);
	}
6488
	pr_info("md: ... autorun DONE.\n");
L
Linus Torvalds 已提交
6489
}
J
Jeff Garzik 已提交
6490
#endif /* !MODULE */
L
Linus Torvalds 已提交
6491

6492
static int get_version(void __user *arg)
L
Linus Torvalds 已提交
6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}

6506
static int get_array_info(struct mddev *mddev, void __user *arg)
L
Linus Torvalds 已提交
6507 6508
{
	mdu_array_info_t info;
6509
	int nr,working,insync,failed,spare;
6510
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6511

6512 6513 6514
	nr = working = insync = failed = spare = 0;
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
L
Linus Torvalds 已提交
6515
		nr++;
6516
		if (test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
6517 6518 6519
			failed++;
		else {
			working++;
6520
			if (test_bit(In_sync, &rdev->flags))
6521
				insync++;
6522 6523 6524
			else if (test_bit(Journal, &rdev->flags))
				/* TODO: add journal count to md_u.h */
				;
L
Linus Torvalds 已提交
6525 6526 6527 6528
			else
				spare++;
		}
	}
6529
	rcu_read_unlock();
L
Linus Torvalds 已提交
6530 6531 6532 6533

	info.major_version = mddev->major_version;
	info.minor_version = mddev->minor_version;
	info.patch_version = MD_PATCHLEVEL_VERSION;
6534
	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
L
Linus Torvalds 已提交
6535
	info.level         = mddev->level;
A
Andre Noll 已提交
6536 6537
	info.size          = mddev->dev_sectors / 2;
	if (info.size != mddev->dev_sectors / 2) /* overflow */
6538
		info.size = -1;
L
Linus Torvalds 已提交
6539 6540 6541 6542 6543
	info.nr_disks      = nr;
	info.raid_disks    = mddev->raid_disks;
	info.md_minor      = mddev->md_minor;
	info.not_persistent= !mddev->persistent;

6544
	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
L
Linus Torvalds 已提交
6545 6546 6547
	info.state         = 0;
	if (mddev->in_sync)
		info.state = (1<<MD_SB_CLEAN);
6548
	if (mddev->bitmap && mddev->bitmap_info.offset)
6549
		info.state |= (1<<MD_SB_BITMAP_PRESENT);
6550 6551
	if (mddev_is_clustered(mddev))
		info.state |= (1<<MD_SB_CLUSTERED);
6552
	info.active_disks  = insync;
L
Linus Torvalds 已提交
6553 6554 6555 6556 6557
	info.working_disks = working;
	info.failed_disks  = failed;
	info.spare_disks   = spare;

	info.layout        = mddev->layout;
6558
	info.chunk_size    = mddev->chunk_sectors << 9;
L
Linus Torvalds 已提交
6559 6560 6561 6562 6563 6564 6565

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

6566
static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6567 6568
{
	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6569
	char *ptr;
6570
	int err;
6571

6572
	file = kzalloc(sizeof(*file), GFP_NOIO);
6573
	if (!file)
6574
		return -ENOMEM;
6575

6576 6577
	err = 0;
	spin_lock(&mddev->lock);
6578 6579 6580 6581 6582 6583 6584 6585 6586 6587
	/* bitmap enabled */
	if (mddev->bitmap_info.file) {
		ptr = file_path(mddev->bitmap_info.file, file->pathname,
				sizeof(file->pathname));
		if (IS_ERR(ptr))
			err = PTR_ERR(ptr);
		else
			memmove(file->pathname, ptr,
				sizeof(file->pathname)-(ptr-file->pathname));
	}
6588
	spin_unlock(&mddev->lock);
6589

6590 6591
	if (err == 0 &&
	    copy_to_user(arg, file, sizeof(*file)))
6592
		err = -EFAULT;
6593

6594 6595 6596 6597
	kfree(file);
	return err;
}

6598
static int get_disk_info(struct mddev *mddev, void __user * arg)
L
Linus Torvalds 已提交
6599 6600
{
	mdu_disk_info_t info;
6601
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6602 6603 6604 6605

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

6606
	rcu_read_lock();
6607
	rdev = md_find_rdev_nr_rcu(mddev, info.number);
L
Linus Torvalds 已提交
6608 6609 6610 6611 6612
	if (rdev) {
		info.major = MAJOR(rdev->bdev->bd_dev);
		info.minor = MINOR(rdev->bdev->bd_dev);
		info.raid_disk = rdev->raid_disk;
		info.state = 0;
6613
		if (test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
6614
			info.state |= (1<<MD_DISK_FAULTY);
6615
		else if (test_bit(In_sync, &rdev->flags)) {
L
Linus Torvalds 已提交
6616 6617 6618
			info.state |= (1<<MD_DISK_ACTIVE);
			info.state |= (1<<MD_DISK_SYNC);
		}
S
Shaohua Li 已提交
6619
		if (test_bit(Journal, &rdev->flags))
6620
			info.state |= (1<<MD_DISK_JOURNAL);
6621 6622
		if (test_bit(WriteMostly, &rdev->flags))
			info.state |= (1<<MD_DISK_WRITEMOSTLY);
6623 6624
		if (test_bit(FailFast, &rdev->flags))
			info.state |= (1<<MD_DISK_FAILFAST);
L
Linus Torvalds 已提交
6625 6626 6627 6628 6629
	} else {
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	}
6630
	rcu_read_unlock();
L
Linus Torvalds 已提交
6631 6632 6633 6634 6635 6636 6637

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

6638
static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
L
Linus Torvalds 已提交
6639 6640
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
6641
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6642 6643
	dev_t dev = MKDEV(info->major,info->minor);

6644 6645
	if (mddev_is_clustered(mddev) &&
		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6646 6647
		pr_warn("%s: Cannot add to clustered mddev.\n",
			mdname(mddev));
6648 6649 6650
		return -EINVAL;
	}

L
Linus Torvalds 已提交
6651 6652 6653 6654 6655 6656 6657 6658
	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
6659
			pr_warn("md: md_import_device returned %ld\n",
L
Linus Torvalds 已提交
6660 6661 6662 6663
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
6664 6665 6666
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
6667
			err = super_types[mddev->major_version]
L
Linus Torvalds 已提交
6668 6669
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
6670
				pr_warn("md: %s has different UUID to %s\n",
6671
					bdevname(rdev->bdev,b),
L
Linus Torvalds 已提交
6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares".  They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
6691 6692
			pr_warn("%s: personality does not support diskops!\n",
				mdname(mddev));
L
Linus Torvalds 已提交
6693 6694
			return -EINVAL;
		}
6695 6696 6697 6698 6699
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
L
Linus Torvalds 已提交
6700
		if (IS_ERR(rdev)) {
6701
			pr_warn("md: md_import_device returned %ld\n",
L
Linus Torvalds 已提交
6702 6703 6704
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
6705
		/* set saved_raid_disk if appropriate */
6706 6707
		if (!mddev->persistent) {
			if (info->state & (1<<MD_DISK_SYNC)  &&
6708
			    info->raid_disk < mddev->raid_disks) {
6709
				rdev->raid_disk = info->raid_disk;
6710
				set_bit(In_sync, &rdev->flags);
6711
				clear_bit(Bitmap_sync, &rdev->flags);
6712
			} else
6713
				rdev->raid_disk = -1;
6714
			rdev->saved_raid_disk = rdev->raid_disk;
6715 6716 6717
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
6718
		if ((info->state & (1<<MD_DISK_SYNC)) &&
6719
		     rdev->raid_disk != info->raid_disk) {
6720 6721 6722 6723 6724 6725 6726
			/* This was a hot-add request, but events doesn't
			 * match, so reject it.
			 */
			export_rdev(rdev);
			return -EINVAL;
		}

6727
		clear_bit(In_sync, &rdev->flags); /* just to be sure */
6728 6729
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
6730 6731
		else
			clear_bit(WriteMostly, &rdev->flags);
6732 6733 6734 6735
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
		else
			clear_bit(FailFast, &rdev->flags);
6736

6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747
		if (info->state & (1<<MD_DISK_JOURNAL)) {
			struct md_rdev *rdev2;
			bool has_journal = false;

			/* make sure no existing journal disk */
			rdev_for_each(rdev2, mddev) {
				if (test_bit(Journal, &rdev2->flags)) {
					has_journal = true;
					break;
				}
			}
6748
			if (has_journal || mddev->bitmap) {
6749 6750 6751
				export_rdev(rdev);
				return -EBUSY;
			}
6752
			set_bit(Journal, &rdev->flags);
6753
		}
6754 6755 6756 6757
		/*
		 * check whether the device shows up in other nodes
		 */
		if (mddev_is_clustered(mddev)) {
6758
			if (info->state & (1 << MD_DISK_CANDIDATE))
6759
				set_bit(Candidate, &rdev->flags);
6760
			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6761
				/* --add initiated by this node */
6762
				err = md_cluster_ops->add_new_disk(mddev, rdev);
6763 6764 6765 6766 6767 6768 6769
				if (err) {
					export_rdev(rdev);
					return err;
				}
			}
		}

L
Linus Torvalds 已提交
6770 6771
		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
6772

L
Linus Torvalds 已提交
6773 6774
		if (err)
			export_rdev(rdev);
6775 6776

		if (mddev_is_clustered(mddev)) {
6777 6778 6779 6780 6781 6782 6783 6784
			if (info->state & (1 << MD_DISK_CANDIDATE)) {
				if (!err) {
					err = md_cluster_ops->new_disk_ack(mddev,
						err == 0);
					if (err)
						md_kick_rdev_from_array(rdev);
				}
			} else {
6785 6786 6787 6788 6789 6790 6791
				if (err)
					md_cluster_ops->add_new_disk_cancel(mddev);
				else
					err = add_bound_rdev(rdev);
			}

		} else if (!err)
G
Goldwyn Rodrigues 已提交
6792
			err = add_bound_rdev(rdev);
6793

L
Linus Torvalds 已提交
6794 6795 6796 6797 6798 6799 6800
		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
6801
		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
L
Linus Torvalds 已提交
6802 6803 6804 6805 6806
		return -EINVAL;
	}

	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
6807
		rdev = md_import_device(dev, -1, 0);
L
Linus Torvalds 已提交
6808
		if (IS_ERR(rdev)) {
6809
			pr_warn("md: error, md_import_device() returned %ld\n",
L
Linus Torvalds 已提交
6810 6811 6812 6813 6814 6815 6816 6817 6818 6819
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
6820 6821
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);
L
Linus Torvalds 已提交
6822

6823 6824
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
6825 6826
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
6827

L
Linus Torvalds 已提交
6828
		if (!mddev->persistent) {
6829
			pr_debug("md: nonpersistent superblock ...\n");
6830 6831
			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
		} else
6832
			rdev->sb_start = calc_dev_sboffset(rdev);
6833
		rdev->sectors = rdev->sb_start;
L
Linus Torvalds 已提交
6834

6835 6836 6837 6838 6839
		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}
L
Linus Torvalds 已提交
6840 6841 6842 6843 6844
	}

	return 0;
}

6845
static int hot_remove_disk(struct mddev *mddev, dev_t dev)
L
Linus Torvalds 已提交
6846 6847
{
	char b[BDEVNAME_SIZE];
6848
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
6849

6850 6851 6852
	if (!mddev->pers)
		return -ENODEV;

L
Linus Torvalds 已提交
6853 6854 6855 6856
	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

6857 6858
	if (rdev->raid_disk < 0)
		goto kick_rdev;
6859

6860 6861 6862
	clear_bit(Blocked, &rdev->flags);
	remove_and_add_spares(mddev, rdev);

	if (rdev->raid_disk >= 0)
		goto busy;

kick_rdev:
	if (mddev_is_clustered(mddev))
		md_cluster_ops->remove_disk(mddev, rdev);

	md_kick_rdev_from_array(rdev);
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (mddev->thread)
		md_wakeup_thread(mddev->thread);
	else
		md_update_sb(mddev, 1);
	md_new_event(mddev);

	return 0;
busy:
	pr_debug("md: cannot remove active disk %s from %s ...\n",
		 bdevname(rdev->bdev,b), mdname(mddev));
	return -EBUSY;
}

static int hot_add_disk(struct mddev *mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	int err;
	struct md_rdev *rdev;

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
		pr_warn("%s: personality does not support diskops!\n",
			mdname(mddev));
		return -EINVAL;
	}

	rdev = md_import_device(dev, -1, 0);
	if (IS_ERR(rdev)) {
		pr_warn("md: error, md_import_device() returned %ld\n",
			PTR_ERR(rdev));
		return -EINVAL;
	}

	if (mddev->persistent)
		rdev->sb_start = calc_dev_sboffset(rdev);
	else
		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;

	rdev->sectors = rdev->sb_start;

	if (test_bit(Faulty, &rdev->flags)) {
		pr_warn("md: can not hot-add faulty %s disk to %s!\n",
			bdevname(rdev->bdev,b), mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}

	clear_bit(In_sync, &rdev->flags);
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
		goto abort_export;

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */

	rdev->raid_disk = -1;

	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (!mddev->thread)
		md_update_sb(mddev, 1);
	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_new_event(mddev);
	return 0;

abort_export:
	export_rdev(rdev);
	return err;
}

static int set_bitmap_file(struct mddev *mddev, int fd)
{
	int err = 0;

	if (mddev->pers) {
		if (!mddev->pers->quiesce || !mddev->thread)
			return -EBUSY;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		/* we should be able to change the bitmap.. */
	}

	if (fd >= 0) {
		struct inode *inode;
		struct file *f;

		if (mddev->bitmap || mddev->bitmap_info.file)
			return -EEXIST; /* cannot add when bitmap is present */
		f = fget(fd);

		if (f == NULL) {
			pr_warn("%s: error: failed to get bitmap file\n",
				mdname(mddev));
			return -EBADF;
		}

		inode = f->f_mapping->host;
		if (!S_ISREG(inode->i_mode)) {
			pr_warn("%s: error: bitmap file must be a regular file\n",
				mdname(mddev));
			err = -EBADF;
		} else if (!(f->f_mode & FMODE_WRITE)) {
			pr_warn("%s: error: bitmap file must open for write\n",
				mdname(mddev));
			err = -EBADF;
		} else if (atomic_read(&inode->i_writecount) != 1) {
			pr_warn("%s: error: bitmap file is already in use\n",
				mdname(mddev));
			err = -EBUSY;
		}
		if (err) {
			fput(f);
			return err;
		}
		mddev->bitmap_info.file = f;
		mddev->bitmap_info.offset = 0; /* file overrides offset */
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
		if (fd >= 0) {
			struct bitmap *bitmap;

			bitmap = md_bitmap_create(mddev, -1);
			mddev_suspend(mddev);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				err = md_bitmap_load(mddev);
			} else
				err = PTR_ERR(bitmap);
			if (err) {
				md_bitmap_destroy(mddev);
				fd = -1;
			}
			mddev_resume(mddev);
		} else if (fd < 0) {
			mddev_suspend(mddev);
			md_bitmap_destroy(mddev);
			mddev_resume(mddev);
		}
	}
	if (fd < 0) {
		struct file *f = mddev->bitmap_info.file;
		if (f) {
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
	}

	return err;
}

/*
 * set_array_info is used two different ways
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent,layout,chunksize determine the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= ARRAY_SIZE(super_types) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			pr_warn("md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		mddev->persistent = !info->not_persistent;
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
		 */
		mddev->ctime         = ktime_get_real_seconds();
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime         = ktime_get_real_seconds();

	mddev->level         = info->level;
	mddev->clevel[0]     = 0;
	mddev->dev_sectors   = 2 * (sector_t)info->size;
	mddev->raid_disks    = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent    = ! info->not_persistent;
	mddev->external	     = 0;

	mddev->layout        = info->layout;
	if (mddev->level == 0)
		/* Cannot trust RAID0 layout info here */
		mddev->layout = -1;
	mddev->chunk_sectors = info->chunk_size >> 9;

	if (mddev->persistent) {
		mddev->max_disks = MD_SB_DISKS;
		mddev->flags = 0;
		mddev->sb_flags = 0;
	}
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
	mddev->bitmap_info.offset = 0;

	mddev->reshape_position = MaxSector;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	mddev->new_level = mddev->level;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;

	return 0;
}

void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	if (mddev->external_size)
		return;

	mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);

static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	struct md_rdev *rdev;
	int rv;
	int fit = (num_sectors == 0);
	sector_t old_dev_sectors = mddev->dev_sectors;

	if (mddev->pers->resize == NULL)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used.  This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device.  If num_sectors is zero, we find the largest size
	 * that fits.
	 */
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->sync_thread)
		return -EBUSY;
	if (mddev->ro)
		return -EROFS;

	rdev_for_each(rdev, mddev) {
		sector_t avail = rdev->sectors;

		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;
		if (avail < num_sectors)
			return -ENOSPC;
	}
	rv = mddev->pers->resize(mddev, num_sectors);
	if (!rv) {
		if (mddev_is_clustered(mddev))
			md_cluster_ops->update_size(mddev, old_dev_sectors);
		else if (mddev->queue) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}
	return rv;
}

static int update_raid_disks(struct mddev *mddev, int raid_disks)
{
	int rv;
	struct md_rdev *rdev;
	/* change the number of raid disks */
	if (mddev->pers->check_reshape == NULL)
		return -EINVAL;
	if (mddev->ro)
		return -EROFS;
	if (raid_disks <= 0 ||
	    (mddev->max_disks && raid_disks >= mddev->max_disks))
		return -EINVAL;
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector)
		return -EBUSY;

	rdev_for_each(rdev, mddev) {
		if (mddev->raid_disks < raid_disks &&
		    rdev->data_offset < rdev->new_data_offset)
			return -EINVAL;
		if (mddev->raid_disks > raid_disks &&
		    rdev->data_offset > rdev->new_data_offset)
			return -EINVAL;
	}

	mddev->delta_disks = raid_disks - mddev->raid_disks;
	if (mddev->delta_disks < 0)
		mddev->reshape_backwards = 1;
	else if (mddev->delta_disks > 0)
		mddev->reshape_backwards = 0;

	rv = mddev->pers->check_reshape(mddev);
	if (rv < 0) {
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
	return rv;
}

/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
 * fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 */
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;
	int state = 0;

	/* calculate expected state,ignoring low bits */
	if (mddev->bitmap && mddev->bitmap_info.offset)
		state |= (1 << MD_SB_BITMAP_PRESENT);

	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
/*	    mddev->patch_version != info->patch_version || */
	    mddev->ctime         != info->ctime         ||
	    mddev->level         != info->level         ||
/*	    mddev->layout        != info->layout        || */
	    mddev->persistent	 != !info->not_persistent ||
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    ((state^info->state) & 0xfffffe00)
		)
		return -EINVAL;
	/* Check there is only one change */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		else {
			mddev->new_layout = info->layout;
			rv = mddev->pers->check_reshape(mddev);
			if (rv)
				mddev->new_layout = mddev->layout;
			return rv;
		}
	}
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		rv = update_size(mddev, (sector_t)info->size * 2);

	if (mddev->raid_disks    != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
			rv = -EINVAL;
			goto err;
		}
		if (mddev->recovery || mddev->sync_thread) {
			rv = -EBUSY;
			goto err;
		}
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
			struct bitmap *bitmap;
			/* add the bitmap */
			if (mddev->bitmap) {
				rv = -EEXIST;
				goto err;
			}
			if (mddev->bitmap_info.default_offset == 0) {
				rv = -EINVAL;
				goto err;
			}
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
			bitmap = md_bitmap_create(mddev, -1);
			mddev_suspend(mddev);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				rv = md_bitmap_load(mddev);
			} else
				rv = PTR_ERR(bitmap);
			if (rv)
				md_bitmap_destroy(mddev);
			mddev_resume(mddev);
		} else {
			/* remove the bitmap */
			if (!mddev->bitmap) {
				rv = -ENOENT;
				goto err;
			}
			if (mddev->bitmap->storage.file) {
				rv = -EINVAL;
				goto err;
			}
			if (mddev->bitmap_info.nodes) {
				/* hold PW on all the bitmap lock */
				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
					rv = -EPERM;
					md_cluster_ops->unlock_all_bitmaps(mddev);
					goto err;
				}

				mddev->bitmap_info.nodes = 0;
				md_cluster_ops->leave(mddev);
			}
			mddev_suspend(mddev);
			md_bitmap_destroy(mddev);
			mddev_resume(mddev);
			mddev->bitmap_info.offset = 0;
		}
	}
	md_update_sb(mddev, 1);
	return rv;
err:
	return rv;
}

static int set_disk_faulty(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;
	int err = 0;

	if (mddev->pers == NULL)
		return -ENODEV;

	rcu_read_lock();
	rdev = md_find_rdev_rcu(mddev, dev);
	if (!rdev)
		err =  -ENODEV;
	else {
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags))
			err = -EBUSY;
	}
	rcu_read_unlock();
	return err;
}

/*
 * We have a problem here : there is no easy way to give a CHS
 * virtual geometry. We currently pretend that we have a 2 heads
 * 4 sectors (with a BIG number of cylinders...). This drives
 * dosfs just mad... ;-)
 */
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mddev *mddev = bdev->bd_disk->private_data;

	geo->heads = 2;
	geo->sectors = 4;
	geo->cylinders = mddev->array_sectors / 8;
	return 0;
}

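/*
 * Only the ioctl commands listed below are handled by md; anything else
 * is rejected with -ENOTTY before any locking is attempted.
 */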
static inline bool md_ioctl_valid(unsigned int cmd)
{
	switch (cmd) {
	case ADD_NEW_DISK:
	case BLKROSET:
	case GET_ARRAY_INFO:
	case GET_BITMAP_FILE:
	case GET_DISK_INFO:
	case HOT_ADD_DISK:
	case HOT_REMOVE_DISK:
	case RAID_AUTORUN:
	case RAID_VERSION:
	case RESTART_ARRAY_RW:
	case RUN_ARRAY:
	case SET_ARRAY_INFO:
	case SET_BITMAP_FILE:
	case SET_DISK_FAULTY:
	case STOP_ARRAY:
	case STOP_ARRAY_RO:
	case CLUSTERED_DISK_NACK:
		return true;
	default:
		return false;
	}
}

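/*
 * Main ioctl entry point for an md block device: validate the command,
 * take the locks the command needs and dispatch to the helpers above.
 */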
static int md_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
	struct mddev *mddev = NULL;
	int ro;
	bool did_set_md_closing = false;

	if (!md_ioctl_valid(cmd))
		return -ENOTTY;

	switch (cmd) {
	case RAID_VERSION:
	case GET_ARRAY_INFO:
	case GET_DISK_INFO:
		break;
	default:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd) {
	case RAID_VERSION:
		err = get_version(argp);
		goto out;

#ifndef MODULE
	case RAID_AUTORUN:
		err = 0;
		autostart_arrays(arg);
		goto out;
#endif
	default:;
	}

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = bdev->bd_disk->private_data;

	if (!mddev) {
		BUG();
		goto out;
	}

	/* Some actions do not require the mutex */
	switch (cmd) {
	case GET_ARRAY_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_array_info(mddev, argp);
		goto out;

	case GET_DISK_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_disk_info(mddev, argp);
		goto out;

	case SET_DISK_FAULTY:
		err = set_disk_faulty(mddev, new_decode_dev(arg));
		goto out;

	case GET_BITMAP_FILE:
		err = get_bitmap_file(mddev, argp);
		goto out;

	}

	if (cmd == ADD_NEW_DISK)
		/* need to ensure md_delayed_delete() has completed */
		flush_workqueue(md_misc_wq);

	if (cmd == HOT_REMOVE_DISK)
		/* need to ensure recovery thread has run */
		wait_event_interruptible_timeout(mddev->sb_wait,
						 !test_bit(MD_RECOVERY_NEEDED,
							   &mddev->recovery),
						 msecs_to_jiffies(5000));
	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
		/* Need to flush page cache, and ensure no-one else opens
		 * and writes
		 */
		mutex_lock(&mddev->open_mutex);
		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
			mutex_unlock(&mddev->open_mutex);
			err = -EBUSY;
			goto out;
		}
		WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
		set_bit(MD_CLOSING, &mddev->flags);
		did_set_md_closing = true;
		mutex_unlock(&mddev->open_mutex);
		sync_blockdev(bdev);
	}
	err = mddev_lock(mddev);
	if (err) {
		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
			 err, cmd);
		goto out;
	}

	if (cmd == SET_ARRAY_INFO) {
		mdu_array_info_t info;
		if (!arg)
			memset(&info, 0, sizeof(info));
		else if (copy_from_user(&info, argp, sizeof(info))) {
			err = -EFAULT;
			goto unlock;
		}
		if (mddev->pers) {
			err = update_array_info(mddev, &info);
			if (err) {
				pr_warn("md: couldn't update array info. %d\n", err);
				goto unlock;
			}
			goto unlock;
		}
		if (!list_empty(&mddev->disks)) {
			pr_warn("md: array %s already has disks!\n", mdname(mddev));
			err = -EBUSY;
			goto unlock;
		}
		if (mddev->raid_disks) {
			pr_warn("md: array %s already initialised!\n", mdname(mddev));
			err = -EBUSY;
			goto unlock;
		}
		err = set_array_info(mddev, &info);
		if (err) {
			pr_warn("md: couldn't set array info. %d\n", err);
			goto unlock;
		}
		goto unlock;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
7567
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7568
	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7569 7570 7571 7572
	if ((!mddev->raid_disks && !mddev->external)
	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
	    && cmd != GET_BITMAP_FILE) {
L
Linus Torvalds 已提交
7573
		err = -ENODEV;
7574
		goto unlock;
L
Linus Torvalds 已提交
7575 7576 7577 7578 7579
	}

	/*
	 * Commands even a read-only array can execute:
	 */
7580 7581 7582
	switch (cmd) {
	case RESTART_ARRAY_RW:
		err = restart_array(mddev);
7583
		goto unlock;
L
Linus Torvalds 已提交
7584

7585 7586
	case STOP_ARRAY:
		err = do_md_stop(mddev, 0, bdev);
7587
		goto unlock;
L
Linus Torvalds 已提交
7588

7589 7590
	case STOP_ARRAY_RO:
		err = md_set_readonly(mddev, bdev);
7591
		goto unlock;
L
Linus Torvalds 已提交
7592

7593 7594
	case HOT_REMOVE_DISK:
		err = hot_remove_disk(mddev, new_decode_dev(arg));
7595
		goto unlock;
7596

7597 7598
	case ADD_NEW_DISK:
		/* We can support ADD_NEW_DISK on read-only arrays
W
Wei Fang 已提交
7599
		 * only if we are re-adding a preexisting device.
7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610
		 * So require mddev->pers and MD_DISK_SYNC.
		 */
		if (mddev->pers) {
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else if (!(info.state & (1<<MD_DISK_SYNC)))
				/* Need to clear read-only for this */
				break;
			else
				err = add_new_disk(mddev, &info);
7611
			goto unlock;
7612 7613 7614
		}
		break;

7615 7616 7617
	case BLKROSET:
		if (get_user(ro, (int __user *)(arg))) {
			err = -EFAULT;
7618
			goto unlock;
7619 7620
		}
		err = -EINVAL;
7621

7622 7623 7624 7625
		/* if the bdev is going readonly the value of mddev->ro
		 * does not matter, no writes are coming
		 */
		if (ro)
7626
			goto unlock;
7627

7628 7629
		/* are we are already prepared for writes? */
		if (mddev->ro != 1)
7630
			goto unlock;
7631

7632 7633 7634 7635 7636 7637 7638 7639
		/* transitioning to readauto need only happen for
		 * arrays that call md_write_start
		 */
		if (mddev->pers) {
			err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
7640
			}
7641
		}
7642
		goto unlock;
L
Linus Torvalds 已提交
7643 7644 7645 7646
	}

	/*
	 * The remaining ioctls are changing the state of the
7647
	 * superblock, so we do not allow them on read-only arrays.
L
Linus Torvalds 已提交
7648
	 */
7649
	if (mddev->ro && mddev->pers) {
7650 7651
		if (mddev->ro == 2) {
			mddev->ro = 0;
N
NeilBrown 已提交
7652
			sysfs_notify_dirent_safe(mddev->sysfs_state);
7653
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7654 7655 7656 7657
			/* mddev_unlock will wake thread */
			/* If a device failed while we were read-only, we
			 * need to make sure the metadata is updated now.
			 */
7658
			if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7659 7660
				mddev_unlock(mddev);
				wait_event(mddev->sb_wait,
7661 7662
					   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
					   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7663
				mddev_lock_nointr(mddev);
7664
			}
7665 7666
		} else {
			err = -EROFS;
7667
			goto unlock;
7668
		}
L
Linus Torvalds 已提交
7669 7670
	}

7671 7672
	switch (cmd) {
	case ADD_NEW_DISK:
L
Linus Torvalds 已提交
7673
	{
7674 7675 7676 7677 7678
		mdu_disk_info_t info;
		if (copy_from_user(&info, argp, sizeof(info)))
			err = -EFAULT;
		else
			err = add_new_disk(mddev, &info);
7679
		goto unlock;
7680
	}
L
Linus Torvalds 已提交
7681

7682 7683 7684 7685 7686 7687 7688
	case CLUSTERED_DISK_NACK:
		if (mddev_is_clustered(mddev))
			md_cluster_ops->new_disk_ack(mddev, false);
		else
			err = -EINVAL;
		goto unlock;

7689 7690
	case HOT_ADD_DISK:
		err = hot_add_disk(mddev, new_decode_dev(arg));
7691
		goto unlock;
L
Linus Torvalds 已提交
7692

7693 7694
	case RUN_ARRAY:
		err = do_md_run(mddev);
7695
		goto unlock;
L
Linus Torvalds 已提交
7696

7697 7698
	case SET_BITMAP_FILE:
		err = set_bitmap_file(mddev, (int)arg);
7699
		goto unlock;
7700

7701 7702
	default:
		err = -EINVAL;
7703
		goto unlock;
L
Linus Torvalds 已提交
7704 7705
	}

7706
unlock:
7707 7708 7709
	if (mddev->hold_active == UNTIL_IOCTL &&
	    err != -EINVAL)
		mddev->hold_active = 0;
L
Linus Torvalds 已提交
7710
	mddev_unlock(mddev);
7711
out:
7712 7713
	if(did_set_md_closing)
		clear_bit(MD_CLOSING, &mddev->flags);
L
Linus Torvalds 已提交
7714 7715
	return err;
}
7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734
#ifdef CONFIG_COMPAT
static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
		    unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case HOT_REMOVE_DISK:
	case HOT_ADD_DISK:
	case SET_DISK_FAULTY:
	case SET_BITMAP_FILE:
		/* These take in integer arg, do not convert */
		break;
	default:
		arg = (unsigned long)compat_ptr(arg);
		break;
	}

	return md_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */
L
Linus Torvalds 已提交
7735

A
Al Viro 已提交
7736
static int md_open(struct block_device *bdev, fmode_t mode)
L
Linus Torvalds 已提交
7737 7738 7739 7740 7741
{
	/*
	 * Succeed if we can lock the mddev, which confirms that
	 * it isn't being stopped right now.
	 */
7742
	struct mddev *mddev = mddev_find(bdev->bd_dev);
L
Linus Torvalds 已提交
7743 7744
	int err;

7745 7746 7747
	if (!mddev)
		return -ENODEV;

7748 7749 7750 7751 7752 7753
	if (mddev->gendisk != bdev->bd_disk) {
		/* we are racing with mddev_put which is discarding this
		 * bd_disk.
		 */
		mddev_put(mddev);
		/* Wait until bdev->bd_disk is definitely gone */
T
Tejun Heo 已提交
7754
		flush_workqueue(md_misc_wq);
7755 7756 7757 7758 7759
		/* Then retry the open from the top */
		return -ERESTARTSYS;
	}
	BUG_ON(mddev != bdev->bd_disk->private_data);

N
NeilBrown 已提交
7760
	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
L
Linus Torvalds 已提交
7761 7762
		goto out;

7763 7764
	if (test_bit(MD_CLOSING, &mddev->flags)) {
		mutex_unlock(&mddev->open_mutex);
7765 7766
		err = -ENODEV;
		goto out;
7767 7768
	}

L
Linus Torvalds 已提交
7769
	err = 0;
7770
	atomic_inc(&mddev->openers);
N
NeilBrown 已提交
7771
	mutex_unlock(&mddev->open_mutex);
L
Linus Torvalds 已提交
7772

7773
	check_disk_change(bdev);
L
Linus Torvalds 已提交
7774
 out:
7775 7776
	if (err)
		mddev_put(mddev);
L
Linus Torvalds 已提交
7777 7778 7779
	return err;
}

7780
static void md_release(struct gendisk *disk, fmode_t mode)
L
Linus Torvalds 已提交
7781
{
7782
	struct mddev *mddev = disk->private_data;
L
Linus Torvalds 已提交
7783

E
Eric Sesterhenn 已提交
7784
	BUG_ON(!mddev);
7785
	atomic_dec(&mddev->openers);
L
Linus Torvalds 已提交
7786 7787
	mddev_put(mddev);
}
7788 7789 7790

static int md_media_changed(struct gendisk *disk)
{
7791
	struct mddev *mddev = disk->private_data;
7792 7793 7794 7795 7796 7797

	return mddev->changed;
}

static int md_revalidate(struct gendisk *disk)
{
7798
	struct mddev *mddev = disk->private_data;
7799 7800 7801 7802

	mddev->changed = 0;
	return 0;
}
7803
static const struct block_device_operations md_fops =
L
Linus Torvalds 已提交
7804 7805
{
	.owner		= THIS_MODULE,
A
Al Viro 已提交
7806 7807
	.open		= md_open,
	.release	= md_release,
N
NeilBrown 已提交
7808
	.ioctl		= md_ioctl,
7809 7810 7811
#ifdef CONFIG_COMPAT
	.compat_ioctl	= md_compat_ioctl,
#endif
7812
	.getgeo		= md_getgeo,
7813 7814
	.media_changed  = md_media_changed,
	.revalidate_disk= md_revalidate,
L
Linus Torvalds 已提交
7815 7816
};

7817
static int md_thread(void *arg)
L
Linus Torvalds 已提交
7818
{
7819
	struct md_thread *thread = arg;
L
Linus Torvalds 已提交
7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832

	/*
	 * md_thread is a 'system-thread', it's priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */

N
NeilBrown 已提交
7833
	allow_signal(SIGKILL);
7834
	while (!kthread_should_stop()) {
L
Linus Torvalds 已提交
7835

7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846
		/* We need to wait INTERRUPTIBLE so that
		 * we don't add to the load-average.
		 * That means we need to be sure no signals are
		 * pending
		 */
		if (signal_pending(current))
			flush_signals(current);

		wait_event_interruptible_timeout
			(thread->wqueue,
			 test_bit(THREAD_WAKEUP, &thread->flags)
7847
			 || kthread_should_stop() || kthread_should_park(),
7848
			 thread->timeout);
L
Linus Torvalds 已提交
7849

7850
		clear_bit(THREAD_WAKEUP, &thread->flags);
7851 7852
		if (kthread_should_park())
			kthread_parkme();
7853
		if (!kthread_should_stop())
S
Shaohua Li 已提交
7854
			thread->run(thread);
L
Linus Torvalds 已提交
7855
	}
7856

L
Linus Torvalds 已提交
7857 7858 7859
	return 0;
}

7860
void md_wakeup_thread(struct md_thread *thread)
L
Linus Torvalds 已提交
7861 7862
{
	if (thread) {
7863
		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7864 7865
		set_bit(THREAD_WAKEUP, &thread->flags);
		wake_up(&thread->wqueue);
L
Linus Torvalds 已提交
7866 7867
	}
}
7868
EXPORT_SYMBOL(md_wakeup_thread);
L
Linus Torvalds 已提交
7869

S
Shaohua Li 已提交
7870 7871
struct md_thread *md_register_thread(void (*run) (struct md_thread *),
		struct mddev *mddev, const char *name)
L
Linus Torvalds 已提交
7872
{
7873
	struct md_thread *thread;
L
Linus Torvalds 已提交
7874

7875
	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
L
Linus Torvalds 已提交
7876 7877 7878 7879 7880 7881 7882
	if (!thread)
		return NULL;

	init_waitqueue_head(&thread->wqueue);

	thread->run = run;
	thread->mddev = mddev;
7883
	thread->timeout = MAX_SCHEDULE_TIMEOUT;
7884 7885 7886
	thread->tsk = kthread_run(md_thread, thread,
				  "%s_%s",
				  mdname(thread->mddev),
7887
				  name);
7888
	if (IS_ERR(thread->tsk)) {
L
Linus Torvalds 已提交
7889 7890 7891 7892 7893
		kfree(thread);
		return NULL;
	}
	return thread;
}
7894
EXPORT_SYMBOL(md_register_thread);
L
Linus Torvalds 已提交
7895

7896
void md_unregister_thread(struct md_thread **threadp)
L
Linus Torvalds 已提交
7897
{
7898
	struct md_thread *thread = *threadp;
7899 7900
	if (!thread)
		return;
7901
	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7902 7903 7904 7905 7906 7907
	/* Locking ensures that mddev_unlock does not wake_up a
	 * non-existent thread
	 */
	spin_lock(&pers_lock);
	*threadp = NULL;
	spin_unlock(&pers_lock);
7908 7909

	kthread_stop(thread->tsk);
L
Linus Torvalds 已提交
7910 7911
	kfree(thread);
}
7912
EXPORT_SYMBOL(md_unregister_thread);
L
Linus Torvalds 已提交
7913

7914
void md_error(struct mddev *mddev, struct md_rdev *rdev)
L
Linus Torvalds 已提交
7915
{
7916
	if (!rdev || test_bit(Faulty, &rdev->flags))
L
Linus Torvalds 已提交
7917
		return;
7918

7919
	if (!mddev->pers || !mddev->pers->error_handler)
L
Linus Torvalds 已提交
7920 7921
		return;
	mddev->pers->error_handler(mddev,rdev);
7922 7923
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
N
NeilBrown 已提交
7924
	sysfs_notify_dirent_safe(rdev->sysfs_state);
L
Linus Torvalds 已提交
7925 7926 7927
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
7928
	if (mddev->event_work.func)
T
Tejun Heo 已提交
7929
		queue_work(md_misc_wq, &mddev->event_work);
7930
	md_new_event(mddev);
L
Linus Torvalds 已提交
7931
}
7932
EXPORT_SYMBOL(md_error);
L
Linus Torvalds 已提交
7933 7934 7935 7936 7937 7938

/* seq_file implementation /proc/mdstat */

static void status_unused(struct seq_file *seq)
{
	int i = 0;
7939
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
7940 7941 7942

	seq_printf(seq, "unused devices: ");

7943
	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
L
Linus Torvalds 已提交
7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954
		char b[BDEVNAME_SIZE];
		i++;
		seq_printf(seq, "%s ",
			      bdevname(rdev->bdev,b));
	}
	if (!i)
		seq_printf(seq, "<none>");

	seq_printf(seq, "\n");
}

7955
static int status_resync(struct seq_file *seq, struct mddev *mddev)
L
Linus Torvalds 已提交
7956
{
7957
	sector_t max_sectors, resync, res;
7958 7959 7960
	unsigned long dt, db = 0;
	sector_t rt, curr_mark_cnt, resync_mark_cnt;
	int scale, recovery_active;
7961
	unsigned int per_milli;
L
Linus Torvalds 已提交
7962

7963 7964
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7965
		max_sectors = mddev->resync_max_sectors;
L
Linus Torvalds 已提交
7966
	else
7967
		max_sectors = mddev->dev_sectors;
L
Linus Torvalds 已提交
7968

7969 7970 7971 7972 7973
	resync = mddev->curr_resync;
	if (resync <= 3) {
		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
			/* Still cleaning up */
			resync = max_sectors;
7974 7975 7976
	} else if (resync > max_sectors)
		resync = max_sectors;
	else
7977 7978 7979
		resync -= atomic_read(&mddev->recovery_active);

	if (resync == 0) {
7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996
		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
			struct md_rdev *rdev;

			rdev_for_each(rdev, mddev)
				if (rdev->raid_disk >= 0 &&
				    !test_bit(Faulty, &rdev->flags) &&
				    rdev->recovery_offset != MaxSector &&
				    rdev->recovery_offset) {
					seq_printf(seq, "\trecover=REMOTE");
					return 1;
				}
			if (mddev->reshape_position != MaxSector)
				seq_printf(seq, "\treshape=REMOTE");
			else
				seq_printf(seq, "\tresync=REMOTE");
			return 1;
		}
7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007
		if (mddev->recovery_cp < MaxSector) {
			seq_printf(seq, "\tresync=PENDING");
			return 1;
		}
		return 0;
	}
	if (resync < 3) {
		seq_printf(seq, "\tresync=DELAYED");
		return 1;
	}

N
NeilBrown 已提交
8008
	WARN_ON(max_sectors == 0);
8009
	/* Pick 'scale' such that (resync>>scale)*1000 will fit
8010
	 * in a sector_t, and (max_sectors>>scale) will fit in a
8011 8012 8013 8014 8015
	 * u32, as those are the requirements for sector_div.
	 * Thus 'scale' must be at least 10
	 */
	scale = 10;
	if (sizeof(sector_t) > sizeof(unsigned long)) {
8016
		while ( max_sectors/2 > (1ULL<<(scale+32)))
8017 8018 8019
			scale++;
	}
	res = (resync>>scale)*1000;
8020
	sector_div(res, (u32)((max_sectors>>scale)+1));
8021 8022

	per_milli = res;
L
Linus Torvalds 已提交
8023
	{
8024
		int i, x = per_milli/50, y = 20-x;
L
Linus Torvalds 已提交
8025 8026 8027 8028 8029 8030 8031 8032
		seq_printf(seq, "[");
		for (i = 0; i < x; i++)
			seq_printf(seq, "=");
		seq_printf(seq, ">");
		for (i = 0; i < y; i++)
			seq_printf(seq, ".");
		seq_printf(seq, "] ");
	}
8033
	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8034 8035
		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
		    "reshape" :
8036 8037 8038 8039 8040
		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
		     "check" :
		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
		      "resync" : "recovery"))),
		   per_milli/10, per_milli % 10,
8041 8042
		   (unsigned long long) resync/2,
		   (unsigned long long) max_sectors/2);
L
Linus Torvalds 已提交
8043 8044 8045 8046 8047

	/*
	 * dt: time from mark until now
	 * db: blocks written from mark until now
	 * rt: remaining time
8048
	 *
8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059
	 * rt is a sector_t, which is always 64bit now. We are keeping
	 * the original algorithm, but it is not really necessary.
	 *
	 * Original algorithm:
	 *   So we divide before multiply in case it is 32bit and close
	 *   to the limit.
	 *   We scale the divisor (db) by 32 to avoid losing precision
	 *   near the end of resync when the number of remaining sectors
	 *   is close to 'db'.
	 *   We then divide rt by 32 after multiplying by db to compensate.
	 *   The '+1' avoids division by zero if db is very small.
L
Linus Torvalds 已提交
8060 8061 8062
	 */
	dt = ((jiffies - mddev->resync_mark) / HZ);
	if (!dt) dt++;
8063 8064 8065 8066 8067 8068 8069

	curr_mark_cnt = mddev->curr_mark_cnt;
	recovery_active = atomic_read(&mddev->recovery_active);
	resync_mark_cnt = mddev->resync_mark_cnt;

	if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
		db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
L
Linus Torvalds 已提交
8070

8071
	rt = max_sectors - resync;    /* number of remaining sectors */
8072
	rt = div64_u64(rt, db/32+1);
8073 8074 8075 8076 8077
	rt *= dt;
	rt >>= 5;

	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
		   ((unsigned long)rt % 60)/6);
L
Linus Torvalds 已提交
8078

8079
	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8080
	return 1;
L
Linus Torvalds 已提交
8081 8082 8083 8084 8085 8086
}

static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct list_head *tmp;
	loff_t l = *pos;
8087
	struct mddev *mddev;
L
Linus Torvalds 已提交
8088 8089 8090 8091 8092 8093 8094 8095 8096 8097

	if (l >= 0x10000)
		return NULL;
	if (!l--)
		/* header */
		return (void*)1;

	spin_lock(&all_mddevs_lock);
	list_for_each(tmp,&all_mddevs)
		if (!l--) {
8098
			mddev = list_entry(tmp, struct mddev, all_mddevs);
L
Linus Torvalds 已提交
8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			return mddev;
		}
	spin_unlock(&all_mddevs_lock);
	if (!l--)
		return (void*)2;/* tail */
	return NULL;
}

static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct list_head *tmp;
8112
	struct mddev *next_mddev, *mddev = v;
8113

L
Linus Torvalds 已提交
8114 8115 8116 8117 8118 8119 8120 8121 8122 8123
	++*pos;
	if (v == (void*)2)
		return NULL;

	spin_lock(&all_mddevs_lock);
	if (v == (void*)1)
		tmp = all_mddevs.next;
	else
		tmp = mddev->all_mddevs.next;
	if (tmp != &all_mddevs)
8124
		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
L
Linus Torvalds 已提交
8125 8126 8127
	else {
		next_mddev = (void*)2;
		*pos = 0x10000;
8128
	}
L
Linus Torvalds 已提交
8129 8130 8131 8132 8133 8134 8135 8136 8137 8138
	spin_unlock(&all_mddevs_lock);

	if (v != (void*)1)
		mddev_put(mddev);
	return next_mddev;

}

static void md_seq_stop(struct seq_file *seq, void *v)
{
8139
	struct mddev *mddev = v;
L
Linus Torvalds 已提交
8140 8141 8142 8143 8144 8145 8146

	if (mddev && v != (void*)1 && v != (void*)2)
		mddev_put(mddev);
}

static int md_seq_show(struct seq_file *seq, void *v)
{
8147
	struct mddev *mddev = v;
8148
	sector_t sectors;
8149
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
8150 8151

	if (v == (void*)1) {
8152
		struct md_personality *pers;
L
Linus Torvalds 已提交
8153 8154
		seq_printf(seq, "Personalities : ");
		spin_lock(&pers_lock);
8155 8156
		list_for_each_entry(pers, &pers_list, list)
			seq_printf(seq, "[%s] ", pers->name);
L
Linus Torvalds 已提交
8157 8158 8159

		spin_unlock(&pers_lock);
		seq_printf(seq, "\n");
8160
		seq->poll_event = atomic_read(&md_event_count);
L
Linus Torvalds 已提交
8161 8162 8163 8164 8165 8166 8167
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}

8168
	spin_lock(&mddev->lock);
L
Linus Torvalds 已提交
8169 8170 8171 8172
	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : %sactive", mdname(mddev),
						mddev->pers ? "" : "in");
		if (mddev->pers) {
8173
			if (mddev->ro==1)
L
Linus Torvalds 已提交
8174
				seq_printf(seq, " (read-only)");
8175
			if (mddev->ro==2)
8176
				seq_printf(seq, " (auto-read-only)");
L
Linus Torvalds 已提交
8177 8178 8179
			seq_printf(seq, " %s", mddev->pers->name);
		}

8180
		sectors = 0;
8181 8182
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev) {
L
Linus Torvalds 已提交
8183 8184 8185
			char b[BDEVNAME_SIZE];
			seq_printf(seq, " %s[%d]",
				bdevname(rdev->bdev,b), rdev->desc_nr);
8186 8187
			if (test_bit(WriteMostly, &rdev->flags))
				seq_printf(seq, "(W)");
S
Shaohua Li 已提交
8188 8189
			if (test_bit(Journal, &rdev->flags))
				seq_printf(seq, "(J)");
8190
			if (test_bit(Faulty, &rdev->flags)) {
L
Linus Torvalds 已提交
8191 8192
				seq_printf(seq, "(F)");
				continue;
8193 8194
			}
			if (rdev->raid_disk < 0)
8195
				seq_printf(seq, "(S)"); /* spare */
8196 8197
			if (test_bit(Replacement, &rdev->flags))
				seq_printf(seq, "(R)");
8198
			sectors += rdev->sectors;
L
Linus Torvalds 已提交
8199
		}
8200
		rcu_read_unlock();
L
Linus Torvalds 已提交
8201 8202 8203 8204

		if (!list_empty(&mddev->disks)) {
			if (mddev->pers)
				seq_printf(seq, "\n      %llu blocks",
8205 8206
					   (unsigned long long)
					   mddev->array_sectors / 2);
L
Linus Torvalds 已提交
8207 8208
			else
				seq_printf(seq, "\n      %llu blocks",
8209
					   (unsigned long long)sectors / 2);
L
Linus Torvalds 已提交
8210
		}
8211 8212 8213 8214 8215 8216 8217
		if (mddev->persistent) {
			if (mddev->major_version != 0 ||
			    mddev->minor_version != 90) {
				seq_printf(seq," super %d.%d",
					   mddev->major_version,
					   mddev->minor_version);
			}
8218 8219 8220 8221
		} else if (mddev->external)
			seq_printf(seq, " super external:%s",
				   mddev->metadata_type);
		else
8222
			seq_printf(seq, " super non-persistent");
L
Linus Torvalds 已提交
8223 8224

		if (mddev->pers) {
8225
			mddev->pers->status(seq, mddev);
8226
			seq_printf(seq, "\n      ");
8227
			if (mddev->pers->sync_request) {
8228
				if (status_resync(seq, mddev))
8229 8230
					seq_printf(seq, "\n      ");
			}
8231 8232 8233
		} else
			seq_printf(seq, "\n       ");

8234
		md_bitmap_status(seq, mddev->bitmap);
L
Linus Torvalds 已提交
8235 8236 8237

		seq_printf(seq, "\n");
	}
8238
	spin_unlock(&mddev->lock);
8239

L
Linus Torvalds 已提交
8240 8241 8242
	return 0;
}

J
Jan Engelhardt 已提交
8243
static const struct seq_operations md_seq_ops = {
L
Linus Torvalds 已提交
8244 8245 8246 8247 8248 8249 8250 8251
	.start  = md_seq_start,
	.next   = md_seq_next,
	.stop   = md_seq_stop,
	.show   = md_seq_show,
};

static int md_seq_open(struct inode *inode, struct file *file)
{
8252
	struct seq_file *seq;
L
Linus Torvalds 已提交
8253 8254 8255
	int error;

	error = seq_open(file, &md_seq_ops);
8256
	if (error)
8257 8258 8259 8260
		return error;

	seq = file->private_data;
	seq->poll_event = atomic_read(&md_event_count);
L
Linus Torvalds 已提交
8261 8262 8263
	return error;
}

8264
static int md_unloading;
8265
static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8266
{
8267
	struct seq_file *seq = filp->private_data;
8268
	__poll_t mask;
8269

8270
	if (md_unloading)
8271
		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8272 8273 8274
	poll_wait(filp, &md_event_waiters, wait);

	/* always allow read */
8275
	mask = EPOLLIN | EPOLLRDNORM;
8276

8277
	if (seq->poll_event != atomic_read(&md_event_count))
8278
		mask |= EPOLLERR | EPOLLPRI;
8279 8280 8281
	return mask;
}

8282
static const struct file_operations md_seq_fops = {
8283
	.owner		= THIS_MODULE,
L
Linus Torvalds 已提交
8284 8285 8286
	.open           = md_seq_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
8287
	.release	= seq_release,
8288
	.poll		= mdstat_poll,
L
Linus Torvalds 已提交
8289 8290
};

8291
int register_md_personality(struct md_personality *p)
L
Linus Torvalds 已提交
8292
{
8293 8294
	pr_debug("md: %s personality registered for level %d\n",
		 p->name, p->level);
L
Linus Torvalds 已提交
8295
	spin_lock(&pers_lock);
8296
	list_add_tail(&p->list, &pers_list);
L
Linus Torvalds 已提交
8297 8298 8299
	spin_unlock(&pers_lock);
	return 0;
}
8300
EXPORT_SYMBOL(register_md_personality);
L
Linus Torvalds 已提交
8301

8302
int unregister_md_personality(struct md_personality *p)
L
Linus Torvalds 已提交
8303
{
8304
	pr_debug("md: %s personality unregistered\n", p->name);
L
Linus Torvalds 已提交
8305
	spin_lock(&pers_lock);
8306
	list_del_init(&p->list);
L
Linus Torvalds 已提交
8307 8308 8309
	spin_unlock(&pers_lock);
	return 0;
}
8310
EXPORT_SYMBOL(unregister_md_personality);
L
Linus Torvalds 已提交
8311

8312 8313
int register_md_cluster_operations(struct md_cluster_operations *ops,
				   struct module *module)
8314
{
8315
	int ret = 0;
8316
	spin_lock(&pers_lock);
8317 8318 8319 8320 8321 8322
	if (md_cluster_ops != NULL)
		ret = -EALREADY;
	else {
		md_cluster_ops = ops;
		md_cluster_mod = module;
	}
8323
	spin_unlock(&pers_lock);
8324
	return ret;
8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338
}
EXPORT_SYMBOL(register_md_cluster_operations);

int unregister_md_cluster_operations(void)
{
	spin_lock(&pers_lock);
	md_cluster_ops = NULL;
	spin_unlock(&pers_lock);
	return 0;
}
EXPORT_SYMBOL(unregister_md_cluster_operations);

int md_setup_cluster(struct mddev *mddev, int nodes)
{
8339 8340
	if (!md_cluster_ops)
		request_module("md-cluster");
8341
	spin_lock(&pers_lock);
8342
	/* ensure module won't be unloaded */
8343
	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
8344
		pr_warn("can't find md-cluster module or get it's reference.\n");
8345 8346 8347 8348 8349
		spin_unlock(&pers_lock);
		return -ENOENT;
	}
	spin_unlock(&pers_lock);

G
Goldwyn Rodrigues 已提交
8350
	return md_cluster_ops->join(mddev, nodes);
8351 8352 8353 8354
}

void md_cluster_stop(struct mddev *mddev)
{
G
Goldwyn Rodrigues 已提交
8355 8356
	if (!md_cluster_ops)
		return;
8357 8358 8359 8360
	md_cluster_ops->leave(mddev);
	module_put(md_cluster_mod);
}

8361
static int is_mddev_idle(struct mddev *mddev, int init)
L
Linus Torvalds 已提交
8362
{
8363
	struct md_rdev *rdev;
L
Linus Torvalds 已提交
8364
	int idle;
N
NeilBrown 已提交
8365
	int curr_events;
L
Linus Torvalds 已提交
8366 8367

	idle = 1;
8368 8369
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
L
Linus Torvalds 已提交
8370
		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
8371
		curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
N
NeilBrown 已提交
8372
			      atomic_read(&disk->sync_io);
8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392
		/* sync IO will cause sync_io to increase before the disk_stats
		 * as sync_io is counted when a request starts, and
		 * disk_stats is counted when it completes.
		 * So resync activity will cause curr_events to be smaller than
		 * when there was no such activity.
		 * non-sync IO will cause disk_stat to increase without
		 * increasing sync_io so curr_events will (eventually)
		 * be larger than it was before.  Once it becomes
		 * substantially larger, the test below will cause
		 * the array to appear non-idle, and resync will slow
		 * down.
		 * If there is a lot of outstanding resync activity when
		 * we set last_event to curr_events, then all that activity
		 * completing might cause the array to appear non-idle
		 * and resync will be slowed down even though there might
		 * not have been non-resync activity.  This will only
		 * happen once though.  'last_events' will soon reflect
		 * the state where there is little or no outstanding
		 * resync requests, and further resync activity will
		 * always make curr_events less than last_events.
8393
		 *
L
Linus Torvalds 已提交
8394
		 */
N
NeilBrown 已提交
8395
		if (init || curr_events - rdev->last_events > 64) {
L
Linus Torvalds 已提交
8396 8397 8398 8399
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
8400
	rcu_read_unlock();
L
Linus Torvalds 已提交
8401 8402 8403
	return idle;
}

8404
void md_done_sync(struct mddev *mddev, int blocks, int ok)
L
Linus Torvalds 已提交
8405 8406 8407 8408 8409
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
8410
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8411
		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
L
Linus Torvalds 已提交
8412 8413 8414 8415
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}
8416
EXPORT_SYMBOL(md_done_sync);
L
Linus Torvalds 已提交
8417

8418 8419
/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
8420 8421
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
8422 8423
 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed as the array is being suspend.
8424
 */
8425
bool md_write_start(struct mddev *mddev, struct bio *bi)
L
Linus Torvalds 已提交
8426
{
8427
	int did_change = 0;
8428

8429
	if (bio_data_dir(bi) != WRITE)
8430
		return true;
8431

8432 8433 8434 8435 8436 8437
	BUG_ON(mddev->ro == 1);
	if (mddev->ro == 2) {
		/* need to switch to read/write */
		mddev->ro = 0;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
8438
		md_wakeup_thread(mddev->sync_thread);
8439
		did_change = 1;
8440
	}
8441 8442
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
8443
	smp_mb(); /* Match smp_mb in set_in_sync() */
8444 8445
	if (mddev->safemode == 1)
		mddev->safemode = 0;
8446
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
N
NeilBrown 已提交
8447
	if (mddev->in_sync || mddev->sync_checkers) {
8448
		spin_lock(&mddev->lock);
8449 8450
		if (mddev->in_sync) {
			mddev->in_sync = 0;
8451 8452
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8453
			md_wakeup_thread(mddev->thread);
8454
			did_change = 1;
8455
		}
8456
		spin_unlock(&mddev->lock);
8457
	}
8458
	rcu_read_unlock();
8459
	if (did_change)
N
NeilBrown 已提交
8460
		sysfs_notify_dirent_safe(mddev->sysfs_state);
8461 8462
	if (!mddev->has_superblocks)
		return true;
8463
	wait_event(mddev->sb_wait,
8464 8465
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
		   mddev->suspended);
8466 8467 8468 8469 8470
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		percpu_ref_put(&mddev->writes_pending);
		return false;
	}
	return true;
L
Linus Torvalds 已提交
8471
}
8472
EXPORT_SYMBOL(md_write_start);
L
Linus Torvalds 已提交
8473

8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486
/* md_write_inc can only be called when md_write_start() has
 * already been called at least once of the current request.
 * It increments the counter and is useful when a single request
 * is split into several parts.  Each part causes an increment and
 * so needs a matching md_write_end().
 * Unlike md_write_start(), it is safe to call md_write_inc() inside
 * a spinlocked region.
 */
void md_write_inc(struct mddev *mddev, struct bio *bi)
{
	if (bio_data_dir(bi) != WRITE)
		return;
	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8487
	percpu_ref_get(&mddev->writes_pending);
8488 8489 8490
}
EXPORT_SYMBOL(md_write_inc);

8491
void md_write_end(struct mddev *mddev)
L
Linus Torvalds 已提交
8492
{
8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503
	percpu_ref_put(&mddev->writes_pending);

	if (mddev->safemode == 2)
		md_wakeup_thread(mddev->thread);
	else if (mddev->safemode_delay)
		/* The roundup() ensures this only performs locking once
		 * every ->safemode_delay jiffies
		 */
		mod_timer(&mddev->safemode_timer,
			  roundup(jiffies, mddev->safemode_delay) +
			  mddev->safemode_delay);
L
Linus Torvalds 已提交
8504
}
8505

8506
EXPORT_SYMBOL(md_write_end);
L
Linus Torvalds 已提交
8507

8508 8509 8510 8511 8512 8513
/* md_allow_write(mddev)
 * Calling this ensures that the array is marked 'active' so that writes
 * may proceed without blocking.  It is important to call this before
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 * Must be called with mddev_lock held.
 */
8514
void md_allow_write(struct mddev *mddev)
8515 8516
{
	if (!mddev->pers)
8517
		return;
8518
	if (mddev->ro)
8519
		return;
8520
	if (!mddev->pers->sync_request)
8521
		return;
8522

8523
	spin_lock(&mddev->lock);
8524 8525
	if (mddev->in_sync) {
		mddev->in_sync = 0;
8526 8527
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8528 8529 8530
		if (mddev->safemode_delay &&
		    mddev->safemode == 0)
			mddev->safemode = 1;
8531
		spin_unlock(&mddev->lock);
8532
		md_update_sb(mddev, 0);
N
NeilBrown 已提交
8533
		sysfs_notify_dirent_safe(mddev->sysfs_state);
8534 8535 8536
		/* wait for the dirty state to be recorded in the metadata */
		wait_event(mddev->sb_wait,
			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8537
	} else
8538
		spin_unlock(&mddev->lock);
8539 8540 8541
}
EXPORT_SYMBOL_GPL(md_allow_write);

L
Linus Torvalds 已提交
8542 8543
#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
8544
#define UPDATE_FREQUENCY (5*60*HZ)
S
Shaohua Li 已提交
8545
void md_do_sync(struct md_thread *thread)
L
Linus Torvalds 已提交
8546
{
S
Shaohua Li 已提交
8547
	struct mddev *mddev = thread->mddev;
8548
	struct mddev *mddev2;
8549
	unsigned int currspeed = 0, window;
X
Xiao Ni 已提交
8550
	sector_t max_sectors,j, io_sectors, recovery_done;
L
Linus Torvalds 已提交
8551
	unsigned long mark[SYNC_MARKS];
8552
	unsigned long update_time;
L
Linus Torvalds 已提交
8553 8554 8555 8556
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
8557
	int skipped = 0;
8558
	struct md_rdev *rdev;
8559
	char *desc, *action = NULL;
M
majianpeng 已提交
8560
	struct blk_plug plug;
8561
	int ret;
L
Linus Torvalds 已提交
8562 8563

	/* just incase thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
		return;
	if (mddev->ro) {/* never try to sync a read-only array */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return;
	}

	if (mddev_is_clustered(mddev)) {
		ret = md_cluster_ops->resync_start(mddev);
		if (ret)
			goto skip;

		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
		     && ((unsigned long long)mddev->curr_resync_completed
			 < (unsigned long long)mddev->resync_max_sectors))
			goto skip;
	}

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
			desc = "data-check";
			action = "check";
		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
			desc = "requested-resync";
			action = "repair";
		} else
			desc = "resync";
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		desc = "reshape";
	else
		desc = "recovery";

	mddev->last_sync_action = action ?: desc;

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */

	do {
		int mddev2_minor = -1;
		mddev->curr_resync = 2;

	try_again:
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			goto skip;
		for_each_mddev(mddev2, tmp) {
			if (mddev2 == mddev)
				continue;
			if (!mddev->parallel_resync
			&&  mddev2->curr_resync
			&&  match_mddev_units(mddev, mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
				/* We need to wait 'interruptible' so as not to
				 * contribute to the load average, and not to
				 * be caught by 'softlockup'
				 */
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
				    mddev2->curr_resync >= mddev->curr_resync) {
					if (mddev2_minor != mddev2->md_minor) {
						mddev2_minor = mddev2->md_minor;
						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
							desc, mdname(mddev),
							mdname(mddev2));
					}
					mddev_put(mddev2);
					if (signal_pending(current))
						flush_signals(current);
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

	j = 0;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* resync follows the size requested by the personality,
		 * which defaults to physical size, but can be virtual size
		 */
		max_sectors = mddev->resync_max_sectors;
		atomic64_set(&mddev->resync_mismatches, 0);
		/* we don't use the checkpoint if there's a bitmap */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			j = mddev->resync_min;
		else if (!mddev->bitmap)
			j = mddev->recovery_cp;

	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
		max_sectors = mddev->resync_max_sectors;
		/*
		 * If the original node aborts reshaping then we continue the
		 * reshaping, so set j again to avoid restart reshape from the
		 * first beginning
		 */
		if (mddev_is_clustered(mddev) &&
		    mddev->reshape_position != MaxSector)
			j = mddev->reshape_position;
	} else {
		/* recovery follows the physical size of devices */
		max_sectors = mddev->dev_sectors;
		j = MaxSector;
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev)
			if (rdev->raid_disk >= 0 &&
			    !test_bit(Journal, &rdev->flags) &&
			    !test_bit(Faulty, &rdev->flags) &&
			    !test_bit(In_sync, &rdev->flags) &&
			    rdev->recovery_offset < j)
				j = rdev->recovery_offset;
		rcu_read_unlock();

		/* If there is a bitmap, we need to make sure all
		 * writes that started before we added a spare
		 * complete before we start doing a recovery.
		 * Otherwise the write might complete and (via
		 * bitmap_endwrite) set a bit in the bitmap after the
		 * recovery has checked that bit and skipped that
		 * region.
		 */
		if (mddev->bitmap) {
			mddev->pers->quiesce(mddev, 1);
			mddev->pers->quiesce(mddev, 0);
		}
	}

	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
		 speed_max(mddev), desc);

	is_mddev_idle(mddev, 1); /* this initializes IO event counters */

	io_sectors = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32 * (PAGE_SIZE / 512);
	pr_debug("md: using %dk window, over a total of %lluk.\n",
		 window/2, (unsigned long long)max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	last_check = 0;

	if (j>2) {
		pr_debug("md: resuming %s of %s from checkpoint.\n",
			 desc, mdname(mddev));
		mddev->curr_resync = j;
	} else
		mddev->curr_resync = 3; /* no longer delayed */
	mddev->curr_resync_completed = j;
	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	md_new_event(mddev);
	update_time = jiffies;

	blk_start_plug(&plug);
	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;

		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed ||
		     mddev->curr_resync_completed > mddev->resync_max
			    )) {
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
			mddev->curr_resync_completed = j;
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
			    j > mddev->recovery_cp)
				mddev->recovery_cp = j;
			update_time = jiffies;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
		}

		while (j >= mddev->resync_max &&
		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
						 || test_bit(MD_RECOVERY_INTR,
							     &mddev->recovery));
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		sectors = mddev->pers->sync_request(mddev, j, &skipped);
		if (sectors == 0) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			break;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		j += sectors;
		if (j > max_sectors)
			/* when skipping, extra large numbers can be returned. */
			j = max_sectors;
		if (j > 2)
			mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;
	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		/*
		 * this loop exits only if we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();

		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > speed_min(mddev)) {
			if (currspeed > speed_max(mddev)) {
				msleep(500);
				goto repeat;
			}
			if (!is_mddev_idle(mddev, 0)) {
				/*
				 * Give other IO more of a chance.
				 * The faster the devices, the less we wait.
				 */
				wait_event(mddev->recovery_wait,
					   !atomic_read(&mddev->recovery_active));
			}
		}
	}
	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
		? "interrupted" : "done");
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
	blk_finish_plug(&plug);
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	}
	mddev->pers->sync_request(mddev, max_sectors, &skipped);

	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->recovery_cp) {
					pr_debug("md: checkpointing %s of %s.\n",
						 desc, mdname(mddev));
					if (test_bit(MD_RECOVERY_ERROR,
						&mddev->recovery))
						mddev->recovery_cp =
							mddev->curr_resync_completed;
					else
						mddev->recovery_cp =
							mddev->curr_resync;
				}
			} else
				mddev->recovery_cp = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
				rcu_read_lock();
				rdev_for_each_rcu(rdev, mddev)
					if (rdev->raid_disk >= 0 &&
					    mddev->delta_disks >= 0 &&
					    !test_bit(Journal, &rdev->flags) &&
					    !test_bit(Faulty, &rdev->flags) &&
					    !test_bit(In_sync, &rdev->flags) &&
					    rdev->recovery_offset < mddev->curr_resync)
						rdev->recovery_offset = mddev->curr_resync;
				rcu_read_unlock();
			}
		}
	}
 skip:
	/* set CHANGE_PENDING here since maybe another update is needed,
	 * so other nodes are informed. It should be harmless for normal
	 * raid */
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
			mddev->delta_disks > 0 &&
			mddev->pers->finish_reshape &&
			mddev->pers->size &&
			mddev->queue) {
		mddev_lock_nointr(mddev);
		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
		mddev_unlock(mddev);
		if (!mddev_is_clustered(mddev)) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}

	spin_lock(&mddev->lock);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		mddev->resync_min = mddev->curr_resync_completed;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	mddev->curr_resync = 0;
	spin_unlock(&mddev->lock);

	wake_up(&resync_wait);
	md_wakeup_thread(mddev->thread);
	return;
}
EXPORT_SYMBOL_GPL(md_do_sync);

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this)
{
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;
	bool remove_some = false;

	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		/* Mustn't remove devices when resync thread is running */
		return 0;

	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    test_bit(Faulty, &rdev->flags) &&
		    atomic_read(&rdev->nr_pending)==0) {
			/* Faulty non-Blocked devices with nr_pending == 0
			 * never get nr_pending incremented,
			 * never get Faulty cleared, and never get Blocked set.
			 * So we can synchronize_rcu now rather than once per device
			 */
			remove_some = true;
			set_bit(RemoveSynchronized, &rdev->flags);
		}
	}

	if (remove_some)
		synchronize_rcu();
	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
		     (!test_bit(In_sync, &rdev->flags) &&
		      !test_bit(Journal, &rdev->flags))) &&
		    atomic_read(&rdev->nr_pending)==0)) {
			if (mddev->pers->hot_remove_disk(
				    mddev, rdev) == 0) {
				sysfs_unlink_rdev(mddev, rdev);
				rdev->saved_raid_disk = rdev->raid_disk;
				rdev->raid_disk = -1;
				removed++;
			}
		}
		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
			clear_bit(RemoveSynchronized, &rdev->flags);
	}

	if (removed && mddev->kobj.sd)
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	if (this && removed)
		goto no_add;

	rdev_for_each(rdev, mddev) {
		if (this && this != rdev)
			continue;
		if (test_bit(Candidate, &rdev->flags))
			continue;
		if (rdev->raid_disk >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0)
			continue;
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (!test_bit(Journal, &rdev->flags)) {
			if (mddev->ro &&
			    ! (rdev->saved_raid_disk >= 0 &&
			       !test_bit(Bitmap_sync, &rdev->flags)))
				continue;

			rdev->recovery_offset = 0;
		}
		if (mddev->pers->
		    hot_add_disk(mddev, rdev) == 0) {
			if (sysfs_link_rdev(mddev, rdev))
				/* failure here is OK */;
			if (!test_bit(Journal, &rdev->flags))
				spares++;
			md_new_event(mddev);
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
no_add:
	if (removed)
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	return spares;
}

static void md_start_sync(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);

	mddev->sync_thread = md_register_thread(md_do_sync,
						mddev,
						"resync");
	if (!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		wake_up(&resync_wait);
		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
				       &mddev->recovery))
			if (mddev->sysfs_action)
				sysfs_notify_dirent_safe(mddev->sysfs_action);
	} else
		md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
		/* Write superblock - thread that called mddev_suspend()
		 * holds reconfig_mutex for us.
		 */
		set_bit(MD_UPDATING_SB, &mddev->flags);
		smp_mb__after_atomic();
		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
			md_update_sb(mddev, 0);
		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
		wake_up(&mddev->sb_wait);
	}

	if (mddev->suspended)
		return;

	if (mddev->bitmap)
		md_bitmap_daemon_work(mddev);

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	if ( ! (
		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		int spares = 0;
		bool try_set_sync = mddev->safemode != 0;

		if (!mddev->external && mddev->safemode == 1)
			mddev->safemode = 0;

		if (mddev->ro) {
			struct md_rdev *rdev;
			if (!mddev->external && mddev->in_sync)
				/* 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);
			/* On a read-only array we can:
			 * - remove failed devices
			 * - add already-in_sync devices if the array itself
			 *   is in-sync.
			 * As we only add devices that are already in-sync,
			 * we can activate the spares immediately.
			 */
			remove_and_add_spares(mddev, NULL);
			/* There is no thread, but we need to call
			 * ->spare_active and clear saved_raid_disk
			 */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			goto unlock;
		}

		if (mddev_is_clustered(mddev)) {
			struct md_rdev *rdev;
			/* kick the device if another node issued a
			 * remove disk.
			 */
			rdev_for_each(rdev, mddev) {
				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
						rdev->raid_disk < 0)
					md_kick_rdev_from_array(rdev);
			}
		}

		if (try_set_sync && !mddev->external && !mddev->in_sync) {
			spin_lock(&mddev->lock);
			set_in_sync(mddev);
			spin_unlock(&mddev->lock);
		}

		if (mddev->sb_flags)
			md_update_sb(mddev, 0);

		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			md_reap_sync_thread(mddev);
			goto unlock;
		}
		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
		mddev->curr_resync_completed = 0;
		spin_lock(&mddev->lock);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		spin_unlock(&mddev->lock);
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
			goto not_running;
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */

		if (mddev->reshape_position != MaxSector) {
			if (mddev->pers->check_reshape == NULL ||
			    mddev->pers->check_reshape(mddev) != 0)
				/* Cannot proceed */
				goto not_running;
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto not_running;

		if (mddev->pers->sync_request) {
			if (spares) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				md_bitmap_write_all(mddev->bitmap);
			}
			INIT_WORK(&mddev->del_work, md_start_sync);
			queue_work(md_misc_wq, &mddev->del_work);
			goto unlock;
		}
	not_running:
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			wake_up(&resync_wait);
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
				if (mddev->sysfs_action)
					sysfs_notify_dirent_safe(mddev->sysfs_action);
		}
	unlock:
		wake_up(&mddev->sb_wait);
		mddev_unlock(mddev);
	}
}
EXPORT_SYMBOL(md_check_recovery);
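
/*
 * Calling-pattern sketch (illustrative only): a personality's daemon thread
 * normally invokes md_check_recovery() on every wakeup, before servicing its
 * own retry and IO lists; raid1d() and raid5d() follow this pattern.  The
 * thread function named below is hypothetical:
 *
 *	static void exampled(struct md_thread *thread)
 *	{
 *		struct mddev *mddev = thread->mddev;
 *
 *		md_check_recovery(mddev);
 *		// ... then handle the personality's own deferred work ...
 *	}
 */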

void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;
	sector_t old_dev_sectors = mddev->dev_sectors;
	bool is_reshaped = false;

	/* resync has finished, collect result */
	md_unregister_thread(&mddev->sync_thread);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    mddev->degraded != mddev->raid_disks) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify(&mddev->kobj, NULL,
				     "degraded");
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape) {
		mddev->pers->finish_reshape(mddev);
		if (mddev_is_clustered(mddev))
			is_reshaped = true;
	}

	/* If array is no-longer degraded, then any saved_raid_disk
	 * information must be scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/*
	 * We call md_cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		md_cluster_ops->update_size(mddev, old_dev_sectors);
	wake_up(&resync_wait);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
}
EXPORT_SYMBOL(md_reap_sync_thread);

void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);
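
/*
 * Usage sketch (an assumption based on the rdev_dec_pending() call above):
 * the caller is expected to already hold a reference on the rdev, which this
 * helper drops once the device is no longer Blocked, so a write path would
 * look roughly like:
 *
 *	if (test_bit(Blocked, &rdev->flags)) {
 *		atomic_inc(&rdev->nr_pending);
 *		md_wait_for_blocked_rdev(rdev, mddev);
 *	}
 */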

void md_finish_reshape(struct mddev *mddev)
{
	/* called by personality module when reshape completes. */
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);

/* Bad block management */

/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
		       int is_new)
{
	struct mddev *mddev = rdev->mddev;
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
	if (rv == 0) {
		/* Make sure they get written out promptly */
		if (test_bit(ExternalBbl, &rdev->flags))
			sysfs_notify(&rdev->kobj, NULL,
				     "unacknowledged_bad_blocks");
		sysfs_notify_dirent_safe(rdev->sysfs_state);
		set_mask_bits(&mddev->sb_flags, 0,
			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
		md_wakeup_thread(rdev->mddev->thread);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
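
/*
 * Usage sketch (illustrative, not prescriptive): a personality that fails a
 * write to part of a device typically tries to record the bad range first,
 * and only fails the whole device when that is not possible:
 *
 *	if (!rdev_set_badblocks(rdev, sector, nr_sectors, 0))
 *		md_error(mddev, rdev);
 */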

int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			 int is_new)
{
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_clear(&rdev->badblocks, s, sectors);
	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
	return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	struct mddev *mddev;
	int need_delay = 0;

	for_each_mddev(mddev, tmp) {
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
	}
	/*
	 * certain more exotic SCSI devices are known to be
	 * volatile wrt too early system reboots. While the
	 * right place to handle this issue is the given
	 * driver, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		mdelay(1000*1);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
		goto err_md;

	if ((ret = register_blkdev(0, "mdp")) < 0)
		goto err_mdp;
	mdp_major = ret;

	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2;
	int role, ret;
	char b[BDEVNAME_SIZE];

	/*
	 * If size is changed in another node then we need to
	 * do resize as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			md_bitmap_update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each(rdev2, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == 0xfffe) {
				pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
				md_kick_rdev_from_array(rdev2);
				continue;
			}
			else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * got activated except reshape is happening.
			 */
			if (rdev2->raid_disk == -1 && role != 0xffff &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE)) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %s\n",
					bdevname(rdev2->bdev,b));
				/* wakeup mddev->thread here, so array could
				 * perform resync with the new activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* device faulty
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * one who initiated the error.
			 */
			if ((role == 0xfffe) || (role == 0xfffd)) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
		update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));

	/*
	 * Since mddev->delta_disks has already been updated in update_raid_disks,
	 * it is time to check reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * reshape is happening in the remote node, we need to
		 * update reshape_position and call start_reshape.
		 */
9578
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* reshape is just done in another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
				__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set
	 */

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * device In_sync and mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(rdev, mddev) {
		if (rdev->desc_nr == nr)
			break;
	}

	if (!rdev || rdev->desc_nr != nr) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdev's to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

static void autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev,0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev;
	struct list_head *tmp;
	int delay = 1;

	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);

	unregister_blkdev(MD_MAJOR,"md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	for_each_mddev(mddev, tmp) {
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * for_each_mddev() will call mddev_put() at the end of each
		 * iteration.  As the mddev is now fully clear, this will
		 * schedule the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
	}
	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);