提交 2943c833 编写于 作者: L Linus Torvalds

Merge tag 'md-3.3' of git://neil.brown.name/md

md update for 3.3

Big change is new hot-replacement.
A slot in an array can hold 2 devices - one that
wants-replacement and one that is the replacement.
Once the replacement is built - either from the
original or (in the case of errors) from elsewhere,
the wants-replacement device will be removed.

* tag 'md-3.3' of git://neil.brown.name/md: (36 commits)
  md/raid1: Mark device want_replacement when we see a write error.
  md/raid1: If there is a spare and a want_replacement device, start replacement.
  md/raid1: recognise replacements when assembling arrays.
  md/raid1: handle activation of replacement device when recovery completes.
  md/raid1: Allow a failed replacement device to be removed.
  md/raid1: Allocate spare to store replacement devices and their bios.
  md/raid1:  Replace use of mddev->raid_disks with conf->raid_disks.
  md/raid10: If there is a spare and a want_replacement device, start replacement.
  md/raid10: recognise replacements when assembling array.
  md/raid10: Allow replacement device to be replace old drive.
  md/raid10: handle recovery of replacement devices.
  md/raid10:  Handle replacement devices during resync.
  md/raid10: writes should get directed to replacement as well as original.
  md/raid10: allow removal of failed replacement devices.
  md/raid10: preferentially read from replacement device if possible.
  md/raid10:  change read_balance to return an rdev
  md/raid10: prepare data structures for handling replacement.
  md/raid5: Mark device want_replacement when we see a write error.
  md/raid5: If there is a spare and a want_replacement device, start replacement.
  md/raid5: recognise replacements when assembling array.
  ...
......@@ -357,14 +357,14 @@ Each directory contains:
written to, that device.
state
A file recording the current state of the device in the array
A file recording the current state of the device in the array
which can be a comma separated list of
faulty - device has been kicked from active use due to
a detected fault or it has unacknowledged bad
blocks
a detected fault, or it has unacknowledged bad
blocks
in_sync - device is a fully in-sync member of the array
writemostly - device will only be subject to read
requests if there are no other options.
requests if there are no other options.
This applies only to raid1 arrays.
blocked - device has failed, and the failure hasn't been
acknowledged yet by the metadata handler.
......@@ -374,6 +374,13 @@ Each directory contains:
This includes spares that are in the process
of being recovered to
write_error - device has ever seen a write error.
want_replacement - device is (mostly) working but probably
should be replaced, either due to errors or
due to user request.
replacement - device is a replacement for another active
device with same raid_disk.
This list may grow in future.
This can be written to.
Writing "faulty" simulates a failure on the device.
......@@ -386,6 +393,13 @@ Each directory contains:
Writing "in_sync" sets the in_sync flag.
Writing "write_error" sets writeerrorseen flag.
Writing "-write_error" clears writeerrorseen flag.
Writing "want_replacement" is allowed at any time except to a
replacement device or a spare. It sets the flag.
Writing "-want_replacement" is allowed at any time. It clears
the flag.
Writing "replacement" or "-replacement" is only allowed before
starting the array. It sets or clears the flag.
This file responds to select/poll. Any change to 'faulty'
or 'blocked' causes an event.
......
......@@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev)
return;
}
if (time_before(jiffies, bitmap->daemon_lastrun
+ bitmap->mddev->bitmap_info.daemon_sleep))
+ mddev->bitmap_info.daemon_sleep))
goto done;
bitmap->daemon_lastrun = jiffies;
if (bitmap->allclean) {
bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
goto done;
}
bitmap->allclean = 1;
......@@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev)
* sure that events_cleared is up-to-date.
*/
if (bitmap->need_sync &&
bitmap->mddev->bitmap_info.external == 0) {
mddev->bitmap_info.external == 0) {
bitmap_super_t *sb;
bitmap->need_sync = 0;
sb = kmap_atomic(bitmap->sb_page, KM_USER0);
......@@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev)
done:
if (bitmap->allclean == 0)
bitmap->mddev->thread->timeout =
bitmap->mddev->bitmap_info.daemon_sleep;
mddev->thread->timeout =
mddev->bitmap_info.daemon_sleep;
mutex_unlock(&mddev->bitmap_info.mutex);
}
......@@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
}
if (!*bmc) {
struct page *page;
*bmc = 1 | (needed ? NEEDED_MASK : 0);
*bmc = 2 | (needed ? NEEDED_MASK : 0);
bitmap_count_page(bitmap, offset, 1);
page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
......
......@@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
}
if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
......@@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->recovery_offset =
cpu_to_le64(rdev->recovery_offset);
}
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);
if (mddev->reshape_position != MaxSector) {
sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
......@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%swrite_error", sep);
sep = ",";
}
if (test_bit(WantReplacement, &rdev->flags)) {
len += sprintf(page+len, "%swant_replacement", sep);
sep = ",";
}
if (test_bit(Replacement, &rdev->flags)) {
len += sprintf(page+len, "%sreplacement", sep);
sep = ",";
}
return len+sprintf(page+len, "\n");
}
......@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "-write_error")) {
clear_bit(WriteErrorSeen, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "want_replacement")) {
/* Any non-spare device that is not a replacement can
* become want_replacement at any time, but we then need to
* check if recovery is needed.
*/
if (rdev->raid_disk >= 0 &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
err = 0;
} else if (cmd_match(buf, "-want_replacement")) {
/* Clearing 'want_replacement' is always allowed.
* Once replacements starts it is too late though.
*/
err = 0;
clear_bit(WantReplacement, &rdev->flags);
} else if (cmd_match(buf, "replacement")) {
/* Can only set a device as a replacement when array has not
* yet been started. Once running, replacement is automatic
* from spares, or by assigning 'slot'.
*/
if (rdev->mddev->pers)
err = -EBUSY;
else {
set_bit(Replacement, &rdev->flags);
err = 0;
}
} else if (cmd_match(buf, "-replacement")) {
/* Similarly, can only clear Replacement before start */
if (rdev->mddev->pers)
err = -EBUSY;
else {
clear_bit(Replacement, &rdev->flags);
err = 0;
}
}
if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state);
......@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_remove_disk == NULL)
return -EINVAL;
err = rdev->mddev->pers->
hot_remove_disk(rdev->mddev, rdev->raid_disk);
hot_remove_disk(rdev->mddev, rdev);
if (err)
return err;
sysfs_unlink_rdev(rdev->mddev, rdev);
......@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) {
struct md_rdev *rdev2;
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
*/
......@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_add_disk == NULL)
return -EINVAL;
list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
if (rdev2->raid_disk == slot)
return -EEXIST;
if (slot >= rdev->mddev->raid_disks &&
slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
return -ENOSPC;
......@@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
struct mddev *mddev = NULL;
int ro;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
switch (cmd) {
case RAID_VERSION:
case GET_ARRAY_INFO:
case GET_DISK_INFO:
break;
default:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
}
/*
* Commands dealing with the RAID driver but not any
......@@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)");
continue;
} else if (rdev->raid_disk < 0)
}
if (rdev->raid_disk < 0)
seq_printf(seq, "(S)"); /* spare */
if (test_bit(Replacement, &rdev->flags))
seq_printf(seq, "(R)");
sectors += rdev->sectors;
}
......@@ -7337,29 +7392,27 @@ static int remove_and_add_spares(struct mddev *mddev)
! test_bit(In_sync, &rdev->flags)) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) {
mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
}
if (mddev->degraded) {
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
spares++;
if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
spares++;
if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
spares++;
md_new_event(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
md_new_event(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
}
......@@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev)
test_bit(Faulty, &rdev->flags) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) {
mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
......
......@@ -72,34 +72,7 @@ struct md_rdev {
* This reduces the burden of testing multiple flags in many cases
*/
unsigned long flags;
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
#define AutoDetected 7 /* added by auto-detect */
#define Blocked 8 /* An error occurred but has not yet
* been acknowledged by the metadata
* handler, so don't allow writes
* until it is cleared */
#define WriteErrorSeen 9 /* A write error has been seen on this
* device
*/
#define FaultRecorded 10 /* Intermediate state for clearing
* Blocked. The Fault is/will-be
* recorded in the metadata, but that
* metadata hasn't been stored safely
* on disk yet.
*/
#define BlockedBadBlocks 11 /* A writer is blocked because they
* found an unacknowledged bad-block.
* This can safely be cleared at any
* time, and the writer will re-check.
* It may be set at any time, and at
* worst the writer will timeout and
* re-check. So setting it as
* accurately as possible is good, but
* not absolutely critical.
*/
unsigned long flags; /* bit set of 'enum flag_bits' bits. */
wait_queue_head_t blocked_wait;
int desc_nr; /* descriptor index in the superblock */
......@@ -152,6 +125,44 @@ struct md_rdev {
sector_t size; /* in sectors */
} badblocks;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
In_sync, /* device is in_sync with rest of array */
WriteMostly, /* Avoid reading if at all possible */
AutoDetected, /* added by auto-detect */
Blocked, /* An error occurred but has not yet
* been acknowledged by the metadata
* handler, so don't allow writes
* until it is cleared */
WriteErrorSeen, /* A write error has been seen on this
* device
*/
FaultRecorded, /* Intermediate state for clearing
* Blocked. The Fault is/will-be
* recorded in the metadata, but that
* metadata hasn't been stored safely
* on disk yet.
*/
BlockedBadBlocks, /* A writer is blocked because they
* found an unacknowledged bad-block.
* This can safely be cleared at any
* time, and the writer will re-check.
* It may be set at any time, and at
* worst the writer will timeout and
* re-check. So setting it as
* accurately as possible is good, but
* not absolutely critical.
*/
WantReplacement, /* This device is a candidate to be
* hot-replaced, either because it has
* reported some faults, or because
* of explicit request.
*/
Replacement, /* This device is a replacement for
* a want_replacement device with same
* raid_disk number.
*/
};
#define BB_LEN_MASK (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
......@@ -428,7 +439,7 @@ struct md_personality
*/
void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*hot_remove_disk) (struct mddev *mddev, int number);
int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*spare_active) (struct mddev *mddev);
sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
int (*resize) (struct mddev *mddev, sector_t sectors);
......@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
if (!test_bit(Replacement, &rdev->flags)) {
sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
return 0;
}
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
if (!test_bit(Replacement, &rdev->flags)) {
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
}
/*
......
......@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
return err;
}
static int multipath_remove_disk(struct mddev *mddev, int number)
static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct mpconf *conf = mddev->private;
int err = 0;
struct md_rdev *rdev;
int number = rdev->raid_disk;
struct multipath_info *p = conf->multipaths + number;
print_multipath_conf(conf);
rdev = p->rdev;
if (rdev) {
if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
printk(KERN_ERR "hot-remove-disk, slot %d is identified"
......
......@@ -135,7 +135,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
j = -1;
out_free_bio:
while ( ++j < pi->raid_disks )
while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
r1bio_pool_free(r1_bio, data);
return NULL;
......@@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
{
int i;
for (i = 0; i < conf->raid_disks; i++) {
for (i = 0; i < conf->raid_disks * 2; i++) {
struct bio **bio = r1_bio->bios + i;
if (!BIO_SPECIAL(*bio))
bio_put(*bio);
......@@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio)
struct r1conf *conf = r1_bio->mddev->private;
int i;
for (i=0; i<conf->raid_disks; i++) {
for (i = 0; i < conf->raid_disks * 2; i++) {
struct bio *bio = r1_bio->bios[i];
if (bio->bi_end_io)
rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
......@@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio)
static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
{
int mirror;
int raid_disks = r1_bio->mddev->raid_disks;
struct r1conf *conf = r1_bio->mddev->private;
int raid_disks = conf->raid_disks;
for (mirror = 0; mirror < raid_disks; mirror++)
for (mirror = 0; mirror < raid_disks * 2; mirror++)
if (r1_bio->bios[mirror] == bio)
break;
BUG_ON(mirror == raid_disks);
BUG_ON(mirror == raid_disks * 2);
update_head_pos(mirror, r1_bio);
return mirror;
......@@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
if (!uptodate) {
set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags);
if (!test_and_set_bit(WantReplacement,
&conf->mirrors[mirror].rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
conf->mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state);
} else {
/*
......@@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
start_disk = conf->last_used;
}
for (i = 0 ; i < conf->raid_disks ; i++) {
for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
sector_t dist;
sector_t first_bad;
int bad_sectors;
......@@ -609,7 +615,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
return 1;
rcu_read_lock();
for (i = 0; i < mddev->raid_disks; i++) {
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
......@@ -974,7 +980,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
*/
plugged = mddev_check_plugged(mddev);
disks = conf->raid_disks;
disks = conf->raid_disks * 2;
retry_write:
blocked_rdev = NULL;
rcu_read_lock();
......@@ -988,7 +994,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
}
r1_bio->bios[i] = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) {
set_bit(R1BIO_Degraded, &r1_bio->state);
if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
}
......@@ -1263,6 +1270,25 @@ static int raid1_spare_active(struct mddev *mddev)
*/
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
if (repl
&& repl->recovery_offset == MaxSector
&& !test_bit(Faulty, &repl->flags)
&& !test_and_set_bit(In_sync, &repl->flags)) {
/* replacement has just become active */
if (!rdev ||
!test_and_clear_bit(In_sync, &rdev->flags))
count++;
if (rdev) {
/* Replaced device not technically
* faulty, but we need to be sure
* it gets removed and never re-added
*/
set_bit(Faulty, &rdev->flags);
sysfs_notify_dirent_safe(
rdev->sysfs_state);
}
}
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_and_set_bit(In_sync, &rdev->flags)) {
......@@ -1286,7 +1312,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int mirror = 0;
struct mirror_info *p;
int first = 0;
int last = mddev->raid_disks - 1;
int last = conf->raid_disks - 1;
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
......@@ -1294,8 +1320,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
for (mirror = first; mirror <= last; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
......@@ -1322,21 +1349,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
p[conf->raid_disks].rdev == NULL) {
/* Add this device as a replacement */
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = mirror;
err = 0;
conf->fullsync = 1;
rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
break;
}
}
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
}
static int raid1_remove_disk(struct mddev *mddev, int number)
static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r1conf *conf = mddev->private;
int err = 0;
struct md_rdev *rdev;
int number = rdev->raid_disk;
struct mirror_info *p = conf->mirrors+ number;
if (rdev != p->rdev)
p = conf->mirrors + conf->raid_disks + number;
print_conf(conf);
rdev = p->rdev;
if (rdev) {
if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
err = -EBUSY;
......@@ -1358,7 +1399,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number)
err = -EBUSY;
p->rdev = rdev;
goto abort;
}
} else if (conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before
* doing this to avoid confusion.
*/
struct md_rdev *repl =
conf->mirrors[conf->raid_disks + number].rdev;
raise_barrier(conf);
clear_bit(Replacement, &repl->flags);
p->rdev = repl;
conf->mirrors[conf->raid_disks + number].rdev = NULL;
lower_barrier(conf);
clear_bit(WantReplacement, &rdev->flags);
} else
clear_bit(WantReplacement, &rdev->flags);
err = md_integrity_register(mddev);
}
abort:
......@@ -1411,6 +1466,10 @@ static void end_sync_write(struct bio *bio, int error)
} while (sectors_to_go > 0);
set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags);
if (!test_and_set_bit(WantReplacement,
&conf->mirrors[mirror].rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state);
} else if (is_badblock(conf->mirrors[mirror].rdev,
r1_bio->sector,
......@@ -1441,8 +1500,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
/* success */
return 1;
if (rw == WRITE)
if (rw == WRITE) {
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement,
&rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
rdev->mddev->recovery);
}
/* need to record an error - either for the block or the device */
if (!rdev_set_badblocks(rdev, sector, sectors, 0))
md_error(rdev->mddev, rdev);
......@@ -1493,7 +1557,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
}
}
d++;
if (d == conf->raid_disks)
if (d == conf->raid_disks * 2)
d = 0;
} while (!success && d != r1_bio->read_disk);
......@@ -1510,7 +1574,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
mdname(mddev),
bdevname(bio->bi_bdev, b),
(unsigned long long)r1_bio->sector);
for (d = 0; d < conf->raid_disks; d++) {
for (d = 0; d < conf->raid_disks * 2; d++) {
rdev = conf->mirrors[d].rdev;
if (!rdev || test_bit(Faulty, &rdev->flags))
continue;
......@@ -1536,7 +1600,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
/* write it back and re-read */
while (d != r1_bio->read_disk) {
if (d == 0)
d = conf->raid_disks;
d = conf->raid_disks * 2;
d--;
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
......@@ -1551,7 +1615,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
d = start;
while (d != r1_bio->read_disk) {
if (d == 0)
d = conf->raid_disks;
d = conf->raid_disks * 2;
d--;
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
......@@ -1584,7 +1648,7 @@ static int process_checks(struct r1bio *r1_bio)
int primary;
int i;
for (primary = 0; primary < conf->raid_disks; primary++)
for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
r1_bio->bios[primary]->bi_end_io = NULL;
......@@ -1592,7 +1656,7 @@ static int process_checks(struct r1bio *r1_bio)
break;
}
r1_bio->read_disk = primary;
for (i = 0; i < conf->raid_disks; i++) {
for (i = 0; i < conf->raid_disks * 2; i++) {
int j;
int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
struct bio *pbio = r1_bio->bios[primary];
......@@ -1656,7 +1720,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
{
struct r1conf *conf = mddev->private;
int i;
int disks = conf->raid_disks;
int disks = conf->raid_disks * 2;
struct bio *bio, *wbio;
bio = r1_bio->bios[r1_bio->read_disk];
......@@ -1737,7 +1801,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
success = 1;
else {
d++;
if (d == conf->raid_disks)
if (d == conf->raid_disks * 2)
d = 0;
}
} while (!success && d != read_disk);
......@@ -1753,7 +1817,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
start = d;
while (d != read_disk) {
if (d==0)
d = conf->raid_disks;
d = conf->raid_disks * 2;
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
......@@ -1765,7 +1829,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
while (d != read_disk) {
char b[BDEVNAME_SIZE];
if (d==0)
d = conf->raid_disks;
d = conf->raid_disks * 2;
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
......@@ -1887,7 +1951,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
{
int m;
int s = r1_bio->sectors;
for (m = 0; m < conf->raid_disks ; m++) {
for (m = 0; m < conf->raid_disks * 2 ; m++) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
struct bio *bio = r1_bio->bios[m];
if (bio->bi_end_io == NULL)
......@@ -1909,7 +1973,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
for (m = 0; m < conf->raid_disks ; m++)
for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
rdev_clear_badblocks(rdev,
......@@ -2184,7 +2248,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
r1_bio->state = 0;
set_bit(R1BIO_IsSync, &r1_bio->state);
for (i=0; i < conf->raid_disks; i++) {
for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev;
bio = r1_bio->bios[i];
......@@ -2203,7 +2267,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) {
still_degraded = 1;
if (i < conf->raid_disks)
still_degraded = 1;
} else if (!test_bit(In_sync, &rdev->flags)) {
bio->bi_rw = WRITE;
bio->bi_end_io = end_sync_write;
......@@ -2254,7 +2319,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
* need to mark them bad on all write targets
*/
int ok = 1;
for (i = 0 ; i < conf->raid_disks ; i++)
for (i = 0 ; i < conf->raid_disks * 2 ; i++)
if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
struct md_rdev *rdev =
rcu_dereference(conf->mirrors[i].rdev);
......@@ -2323,7 +2388,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
len = sync_blocks<<9;
}
for (i=0 ; i < conf->raid_disks; i++) {
for (i = 0 ; i < conf->raid_disks * 2; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io) {
page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
......@@ -2356,7 +2421,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
*/
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
atomic_set(&r1_bio->remaining, read_targets);
for (i=0; i<conf->raid_disks; i++) {
for (i = 0; i < conf->raid_disks * 2; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
md_sync_acct(bio->bi_bdev, nr_sectors);
......@@ -2393,7 +2458,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf)
goto abort;
conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
conf->mirrors = kzalloc(sizeof(struct mirror_info)
* mddev->raid_disks * 2,
GFP_KERNEL);
if (!conf->mirrors)
goto abort;
......@@ -2405,7 +2471,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
if (!conf->poolinfo)
goto abort;
conf->poolinfo->raid_disks = mddev->raid_disks;
conf->poolinfo->raid_disks = mddev->raid_disks * 2;
conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
r1bio_pool_free,
conf->poolinfo);
......@@ -2414,14 +2480,20 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->poolinfo->mddev = mddev;
err = -EINVAL;
spin_lock_init(&conf->device_lock);
list_for_each_entry(rdev, &mddev->disks, same_set) {
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
continue;
disk = conf->mirrors + disk_idx;
if (test_bit(Replacement, &rdev->flags))
disk = conf->mirrors + conf->raid_disks + disk_idx;
else
disk = conf->mirrors + disk_idx;
if (disk->rdev)
goto abort;
disk->rdev = rdev;
disk->head_position = 0;
......@@ -2437,11 +2509,27 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->pending_count = 0;
conf->recovery_disabled = mddev->recovery_disabled - 1;
err = -EIO;
conf->last_used = -1;
for (i = 0; i < conf->raid_disks; i++) {
for (i = 0; i < conf->raid_disks * 2; i++) {
disk = conf->mirrors + i;
if (i < conf->raid_disks &&
disk[conf->raid_disks].rdev) {
/* This slot has a replacement. */
if (!disk->rdev) {
/* No original, just make the replacement
* a recovering spare
*/
disk->rdev =
disk[conf->raid_disks].rdev;
disk[conf->raid_disks].rdev = NULL;
} else if (!test_bit(In_sync, &disk->rdev->flags))
/* Original is not in_sync - bad */
goto abort;
}
if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
......@@ -2455,7 +2543,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->last_used = i;
}
err = -EIO;
if (conf->last_used < 0) {
printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
mdname(mddev));
......@@ -2665,7 +2752,7 @@ static int raid1_reshape(struct mddev *mddev)
if (!newpoolinfo)
return -ENOMEM;
newpoolinfo->mddev = mddev;
newpoolinfo->raid_disks = raid_disks;
newpoolinfo->raid_disks = raid_disks * 2;
newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
r1bio_pool_free, newpoolinfo);
......@@ -2673,7 +2760,8 @@ static int raid1_reshape(struct mddev *mddev)
kfree(newpoolinfo);
return -ENOMEM;
}
newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
GFP_KERNEL);
if (!newmirrors) {
kfree(newpoolinfo);
mempool_destroy(newpool);
......
......@@ -12,6 +12,9 @@ struct mirror_info {
* pool was allocated for, so they know how much to allocate and free.
* mddev->raid_disks cannot be used, as it can change while a pool is active
* These two datums are stored in a kmalloced struct.
* The 'raid_disks' here is twice the raid_disks in r1conf.
* This allows space for each 'real' device can have a replacement in the
* second half of the array.
*/
struct pool_info {
......@@ -21,7 +24,9 @@ struct pool_info {
struct r1conf {
struct mddev *mddev;
struct mirror_info *mirrors;
struct mirror_info *mirrors; /* twice 'raid_disks' to
* allow for replacements.
*/
int raid_disks;
/* When choose the best device for a read (read_balance())
......
此差异已折叠。
......@@ -2,7 +2,7 @@
#define _RAID10_H
struct mirror_info {
struct md_rdev *rdev;
struct md_rdev *rdev, *replacement;
sector_t head_position;
int recovery_disabled; /* matches
* mddev->recovery_disabled
......@@ -18,12 +18,13 @@ struct r10conf {
spinlock_t device_lock;
/* geometry */
int near_copies; /* number of copies laid out raid0 style */
int near_copies; /* number of copies laid out
* raid0 style */
int far_copies; /* number of copies laid out
* at large strides across drives
*/
int far_offset; /* far_copies are offset by 1 stripe
* instead of many
int far_offset; /* far_copies are offset by 1
* stripe instead of many
*/
int copies; /* near_copies * far_copies.
* must be <= raid_disks
......@@ -34,10 +35,11 @@ struct r10conf {
* 1 stripe.
*/
sector_t dev_sectors; /* temp copy of mddev->dev_sectors */
sector_t dev_sectors; /* temp copy of
* mddev->dev_sectors */
int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask;
int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask;
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
......@@ -45,20 +47,22 @@ struct r10conf {
int pending_count;
spinlock_t resync_lock;
int nr_pending;
int nr_waiting;
int nr_queued;
int barrier;
int nr_pending;
int nr_waiting;
int nr_queued;
int barrier;
sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
int have_replacement; /* There is at least one
* replacement device.
*/
wait_queue_head_t wait_barrier;
mempool_t *r10bio_pool;
mempool_t *r10buf_pool;
mempool_t *r10bio_pool;
mempool_t *r10buf_pool;
struct page *tmppage;
/* When taking over an array from a different personality, we store
......@@ -98,11 +102,18 @@ struct r10bio {
* When resyncing we also use one for each copy.
* When reconstructing, we use 2 bios, one for read, one for write.
* We choose the number when they are allocated.
* We sometimes need an extra bio to write to the replacement.
*/
struct {
struct bio *bio;
sector_t addr;
int devnum;
struct bio *bio;
union {
struct bio *repl_bio; /* used for resync and
* writes */
struct md_rdev *rdev; /* used for reads
* (read_slot >= 0) */
};
sector_t addr;
int devnum;
} devs[0];
};
......@@ -121,17 +132,19 @@ struct r10bio {
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* bits for r10bio.state */
#define R10BIO_Uptodate 0
#define R10BIO_IsSync 1
#define R10BIO_IsRecover 2
#define R10BIO_Degraded 3
enum r10bio_state {
R10BIO_Uptodate,
R10BIO_IsSync,
R10BIO_IsRecover,
R10BIO_Degraded,
/* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them.
*/
#define R10BIO_ReadError 4
R10BIO_ReadError,
/* If a write for this request means we can clear some
* known-bad-block records, we set this flag.
*/
#define R10BIO_MadeGood 5
#define R10BIO_WriteError 6
R10BIO_MadeGood,
R10BIO_WriteError,
};
#endif
此差异已折叠。
......@@ -27,7 +27,7 @@
* The possible state transitions are:
*
* Empty -> Want - on read or write to get old data for parity calc
* Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
* Empty -> Dirty - on compute_parity to satisfy write/sync request.
* Empty -> Clean - on compute_block when computing a block for failed drive
* Want -> Empty - on failed read
* Want -> Clean - on successful completion of read request
......@@ -226,8 +226,11 @@ struct stripe_head {
#endif
} ops;
struct r5dev {
struct bio req;
struct bio_vec vec;
/* rreq and rvec are used for the replacement device when
* writing data to both devices.
*/
struct bio req, rreq;
struct bio_vec vec, rvec;
struct page *page;
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
......@@ -239,7 +242,13 @@ struct stripe_head {
* for handle_stripe.
*/
struct stripe_head_state {
int syncing, expanding, expanded;
/* 'syncing' means that we need to read all devices, either
* to check/correct parity, or to reconstruct a missing device.
* 'replacing' means we are replacing one or more drives and
* the source is valid at this point so we don't need to
* read all devices, just the replacement targets.
*/
int syncing, expanding, expanded, replacing;
int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite;
int failed_num[2];
......@@ -252,38 +261,41 @@ struct stripe_head_state {
int handle_bad_blocks;
};
/* Flags */
#define R5_UPTODATE 0 /* page contains current data */
#define R5_LOCKED 1 /* IO has been submitted on "req" */
#define R5_OVERWRITE 2 /* towrite covers whole page */
/* Flags for struct r5dev.flags */
enum r5dev_flags {
R5_UPTODATE, /* page contains current data */
R5_LOCKED, /* IO has been submitted on "req" */
R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
R5_OVERWRITE, /* towrite covers whole page */
/* and some that are internal to handle_stripe */
#define R5_Insync 3 /* rdev && rdev->in_sync at start */
#define R5_Wantread 4 /* want to schedule a read */
#define R5_Wantwrite 5
#define R5_Overlap 7 /* There is a pending overlapping request on this block */
#define R5_ReadError 8 /* seen a read error here recently */
#define R5_ReWrite 9 /* have tried to over-write the readerror */
R5_Insync, /* rdev && rdev->in_sync at start */
R5_Wantread, /* want to schedule a read */
R5_Wantwrite,
R5_Overlap, /* There is a pending overlapping request
* on this block */
R5_ReadError, /* seen a read error here recently */
R5_ReWrite, /* have tried to over-write the readerror */
#define R5_Expanded 10 /* This block now has post-expand data */
#define R5_Wantcompute 11 /* compute_block in progress treat as
* uptodate
*/
#define R5_Wantfill 12 /* dev->toread contains a bio that needs
* filling
*/
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
#define R5_WantFUA 14 /* Write should be FUA */
#define R5_WriteError 15 /* got a write error - need to record it */
#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/
/*
* Write method
*/
#define RECONSTRUCT_WRITE 1
#define READ_MODIFY_WRITE 2
/* not a write method, but a compute_parity mode */
#define CHECK_PARITY 3
/* Additional compute_parity mode -- updates the parity w/o LOCKING */
#define UPDATE_PARITY 4
R5_Expanded, /* This block now has post-expand data */
R5_Wantcompute, /* compute_block in progress treat as
* uptodate
*/
R5_Wantfill, /* dev->toread contains a bio that needs
* filling
*/
R5_Wantdrain, /* dev->towrite needs to be drained */
R5_WantFUA, /* Write should be FUA */
R5_WriteError, /* got a write error - need to record it */
R5_MadeGood, /* A bad block has been fixed by writing to it */
R5_ReadRepl, /* Will/did read from replacement rather than orig */
R5_MadeGoodRepl,/* A bad block on the replacement device has been
* fixed by writing to it */
R5_NeedReplace, /* This device has a replacement which is not
* up-to-date at this stripe. */
R5_WantReplace, /* We need to update the replacement, we have read
* data in, and now is a good time to write it out.
*/
};
/*
* Stripe state
......@@ -311,13 +323,14 @@ enum {
/*
* Operation request flags
*/
#define STRIPE_OP_BIOFILL 0
#define STRIPE_OP_COMPUTE_BLK 1
#define STRIPE_OP_PREXOR 2
#define STRIPE_OP_BIODRAIN 3
#define STRIPE_OP_RECONSTRUCT 4
#define STRIPE_OP_CHECK 5
enum {
STRIPE_OP_BIOFILL,
STRIPE_OP_COMPUTE_BLK,
STRIPE_OP_PREXOR,
STRIPE_OP_BIODRAIN,
STRIPE_OP_RECONSTRUCT,
STRIPE_OP_CHECK,
};
/*
* Plugging:
*
......@@ -344,13 +357,12 @@ enum {
struct disk_info {
struct md_rdev *rdev;
struct md_rdev *rdev, *replacement;
};
struct r5conf {
struct hlist_head *stripe_hashtbl;
struct mddev *mddev;
struct disk_info *spare;
int chunk_sectors;
int level, algorithm;
int max_degraded;
......
......@@ -277,7 +277,10 @@ struct mdp_superblock_1 {
*/
#define MD_FEATURE_RESHAPE_ACTIVE 4
#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
#define MD_FEATURE_ALL (1|2|4|8)
#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an
* active device with same 'role'.
* 'recovery_offset' is also set.
*/
#define MD_FEATURE_ALL (1|2|4|8|16)
#endif
......@@ -132,7 +132,7 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
PROT_READ|PROT_WRITE, \
MAP_PRIVATE|MAP_ANONYMOUS,\
0, 0))
# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE)
# define free_pages(x, y) munmap((void *)(x), PAGE_SIZE << (y))
static inline void cpu_relax(void)
{
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册