Commit 4b382d06 authored by Linus Torvalds

Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md:
  md: allow resync_start to be set while an array is active.
  md/raid10:  reformat some loops with less indenting.
  md/raid10: remove unused variable.
  md/raid10: make more use of 'slot' in raid10d.
  md/raid10: some tidying up in fix_read_error
  md/raid1: improve handling of pages allocated for write-behind.
  md/raid1: try fix_sync_read_error before process_checks.
  md/raid1: tidy up new functions: process_checks and fix_sync_read_error.
  md/raid1: split out two sub-functions from sync_request_write
  md: make error_handler functions more uniform and correct.
  md/multipath: discard ->working_disks in favour of ->degraded
  md/raid1: clean up read_balance.
  md: simplify raid10 read_balance
  md/bitmap: fix saving of events_cleared and other state.
  md: reject a re-add request that cannot be honoured.
  md: Fix race when creating a new md device.
drivers/md/bitmap.c
@@ -493,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
-	if (bitmap->mddev->events < bitmap->events_cleared) {
+	if (bitmap->mddev->events < bitmap->events_cleared)
 		/* rocking back to read-only */
 		bitmap->events_cleared = bitmap->mddev->events;
-		sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
-	}
+	sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+	sb->state = cpu_to_le32(bitmap->flags);
 	/* Just in case these have been changed via sysfs: */
 	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
 	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
@@ -618,7 +618,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		bitmap->flags |= BITMAP_HOSTENDIAN;
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
-	if (sb->state & cpu_to_le32(BITMAP_STALE))
+	if (bitmap->flags & BITMAP_STALE)
 		bitmap->events_cleared = bitmap->mddev->events;
 	err = 0;
 out:
@@ -652,9 +652,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 	switch (op) {
 	case MASK_SET:
 		sb->state |= cpu_to_le32(bits);
+		bitmap->flags |= bits;
 		break;
 	case MASK_UNSET:
 		sb->state &= cpu_to_le32(~bits);
+		bitmap->flags &= ~bits;
 		break;
 	default:
 		BUG();
drivers/md/md.c
@@ -3324,7 +3324,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
 	char *e;
 	unsigned long long n = simple_strtoull(buf, &e, 10);
 
-	if (mddev->pers)
+	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
 		return -EBUSY;
 	if (cmd_match(buf, "none"))
 		n = MaxSector;
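The relaxed check above means resync_start can now be rewritten on an active array as long as recovery is frozen. A minimal userspace model of that gate (array_state, store_resync_start and the field names are invented for illustration; this is not the kernel code):

/* Hypothetical userspace model of the relaxed resync_start gate.
 * An active array only rejects the write when recovery is NOT frozen. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct array_state {
	int active;          /* models mddev->pers != NULL     */
	int recovery_frozen; /* models the MD_RECOVERY_FROZEN bit */
	unsigned long long resync_start;
};

static int store_resync_start(struct array_state *a, const char *buf)
{
	char *e;
	unsigned long long n = strtoull(buf, &e, 10);

	if (a->active && !a->recovery_frozen)
		return -EBUSY;	/* the old code rejected any active array */
	if (e == buf || (*e != '\0' && *e != '\n'))
		return -EINVAL;
	a->resync_start = n;
	return 0;
}

int main(void)
{
	struct array_state a = { .active = 1, .recovery_frozen = 1 };
	printf("%d\n", store_resync_start(&a, "1024")); /* 0: frozen, allowed */
	a.recovery_frozen = 0;
	printf("%d\n", store_resync_start(&a, "1024")); /* -16: EBUSY */
	return 0;
}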
@@ -4347,13 +4347,19 @@ static int md_alloc(dev_t dev, char *name)
 	disk->fops = &md_fops;
 	disk->private_data = mddev;
 	disk->queue = mddev->queue;
+	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
 	/* Allow extended partitions.  This makes the
 	 * 'mdp' device redundant, but we can't really
 	 * remove it now.
 	 */
 	disk->flags |= GENHD_FL_EXT_DEVT;
-	add_disk(disk);
 	mddev->gendisk = disk;
+	/* As soon as we call add_disk(), another thread could get
+	 * through to md_open, so make sure it doesn't get too far
+	 */
+	mutex_lock(&mddev->open_mutex);
+	add_disk(disk);
+
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
 				     &disk_to_dev(disk)->kobj, "%s", "md");
 	if (error) {
@@ -4367,8 +4373,7 @@ static int md_alloc(dev_t dev, char *name)
 	if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
 		printk(KERN_DEBUG "pointless warning\n");
-
-	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
+	mutex_unlock(&mddev->open_mutex);
  abort:
 	mutex_unlock(&disks_mutex);
 	if (!error && mddev->kobj.sd) {
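The comment in the hunk above is the heart of the race fix: the device must not be usable through md_open until setup is complete, so add_disk() is now called with open_mutex already held. A rough userspace sketch of the same publish-under-lock pattern (all names here are invented; pthreads stand in for kernel mutexes):

/* Illustrative userspace model of the md_alloc fix (not kernel code):
 * publish the device only while holding the same mutex that open()
 * takes, so a concurrent opener blocks until setup has finished. */
#include <pthread.h>
#include <stdio.h>

struct device {
	pthread_mutex_t open_mutex;
	int fully_initialised;
};

static struct device *registry;		/* what openers can find */

static void device_publish(struct device *dev)
{
	pthread_mutex_lock(&dev->open_mutex);
	registry = dev;			/* like add_disk(): now findable */
	dev->fully_initialised = 1;	/* like the kobject/sysfs setup */
	pthread_mutex_unlock(&dev->open_mutex);
}

static int device_open(void)
{
	struct device *dev = registry;	/* like md_open via the gendisk */
	if (!dev)
		return -1;
	pthread_mutex_lock(&dev->open_mutex);	/* waits out device_publish() */
	int ok = dev->fully_initialised;
	pthread_mutex_unlock(&dev->open_mutex);
	return ok ? 0 : -1;
}

int main(void)
{
	struct device d = { PTHREAD_MUTEX_INITIALIZER, 0 };
	device_publish(&d);
	printf("open: %d\n", device_open());
	return 0;
}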
@@ -5211,6 +5216,16 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		} else
 			super_types[mddev->major_version].
 				validate_super(mddev, rdev);
+		if ((info->state & (1<<MD_DISK_SYNC)) &&
+		    (!test_bit(In_sync, &rdev->flags) ||
+		     rdev->raid_disk != info->raid_disk)) {
+			/* This was a hot-add request, but events doesn't
+			 * match, so reject it.
+			 */
+			export_rdev(rdev);
+			return -EINVAL;
+		}
+
 		if (test_bit(In_sync, &rdev->flags))
 			rdev->saved_raid_disk = rdev->raid_disk;
 		else
drivers/md/multipath.c
@@ -146,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
 	int i;
 
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
-		    conf->working_disks);
+		    conf->raid_disks - mddev->degraded);
 	for (i = 0; i < conf->raid_disks; i++)
 		seq_printf (seq, "%s",
 			    conf->multipaths[i].rdev &&
@@ -186,8 +186,9 @@ static int multipath_congested(void *data, int bits)
 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	multipath_conf_t *conf = mddev->private;
+	char b[BDEVNAME_SIZE];
 
-	if (conf->working_disks <= 1) {
+	if (conf->raid_disks - mddev->degraded <= 1) {
 		/*
 		 * Uh oh, we can do nothing if this is our last path, but
 		 * first check if this is a queued request for a device
@@ -196,25 +197,25 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 		printk(KERN_ALERT
 		       "multipath: only one IO path left and IO error.\n");
 		/* leave it active... it's all we have */
-	} else {
+		return;
+	}
 	/*
 	 * Mark disk as unusable
 	 */
-	if (!test_bit(Faulty, &rdev->flags)) {
-		char b[BDEVNAME_SIZE];
-		clear_bit(In_sync, &rdev->flags);
+	if (test_and_clear_bit(In_sync, &rdev->flags)) {
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		mddev->degraded++;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+	}
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	conf->working_disks--;
-	mddev->degraded++;
 	printk(KERN_ALERT "multipath: IO failure on %s,"
 	       " disabling IO path.\n"
 	       "multipath: Operation continuing"
 	       " on %d IO paths.\n",
-	       bdevname (rdev->bdev,b),
-	       conf->working_disks);
-		}
-	}
+	       bdevname(rdev->bdev, b),
+	       conf->raid_disks - mddev->degraded);
 }
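The reworked handler shows the uniform error_handler shape this series aims for: test_and_clear_bit(In_sync) picks exactly one winner, and only that caller updates the degraded count, under device_lock. A small userspace model of the idiom (path_state and mark_path_faulty are invented names; C11 atomics stand in for kernel bit ops):

/* Userspace sketch of the error_handler pattern.  Only the caller that
 * actually clears the "in_sync" flag bumps the degraded count, so
 * concurrent failures of the same path are counted exactly once. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct path_state {
	atomic_int in_sync;	/* like the In_sync rdev flag */
	atomic_int faulty;	/* like the Faulty rdev flag  */
};

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static int degraded;	/* like mddev->degraded, guarded by device_lock */

static void mark_path_faulty(struct path_state *p)
{
	/* test_and_clear_bit(): atomically fetch the old value */
	if (atomic_exchange(&p->in_sync, 0)) {
		pthread_mutex_lock(&device_lock);
		degraded++;
		pthread_mutex_unlock(&device_lock);
	}
	atomic_store(&p->faulty, 1);	/* safe to set repeatedly */
}

int main(void)
{
	struct path_state p = { 1, 0 };
	mark_path_faulty(&p);
	mark_path_faulty(&p);	/* second call must not double-count */
	printf("degraded=%d faulty=%d\n", degraded, atomic_load(&p.faulty));
	return 0;
}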
@@ -227,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
 		printk("(conf==NULL)\n");
 		return;
 	}
-	printk(" --- wd:%d rd:%d\n", conf->working_disks,
+	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
 	       conf->raid_disks);
 
 	for (i = 0; i < conf->raid_disks; i++) {
@@ -274,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 					   PAGE_CACHE_SIZE - 1);
 			}
 
-			conf->working_disks++;
+			spin_lock_irq(&conf->device_lock);
 			mddev->degraded--;
 			rdev->raid_disk = path;
 			set_bit(In_sync, &rdev->flags);
+			spin_unlock_irq(&conf->device_lock);
 			rcu_assign_pointer(p->rdev, rdev);
 			err = 0;
 			md_integrity_add_rdev(rdev, mddev);
@@ -391,6 +393,7 @@ static int multipath_run (mddev_t *mddev)
 	int disk_idx;
 	struct multipath_info *disk;
 	mdk_rdev_t *rdev;
+	int working_disks;
 
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
@@ -424,7 +427,7 @@ static int multipath_run (mddev_t *mddev)
 		goto out_free_conf;
 	}
 
-	conf->working_disks = 0;
+	working_disks = 0;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		disk_idx = rdev->raid_disk;
 		if (disk_idx < 0 ||
@@ -446,7 +449,7 @@ static int multipath_run (mddev_t *mddev)
 		}
 
 		if (!test_bit(Faulty, &rdev->flags))
-			conf->working_disks++;
+			working_disks++;
 	}
 
 	conf->raid_disks = mddev->raid_disks;
@@ -454,12 +457,12 @@ static int multipath_run (mddev_t *mddev)
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
 
-	if (!conf->working_disks) {
+	if (!working_disks) {
 		printk(KERN_ERR "multipath: no operational IO paths for %s\n",
 		       mdname(mddev));
 		goto out_free_conf;
 	}
-	mddev->degraded = conf->raid_disks - conf->working_disks;
+	mddev->degraded = conf->raid_disks - working_disks;
 
 	conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
 						 sizeof(struct multipath_bh));
@@ -481,7 +484,8 @@ static int multipath_run (mddev_t *mddev)
 	printk(KERN_INFO
 	       "multipath: array %s active with %d out of %d IO paths\n",
-	       mdname(mddev), conf->working_disks, mddev->raid_disks);
+	       mdname(mddev), conf->raid_disks - mddev->degraded,
+	       mddev->raid_disks);
 	/*
 	 * Ok, everything is just fine now
 	 */
drivers/md/multipath.h
@@ -9,7 +9,6 @@ struct multipath_private_data {
 	mddev_t			*mddev;
 	struct multipath_info	*multipaths;
 	int			raid_disks;
-	int			working_disks;
 	spinlock_t		device_lock;
 	struct list_head	retry_list;
drivers/md/raid1.c
@@ -297,23 +297,24 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
-static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
-			      int behind)
+static void r1_bio_write_done(r1bio_t *r1_bio)
 {
 	if (atomic_dec_and_test(&r1_bio->remaining))
 	{
 		/* it really is the end of this request */
 		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 			/* free extra copy of the data pages */
-			int i = vcnt;
+			int i = r1_bio->behind_page_count;
 			while (i--)
-				safe_put_page(bv[i].bv_page);
+				safe_put_page(r1_bio->behind_pages[i]);
+			kfree(r1_bio->behind_pages);
+			r1_bio->behind_pages = NULL;
 		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
 				r1_bio->sectors,
 				!test_bit(R1BIO_Degraded, &r1_bio->state),
-				behind);
+				test_bit(R1BIO_BehindIO, &r1_bio->state));
 		md_write_end(r1_bio->mddev);
 		raid_end_bio_io(r1_bio);
 	}
@@ -386,7 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
-	r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
+	r1_bio_write_done(r1_bio);
 
 	if (to_put)
 		bio_put(to_put);
@@ -411,10 +412,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
 	const sector_t this_sector = r1_bio->sector;
 	const int sectors = r1_bio->sectors;
-	int new_disk = -1;
 	int start_disk;
+	int best_disk;
 	int i;
-	sector_t new_distance, current_distance;
+	sector_t best_dist;
 	mdk_rdev_t *rdev;
 	int choose_first;
@@ -425,6 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 	 * We take the first readable disk when above the resync window.
 	 */
  retry:
+	best_disk = -1;
+	best_dist = MaxSector;
 	if (conf->mddev->recovery_cp < MaxSector &&
 	    (this_sector + sectors >= conf->next_resync)) {
 		choose_first = 1;
@@ -434,8 +437,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		start_disk = conf->last_used;
 	}
 
-	/* make sure the disk is operational */
 	for (i = 0 ; i < conf->raid_disks ; i++) {
+		sector_t dist;
 		int disk = start_disk + i;
 		if (disk >= conf->raid_disks)
 			disk -= conf->raid_disks;
@@ -443,60 +446,43 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
 		    || rdev == NULL
-		    || !test_bit(In_sync, &rdev->flags))
+		    || test_bit(Faulty, &rdev->flags))
+			continue;
+		if (!test_bit(In_sync, &rdev->flags) &&
+		    rdev->recovery_offset < this_sector + sectors)
+			continue;
+		if (test_bit(WriteMostly, &rdev->flags)) {
+			/* Don't balance among write-mostly, just
+			 * use the first as a last resort */
+			if (best_disk < 0)
+				best_disk = disk;
 			continue;
-
-		new_disk = disk;
-		if (!test_bit(WriteMostly, &rdev->flags))
-			break;
 		}
-
-	if (new_disk < 0 || choose_first)
-		goto rb_out;
-
-	/*
-	 * Don't change to another disk for sequential reads:
+		/* This is a reasonable device to use.  It might
+		 * even be best.
 		 */
-	if (conf->next_seq_sect == this_sector)
-		goto rb_out;
-	if (this_sector == conf->mirrors[new_disk].head_position)
-		goto rb_out;
-
-	current_distance = abs(this_sector
-			       - conf->mirrors[new_disk].head_position);
-
-	/* look for a better disk - i.e. head is closer */
-	start_disk = new_disk;
-	for (i = 1; i < conf->raid_disks; i++) {
-		int disk = start_disk + 1;
-		if (disk >= conf->raid_disks)
-			disk -= conf->raid_disks;
-
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		if (r1_bio->bios[disk] == IO_BLOCKED
-		    || rdev == NULL
-		    || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(WriteMostly, &rdev->flags))
-			continue;
-
-		if (!atomic_read(&rdev->nr_pending)) {
-			new_disk = disk;
+		dist = abs(this_sector - conf->mirrors[disk].head_position);
+		if (choose_first
+		    /* Don't change to another disk for sequential reads */
+		    || conf->next_seq_sect == this_sector
+		    || dist == 0
+		    /* If device is idle, use it */
+		    || atomic_read(&rdev->nr_pending) == 0) {
+			best_disk = disk;
 			break;
 		}
-		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
-		if (new_distance < current_distance) {
-			current_distance = new_distance;
-			new_disk = disk;
+		if (dist < best_dist) {
+			best_dist = dist;
+			best_disk = disk;
 		}
 	}
 
-rb_out:
-	if (new_disk >= 0) {
-		rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+	if (best_disk >= 0) {
+		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
 			goto retry;
 		atomic_inc(&rdev->nr_pending);
-		if (!test_bit(In_sync, &rdev->flags)) {
+		if (test_bit(Faulty, &rdev->flags)) {
 			/* cannot risk returning a device that failed
 			 * before we inc'ed nr_pending
 			 */
@@ -504,11 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 			goto retry;
 		}
 		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = new_disk;
+		conf->last_used = best_disk;
 	}
 	rcu_read_unlock();
 
-	return new_disk;
+	return best_disk;
 }
 
 static int raid1_congested(void *data, int bits)
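The rewritten read_balance above replaces the two-pass scan with a single loop: skip unusable disks, remember a write-mostly disk only as a last resort, take a sequential, zero-seek or idle disk immediately, and otherwise keep the candidate with the shortest head distance. An illustrative userspace reduction of that selection loop (struct mirror and choose_mirror are invented names; resync handling is omitted):

/* Userspace sketch of the single-pass selection in the new read_balance. */
#include <stdio.h>
#include <stdlib.h>

struct mirror {
	int in_sync;
	int write_mostly;
	int nr_pending;			/* in-flight IOs */
	long long head_position;	/* last known head sector */
};

static int choose_mirror(struct mirror *m, int ndisks,
			 long long this_sector, long long next_seq_sect)
{
	int best_disk = -1;
	long long best_dist = -1;

	for (int disk = 0; disk < ndisks; disk++) {
		if (!m[disk].in_sync)
			continue;
		if (m[disk].write_mostly) {
			/* only use write-mostly as a last resort */
			if (best_disk < 0)
				best_disk = disk;
			continue;
		}
		long long dist = llabs(this_sector - m[disk].head_position);
		/* sequential read, zero seek, or idle disk: take it now */
		if (next_seq_sect == this_sector || dist == 0 ||
		    m[disk].nr_pending == 0)
			return disk;
		if (best_dist < 0 || dist < best_dist) {
			best_dist = dist;
			best_disk = disk;
		}
	}
	return best_disk;
}

int main(void)
{
	struct mirror m[2] = {
		{ 1, 0, 3, 1000 },	/* busy, far away */
		{ 1, 0, 2,  100 },	/* busy, but closer */
	};
	printf("chose disk %d\n", choose_mirror(m, 2, 128, 0)); /* disk 1 */
	return 0;
}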
@@ -675,37 +661,36 @@ static void unfreeze_array(conf_t *conf)
 
 /* duplicate the data pages for behind I/O
- * We return a list of bio_vec rather than just page pointers
- * as it makes freeing easier
  */
-static struct bio_vec *alloc_behind_pages(struct bio *bio)
+static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+	struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
 					GFP_NOIO);
 	if (unlikely(!pages))
-		goto do_sync_io;
+		return;
 
 	bio_for_each_segment(bvec, bio, i) {
-		pages[i].bv_page = alloc_page(GFP_NOIO);
-		if (unlikely(!pages[i].bv_page))
+		pages[i] = alloc_page(GFP_NOIO);
+		if (unlikely(!pages[i]))
			goto do_sync_io;
-		memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
+		memcpy(kmap(pages[i]) + bvec->bv_offset,
 		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-		kunmap(pages[i].bv_page);
+		kunmap(pages[i]);
 		kunmap(bvec->bv_page);
 	}
-
-	return pages;
+	r1_bio->behind_pages = pages;
+	r1_bio->behind_page_count = bio->bi_vcnt;
+	set_bit(R1BIO_BehindIO, &r1_bio->state);
+	return;
 
 do_sync_io:
-	if (pages)
-		for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
-			put_page(pages[i].bv_page);
+	for (i = 0; i < bio->bi_vcnt; i++)
+		if (pages[i])
+			put_page(pages[i]);
 	kfree(pages);
 	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
-	return NULL;
 }
 
 static int make_request(mddev_t *mddev, struct bio * bio)
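alloc_behind_pages() now records a plain array of page pointers in the r1bio, and on any allocation failure it releases everything and leaves the bio unmarked, so the write quietly falls back to synchronous behaviour. A userspace sketch of that all-or-nothing duplication pattern (all names invented):

/* Userspace sketch of the duplicate-or-fall-back pattern: either every
 * segment is copied and the request is marked for write-behind, or
 * everything is freed and the caller falls back to a synchronous write. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct request {
	int nsegs;
	char **segs;		/* original data segments */
	size_t seg_len;
	char **behind;		/* private copies, NULL => sync write */
	int behind_count;
};

static void alloc_behind_copies(struct request *rq)
{
	char **pages = calloc(rq->nsegs, sizeof(*pages));
	if (!pages)
		return;			/* no mark set: sync write */

	for (int i = 0; i < rq->nsegs; i++) {
		pages[i] = malloc(rq->seg_len);
		if (!pages[i])
			goto do_sync_io;
		memcpy(pages[i], rq->segs[i], rq->seg_len);
	}
	rq->behind = pages;
	rq->behind_count = rq->nsegs;
	return;

do_sync_io:
	for (int i = 0; i < rq->nsegs; i++)
		free(pages[i]);		/* free(NULL) is a no-op */
	free(pages);
}

int main(void)
{
	char a[4096] = "data", b[4096] = "more";
	char *segs[] = { a, b };
	struct request rq = { 2, segs, sizeof(a), NULL, 0 };
	alloc_behind_copies(&rq);
	printf("write-behind: %s\n", rq.behind ? "yes" : "no (sync)");
	return 0;
}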
@@ -717,7 +702,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	int i, targets = 0, disks;
 	struct bitmap *bitmap;
 	unsigned long flags;
-	struct bio_vec *behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
@@ -870,9 +854,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	if (bitmap &&
 	    (atomic_read(&bitmap->behind_writes)
 	     < mddev->bitmap_info.max_write_behind) &&
-	    !waitqueue_active(&bitmap->behind_wait) &&
-	    (behind_pages = alloc_behind_pages(bio)) != NULL)
-		set_bit(R1BIO_BehindIO, &r1_bio->state);
+	    !waitqueue_active(&bitmap->behind_wait))
+		alloc_behind_pages(bio, r1_bio);
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
@@ -893,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
 		mbio->bi_private = r1_bio;
 
-		if (behind_pages) {
+		if (r1_bio->behind_pages) {
 			struct bio_vec *bvec;
 			int j;
@@ -905,7 +888,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			 * them all
 			 */
 			__bio_for_each_segment(bvec, mbio, j, 0)
-				bvec->bv_page = behind_pages[j].bv_page;
+				bvec->bv_page = r1_bio->behind_pages[j];
 			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
 				atomic_inc(&r1_bio->behind_remaining);
 		}
@@ -915,8 +898,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		bio_list_add(&conf->pending_bio_list, mbio);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
-	r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
-	kfree(behind_pages); /* the behind pages are attached to the bios now */
+	r1_bio_write_done(r1_bio);
 
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
@@ -1196,101 +1178,9 @@ static void end_sync_write(struct bio *bio, int error)
 	}
 }
 
-static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+static int fix_sync_read_error(r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev->private;
-	int i;
-	int disks = conf->raid_disks;
-	struct bio *bio, *wbio;
-
-	bio = r1_bio->bios[r1_bio->read_disk];
-
-	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
-		/* We have read all readable devices. If we haven't
-		 * got the block, then there is no hope left.
-		 * If we have, then we want to do a comparison
-		 * and skip the write if everything is the same.
-		 * If any blocks failed to read, then we need to
-		 * attempt an over-write
-		 */
-		int primary;
-		if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-			for (i=0; i<mddev->raid_disks; i++)
-				if (r1_bio->bios[i]->bi_end_io == end_sync_read)
-					md_error(mddev, conf->mirrors[i].rdev);
-			md_done_sync(mddev, r1_bio->sectors, 1);
-			put_buf(r1_bio);
-			return;
-		}
-		for (primary=0; primary<mddev->raid_disks; primary++)
-			if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
-			    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
-				r1_bio->bios[primary]->bi_end_io = NULL;
-				rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
-				break;
-			}
-		r1_bio->read_disk = primary;
-		for (i=0; i<mddev->raid_disks; i++)
-			if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
-				int j;
-				int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
-				struct bio *pbio = r1_bio->bios[primary];
-				struct bio *sbio = r1_bio->bios[i];
-
-				if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
-					for (j = vcnt; j-- ; ) {
-						struct page *p, *s;
-						p = pbio->bi_io_vec[j].bv_page;
-						s = sbio->bi_io_vec[j].bv_page;
-						if (memcmp(page_address(p),
-							   page_address(s),
-							   PAGE_SIZE))
-							break;
-					}
-				} else
-					j = 0;
-				if (j >= 0)
-					mddev->resync_mismatches += r1_bio->sectors;
-				if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
-					      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
-					sbio->bi_end_io = NULL;
-					rdev_dec_pending(conf->mirrors[i].rdev, mddev);
-				} else {
-					/* fixup the bio for reuse */
-					int size;
-					sbio->bi_vcnt = vcnt;
-					sbio->bi_size = r1_bio->sectors << 9;
-					sbio->bi_idx = 0;
-					sbio->bi_phys_segments = 0;
-					sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-					sbio->bi_flags |= 1 << BIO_UPTODATE;
-					sbio->bi_next = NULL;
-					sbio->bi_sector = r1_bio->sector +
-						conf->mirrors[i].rdev->data_offset;
-					sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-					size = sbio->bi_size;
-					for (j = 0; j < vcnt ; j++) {
-						struct bio_vec *bi;
-						bi = &sbio->bi_io_vec[j];
-						bi->bv_offset = 0;
-						if (size > PAGE_SIZE)
-							bi->bv_len = PAGE_SIZE;
-						else
-							bi->bv_len = size;
-						size -= PAGE_SIZE;
-						memcpy(page_address(bi->bv_page),
-						       page_address(pbio->bi_io_vec[j].bv_page),
-						       PAGE_SIZE);
-					}
-				}
-			}
-	}
-	if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-		/* ouch - failed to read all of that.
-		 * Try some synchronous reads of other devices to get
+	/* Try some synchronous reads of other devices to get
 	 * good data, much like with normal read errors.  Only
 	 * read into the pages we already have so we don't
 	 * need to re-issue the read request.
@@ -1298,6 +1188,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 	 * active sync request, there is no normal IO, and
 	 * no overlapping syncs.
 	 */
+	mddev_t *mddev = r1_bio->mddev;
+	conf_t *conf = mddev->private;
+	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
 	sector_t sect = r1_bio->sector;
 	int sectors = r1_bio->sectors;
 	int idx = 0;
@@ -1307,6 +1200,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 		int d = r1_bio->read_disk;
 		int success = 0;
 		mdk_rdev_t *rdev;
+		int start;
 
 		if (s > (PAGE_SIZE>>9))
 			s = PAGE_SIZE >> 9;
@@ -1331,10 +1225,22 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 				d = 0;
 		} while (!success && d != r1_bio->read_disk);
 
-		if (success) {
-			int start = d;
+		if (!success) {
+			char b[BDEVNAME_SIZE];
+			/* Cannot read from anywhere, array is toast */
+			md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+			printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
+			       " for block %llu\n",
+			       mdname(mddev),
+			       bdevname(bio->bi_bdev, b),
+			       (unsigned long long)r1_bio->sector);
+			md_done_sync(mddev, r1_bio->sectors, 0);
+			put_buf(r1_bio);
+			return 0;
+		}
+
+		start = d;
 		/* write it back and re-read */
-		set_bit(R1BIO_Uptodate, &r1_bio->state);
 		while (d != r1_bio->read_disk) {
 			if (d == 0)
 				d = conf->raid_disks;
@@ -1342,13 +1248,16 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 				continue;
 			rdev = conf->mirrors[d].rdev;
-			atomic_add(s, &rdev->corrected_errors);
 			if (sync_page_io(rdev,
 					 sect,
 					 s<<9,
 					 bio->bi_io_vec[idx].bv_page,
-					 WRITE, false) == 0)
+					 WRITE, false) == 0) {
+				r1_bio->bios[d]->bi_end_io = NULL;
+				rdev_dec_pending(rdev, mddev);
 				md_error(mddev, rdev);
+			} else
+				atomic_add(s, &rdev->corrected_errors);
 		}
 		d = start;
 		while (d != r1_bio->read_disk) {
@@ -1365,25 +1274,114 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 					 READ, false) == 0)
 				md_error(mddev, rdev);
 		}
-		} else {
-			char b[BDEVNAME_SIZE];
-			/* Cannot read from anywhere, array is toast */
-			md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
-			printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
-			       " for block %llu\n",
-			       mdname(mddev),
-			       bdevname(bio->bi_bdev, b),
-			       (unsigned long long)r1_bio->sector);
-			md_done_sync(mddev, r1_bio->sectors, 0);
-			put_buf(r1_bio);
-			return;
-		}
 		sectors -= s;
 		sect += s;
 		idx ++;
 	}
+	set_bit(R1BIO_Uptodate, &r1_bio->state);
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	return 1;
+}
+
+static int process_checks(r1bio_t *r1_bio)
+{
+	/* We have read all readable devices.  If we haven't
+	 * got the block, then there is no hope left.
+	 * If we have, then we want to do a comparison
+	 * and skip the write if everything is the same.
+	 * If any blocks failed to read, then we need to
+	 * attempt an over-write
+	 */
+	mddev_t *mddev = r1_bio->mddev;
+	conf_t *conf = mddev->private;
+	int primary;
+	int i;
+
+	for (primary = 0; primary < conf->raid_disks; primary++)
+		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+		    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+			r1_bio->bios[primary]->bi_end_io = NULL;
+			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+			break;
+		}
+	r1_bio->read_disk = primary;
+	for (i = 0; i < conf->raid_disks; i++) {
+		int j;
+		int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
+		struct bio *pbio = r1_bio->bios[primary];
+		struct bio *sbio = r1_bio->bios[i];
+		int size;
+
+		if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+			continue;
+
+		if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+			for (j = vcnt; j-- ; ) {
+				struct page *p, *s;
+				p = pbio->bi_io_vec[j].bv_page;
+				s = sbio->bi_io_vec[j].bv_page;
+				if (memcmp(page_address(p),
+					   page_address(s),
+					   PAGE_SIZE))
+					break;
+			}
+		} else
+			j = 0;
+		if (j >= 0)
+			mddev->resync_mismatches += r1_bio->sectors;
+		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+			      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+			/* No need to write to this device. */
+			sbio->bi_end_io = NULL;
+			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+			continue;
+		}
+		/* fixup the bio for reuse */
+		sbio->bi_vcnt = vcnt;
+		sbio->bi_size = r1_bio->sectors << 9;
+		sbio->bi_idx = 0;
+		sbio->bi_phys_segments = 0;
+		sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+		sbio->bi_flags |= 1 << BIO_UPTODATE;
+		sbio->bi_next = NULL;
+		sbio->bi_sector = r1_bio->sector +
+			conf->mirrors[i].rdev->data_offset;
+		sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+		size = sbio->bi_size;
+		for (j = 0; j < vcnt ; j++) {
+			struct bio_vec *bi;
+			bi = &sbio->bi_io_vec[j];
+			bi->bv_offset = 0;
+			if (size > PAGE_SIZE)
+				bi->bv_len = PAGE_SIZE;
+			else
+				bi->bv_len = size;
+			size -= PAGE_SIZE;
+			memcpy(page_address(bi->bv_page),
+			       page_address(pbio->bi_io_vec[j].bv_page),
+			       PAGE_SIZE);
+		}
+	}
+	return 0;
+}
+
+static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+{
+	conf_t *conf = mddev->private;
+	int i;
+	int disks = conf->raid_disks;
+	struct bio *bio, *wbio;
+
+	bio = r1_bio->bios[r1_bio->read_disk];
+
+	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+		/* ouch - failed to read all of that. */
+		if (!fix_sync_read_error(r1_bio))
+			return;
+
+	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		if (process_checks(r1_bio) < 0)
+			return;
 	/*
 	 * schedule writes
 	 */
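After the split, sync_request_write() is reduced to a driver: repair unreadable blocks via fix_sync_read_error(), then, for a requested check/repair pass, let process_checks() pick the first good copy and schedule over-writes only for copies that failed or mismatch. A simplified userspace model of the comparison step (types and names invented):

/* Userspace sketch of the comparison inside process_checks(): pick the
 * first readable copy as primary, then for every other copy either drop
 * it (identical) or schedule it to be over-written with primary data. */
#include <stdio.h>
#include <string.h>

#define NCOPIES 3
#define BLK 4096

/* returns a bitmask of copies that must be rewritten from 'primary' */
static unsigned compare_copies(char copies[NCOPIES][BLK], int ok[NCOPIES],
			       int *primary_out)
{
	unsigned rewrite = 0;
	int primary = -1;

	for (int i = 0; i < NCOPIES; i++)
		if (ok[i]) {		/* first readable copy wins */
			primary = i;
			break;
		}
	*primary_out = primary;
	if (primary < 0)
		return 0;		/* nothing readable: hopeless */

	for (int i = 0; i < NCOPIES; i++) {
		if (i == primary)
			continue;
		if (ok[i] && memcmp(copies[i], copies[primary], BLK) == 0)
			continue;	/* identical: no write needed */
		rewrite |= 1u << i;	/* failed read or mismatch */
	}
	return rewrite;
}

int main(void)
{
	char copies[NCOPIES][BLK] = { "aaa", "aaa", "bbb" };
	int ok[NCOPIES] = { 1, 1, 1 }, primary;
	unsigned m = compare_copies(copies, ok, &primary);
	printf("primary=%d rewrite mask=0x%x\n", primary, m); /* 0, 0x4 */
	return 0;
}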
@@ -2063,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
-	    mddev->recovery_cp == MaxSector) {
+	    mddev->recovery_cp > mddev->dev_sectors) {
 		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
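Note the recovery_cp condition in raid1_resize() (the same change appears in raid5_resize() further down): growth now triggers a resync of the added region whenever the old device range was already fully recovered, not only when recovery_cp was exactly MaxSector. A tiny model of the decision (MAX_SECTOR and resize_recovery_cp are invented names):

/* Userspace model of the resize decision shared by raid1_resize() and
 * raid5_resize(): when the per-device size grows, restart recovery at
 * the old end-of-device unless a resync checkpoint already lies inside
 * the old region.  MAX_SECTOR stands in for MaxSector. */
#include <stdio.h>

#define MAX_SECTOR (~0ULL)

static unsigned long long resize_recovery_cp(unsigned long long old_sectors,
					     unsigned long long new_sectors,
					     unsigned long long recovery_cp)
{
	if (new_sectors > old_sectors && recovery_cp > old_sectors)
		return old_sectors;	/* resync just the grown region */
	return recovery_cp;		/* pending resync covers it anyway */
}

int main(void)
{
	/* clean array grown from 1000 to 2000 sectors: resync from 1000 */
	printf("%llu\n", resize_recovery_cp(1000, 2000, MAX_SECTOR));
	/* resync stopped at 500: keep it, it reaches the new end too */
	printf("%llu\n", resize_recovery_cp(1000, 2000, 500));
	return 0;
}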
drivers/md/raid1.h
@@ -94,7 +94,9 @@ struct r1bio_s {
 	int			read_disk;
 
 	struct list_head	retry_list;
-	struct bitmap_update	*bitmap_update;
+	/* Next two are only valid when R1BIO_BehindIO is set */
+	struct page		**behind_pages;
+	int			behind_page_count;
 	/*
 	 * if the IO is in WRITE direction, then multiple bios are used.
 	 * We choose the number when they are allocated.
drivers/md/raid10.c
@@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		raid_end_bio_io(r10_bio);
+		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 	} else {
 		/*
-		 * oops, read error:
+		 * oops, read error - keep the refcount on the rdev
 		 */
 		char b[BDEVNAME_SIZE];
 		if (printk_ratelimit())
@@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
 			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
 		reschedule_retry(r10_bio);
 	}
-
-	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
 static void raid10_end_write_request(struct bio *bio, int error)
@@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 {
 	const sector_t this_sector = r10_bio->sector;
-	int disk, slot, nslot;
+	int disk, slot;
 	const int sectors = r10_bio->sectors;
-	sector_t new_distance, current_distance;
+	sector_t new_distance, best_dist;
 	mdk_rdev_t *rdev;
+	int do_balance;
+	int best_slot;
 
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
+retry:
+	best_slot = -1;
+	best_dist = MaxSector;
+	do_balance = 1;
 	/*
 	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on (recovery is ok), or below
@@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
-	    && (this_sector + sectors >= conf->next_resync)) {
-		/* make sure that disk is operational */
-		slot = 0;
-		disk = r10_bio->devs[slot].devnum;
-
-		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-		       r10_bio->devs[slot].bio == IO_BLOCKED ||
-		       !test_bit(In_sync, &rdev->flags)) {
-			slot++;
-			if (slot == conf->copies) {
-				slot = 0;
-				disk = -1;
-				break;
-			}
-			disk = r10_bio->devs[slot].devnum;
-		}
-		goto rb_out;
-	}
+	    && (this_sector + sectors >= conf->next_resync))
+		do_balance = 0;
 
-	/* make sure the disk is operational */
-	slot = 0;
-	disk = r10_bio->devs[slot].devnum;
-	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-	       r10_bio->devs[slot].bio == IO_BLOCKED ||
-	       !test_bit(In_sync, &rdev->flags)) {
-		slot ++;
-		if (slot == conf->copies) {
-			disk = -1;
-			goto rb_out;
-		}
+	for (slot = 0; slot < conf->copies ; slot++) {
+		if (r10_bio->devs[slot].bio == IO_BLOCKED)
+			continue;
 		disk = r10_bio->devs[slot].devnum;
-	}
-
-	current_distance = abs(r10_bio->devs[slot].addr -
-			       conf->mirrors[disk].head_position);
-
-	/* Find the disk whose head is closest,
-	 * or - for far > 1 - find the closest to partition beginning */
-	for (nslot = slot; nslot < conf->copies; nslot++) {
-		int ndisk = r10_bio->devs[nslot].devnum;
-
-		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
-		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
-		    !test_bit(In_sync, &rdev->flags))
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (rdev == NULL)
 			continue;
+		if (!test_bit(In_sync, &rdev->flags))
+			continue;
+
+		if (!do_balance)
+			break;
 
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays.  So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
-			disk = ndisk;
-			slot = nslot;
+		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
 			break;
-		}
 
 		/* for far > 1 always use the lowest address */
 		if (conf->far_copies > 1)
-			new_distance = r10_bio->devs[nslot].addr;
+			new_distance = r10_bio->devs[slot].addr;
 		else
-			new_distance = abs(r10_bio->devs[nslot].addr -
-					   conf->mirrors[ndisk].head_position);
-		if (new_distance < current_distance) {
-			current_distance = new_distance;
-			disk = ndisk;
-			slot = nslot;
+			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);
+		if (new_distance < best_dist) {
+			best_dist = new_distance;
+			best_slot = slot;
 		}
 	}
+	if (slot == conf->copies)
+		slot = best_slot;
 
-rb_out:
-	r10_bio->read_slot = slot;
-/*	conf->next_seq_sect = this_sector + sectors;*/
-
-	if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
-		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-	else
+	if (slot >= 0) {
+		disk = r10_bio->devs[slot].devnum;
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (!rdev)
+			goto retry;
+		atomic_inc(&rdev->nr_pending);
+		if (test_bit(Faulty, &rdev->flags)) {
+			/* Cannot risk returning a device that failed
+			 * before we inc'ed nr_pending
+			 */
+			rdev_dec_pending(rdev, conf->mddev);
+			goto retry;
+		}
+		r10_bio->read_slot = slot;
+	} else
 		disk = -1;
 	rcu_read_unlock();
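Like the raid1 version, this read_balance now pins the chosen device with nr_pending before re-testing Faulty, retrying the whole selection if the device failed in the window. A userspace sketch of that inc-then-recheck idiom (names invented; C11 atomics stand in for atomic_t and the bit ops):

/* Userspace sketch of the inc-then-recheck idiom: take a reference
 * first, then re-test the failure flag; if the device died in the
 * window, drop the reference and retry the selection. */
#include <stdatomic.h>
#include <stdio.h>

struct dev {
	atomic_int nr_pending;	/* references held by in-flight IO */
	atomic_int faulty;
};

static struct dev *claim_dev(struct dev *d)
{
retry:
	if (atomic_load(&d->faulty))
		return NULL;			/* don't even try */
	atomic_fetch_add(&d->nr_pending, 1);	/* pin it first... */
	if (atomic_load(&d->faulty)) {
		/* ...then re-check: it failed before we pinned it */
		atomic_fetch_sub(&d->nr_pending, 1);
		goto retry;		/* the kernel re-runs the selection */
	}
	return d;
}

int main(void)
{
	struct dev d = { 0, 0 };
	printf("claimed: %s\n", claim_dev(&d) ? "yes" : "no");
	return 0;
}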
@@ -1460,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
-	rcu_read_lock();
-	rdev = rcu_dereference(conf->mirrors[d].rdev);
-	if (rdev) { /* If rdev is not NULL */
-		char b[BDEVNAME_SIZE];
-		int cur_read_error_count = 0;
+	/* still own a reference to this rdev, so it cannot
+	 * have been cleared recently.
+	 */
+	rdev = conf->mirrors[d].rdev;
 
-		bdevname(rdev->bdev, b);
-
-		if (test_bit(Faulty, &rdev->flags)) {
-			rcu_read_unlock();
-			/* drive has already been failed, just ignore any
-			   more fix_read_error() attempts */
-			return;
-		}
+	if (test_bit(Faulty, &rdev->flags))
+		/* drive has already been failed, just ignore any
+		   more fix_read_error() attempts */
+		return;
 
-		check_decay_read_errors(mddev, rdev);
-		atomic_inc(&rdev->read_errors);
-		cur_read_error_count = atomic_read(&rdev->read_errors);
-		if (cur_read_error_count > max_read_errors) {
-			rcu_read_unlock();
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Raid device exceeded "
-			       "read_error threshold "
-			       "[cur %d:max %d]\n",
-			       mdname(mddev),
-			       b, cur_read_error_count, max_read_errors);
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Failing raid "
-			       "device\n", mdname(mddev), b);
-			md_error(mddev, conf->mirrors[d].rdev);
-			return;
-		}
+	check_decay_read_errors(mddev, rdev);
+	atomic_inc(&rdev->read_errors);
+	if (atomic_read(&rdev->read_errors) > max_read_errors) {
+		char b[BDEVNAME_SIZE];
+		bdevname(rdev->bdev, b);
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Raid device exceeded "
+		       "read_error threshold [cur %d:max %d]\n",
+		       mdname(mddev), b,
+		       atomic_read(&rdev->read_errors), max_read_errors);
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Failing raid device\n",
+		       mdname(mddev), b);
+		md_error(mddev, conf->mirrors[d].rdev);
+		return;
 	}
-	rcu_read_unlock();
 
 	while(sectors) {
 		int s = sectors;
@@ -1562,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					       "write failed"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
-					       (unsigned long long)(sect+
-					       rdev->data_offset),
+					       (unsigned long long)(
+						       sect + rdev->data_offset),
 					       bdevname(rdev->bdev, b));
 					printk(KERN_NOTICE "md/raid10:%s: %s: failing "
 					       "drive\n",
@@ -1599,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					       "corrected sectors"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
-					       (unsigned long long)(sect+
-					       rdev->data_offset),
+					       (unsigned long long)(
						       sect + rdev->data_offset),
 					       bdevname(rdev->bdev, b));
 					printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
 					       mdname(mddev),
@@ -1612,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					       "md/raid10:%s: read error corrected"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
-					       (unsigned long long)(sect+
-					       rdev->data_offset),
+					       (unsigned long long)(
						       sect + rdev->data_offset),
 					       bdevname(rdev->bdev, b));
 				}
@@ -1663,7 +1633,8 @@ static void raid10d(mddev_t *mddev)
 		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
 		else {
-			int mirror;
+			int slot = r10_bio->read_slot;
+			int mirror = r10_bio->devs[slot].devnum;
 			/* we got a read error. Maybe the drive is bad.  Maybe just
 			 * the block and we can fix it.
 			 * We freeze all other IO, and try reading the block from
@@ -1677,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
 				fix_read_error(conf, mddev, r10_bio);
 				unfreeze_array(conf);
 			}
+			rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 
-			bio = r10_bio->devs[r10_bio->read_slot].bio;
-			r10_bio->devs[r10_bio->read_slot].bio =
+			bio = r10_bio->devs[slot].bio;
+			r10_bio->devs[slot].bio =
 				mddev->ro ? IO_BLOCKED : NULL;
 			mirror = read_balance(conf, r10_bio);
 			if (mirror == -1) {
@@ -1693,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
 			} else {
 				const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
 				bio_put(bio);
+				slot = r10_bio->read_slot;
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
 					printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1702,8 +1675,8 @@ static void raid10d(mddev_t *mddev)
 					       (unsigned long long)r10_bio->sector);
 				bio = bio_clone_mddev(r10_bio->master_bio,
 						      GFP_NOIO, mddev);
-				r10_bio->devs[r10_bio->read_slot].bio = bio;
-				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
+				r10_bio->devs[slot].bio = bio;
+				bio->bi_sector = r10_bio->devs[slot].addr
 					+ rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_rw = READ | do_sync;
@@ -1763,13 +1736,13 @@ static int init_resync(conf_t *conf)
  *
  */
 
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
+			     int *skipped, int go_faster)
 {
 	conf_t *conf = mddev->private;
 	r10bio_t *r10_bio;
 	struct bio *biolist = NULL, *bio;
 	sector_t max_sector, nr_sectors;
-	int disk;
 	int i;
 	int max_sync;
 	sector_t sync_blocks;
@@ -1858,14 +1831,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 		int j, k;
 		r10_bio = NULL;
 
-		for (i=0 ; i<conf->raid_disks; i++)
-			if (conf->mirrors[i].rdev &&
-			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
-				int still_degraded = 0;
-				/* want to reconstruct this device */
-				r10bio_t *rb2 = r10_bio;
-				sector_t sect = raid10_find_virt(conf, sector_nr, i);
-				int must_sync;
+		for (i=0 ; i<conf->raid_disks; i++) {
+			int still_degraded;
+			r10bio_t *rb2;
+			sector_t sect;
+			int must_sync;
+
+			if (conf->mirrors[i].rdev == NULL ||
+			    test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+				continue;
 
+			still_degraded = 0;
+			/* want to reconstruct this device */
+			rb2 = r10_bio;
+			sect = raid10_find_virt(conf, sector_nr, i);
 			/* Unless we are doing a full sync, we only need
 			 * to recover the block if it is set in the bitmap
 			 */
@@ -1910,8 +1889,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 			for (j=0; j<conf->copies;j++) {
 				int d = r10_bio->devs[j].devnum;
-				if (conf->mirrors[d].rdev &&
-				    test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
+				if (!conf->mirrors[d].rdev ||
+				    !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+					continue;
 				/* This is where we read from */
 				bio = r10_bio->devs[0].bio;
 				bio->bi_next = biolist;
@@ -1945,7 +1925,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 					break;
 				}
-			}
 			if (j == conf->copies) {
 				/* Cannot recover, so abort the recovery */
 				put_buf(r10_bio);
@@ -1977,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
 				       &sync_blocks, mddev->degraded) &&
-		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
+						 &mddev->recovery)) {
 			/* We can skip this block */
 			*skipped = 1;
 			return sync_blocks + sectors_skipped;
@@ -2022,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 		for (i=0; i<conf->copies; i++) {
 			int d = r10_bio->devs[i].devnum;
 			if (r10_bio->devs[i].bio->bi_end_io)
-				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+				rdev_dec_pending(conf->mirrors[d].rdev,
+						 mddev);
 		}
 		put_buf(r10_bio);
 		biolist = NULL;
@@ -2047,18 +2028,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 	do {
 		struct page *page;
 		int len = PAGE_SIZE;
-		disk = 0;
 		if (sector_nr + (len>>9) > max_sector)
 			len = (max_sector - sector_nr) << 9;
 		if (len == 0)
 			break;
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
+			struct bio *bio2;
 			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
-			if (bio_add_page(bio, page, len, 0) == 0) {
+			if (bio_add_page(bio, page, len, 0))
+				continue;
+
 			/* stop here */
-			struct bio *bio2;
 			bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
-			for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
+			for (bio2 = biolist;
+			     bio2 && bio2 != bio;
+			     bio2 = bio2->bi_next) {
 				/* remove last page from this bio */
 				bio2->bi_vcnt--;
 				bio2->bi_size -= len;
@@ -2066,8 +2050,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 			}
 			goto bio_full;
 		}
-			disk = i;
-		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
 	} while (biolist->bi_vcnt < RESYNC_PAGES);
drivers/md/raid5.c
@@ -1700,8 +1700,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	raid5_conf_t *conf = mddev->private;
 	pr_debug("raid456: error called\n");
 
-	if (!test_bit(Faulty, &rdev->flags)) {
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
 		unsigned long flags;
 		spin_lock_irqsave(&conf->device_lock, flags);
@@ -1713,6 +1711,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	}
 	set_bit(Faulty, &rdev->flags);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	printk(KERN_ALERT
 	       "md/raid:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid:%s: Operation continuing on %d devices.\n",
@@ -1720,7 +1719,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	       bdevname(rdev->bdev, b),
 	       mdname(mddev),
 	       conf->raid_disks - mddev->degraded);
-	}
 }
 
 /*
@@ -5391,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
-	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+	if (sectors > mddev->dev_sectors &&
+	    mddev->recovery_cp > mddev->dev_sectors) {
 		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}