diff --git a/block/badblocks.c b/block/badblocks.c
index d39056630d9c1de07d3923daeb8b80e6ab6a086e..b387109df02e2b1500b94fb66d1dda9da3b49be7 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -165,7 +165,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 {
 	u64 *p;
 	int lo, hi;
-	int rv = 0;
+	int rv = 0, changed = 0;
 	unsigned long flags;
 
 	if (bb->shift < 0)
@@ -230,6 +230,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 				s = a + BB_MAX_LEN;
 			}
 			sectors = e - s;
+			changed = 1;
 		}
 	}
 	if (sectors && hi < bb->count) {
@@ -260,24 +261,24 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 			sectors = e - s;
 			lo = hi;
 			hi++;
+			changed = 1;
 		}
 	}
 	if (sectors == 0 && hi < bb->count) {
 		/* we might be able to combine lo and hi */
 		/* Note: 's' is at the end of 'lo' */
-		sector_t a = BB_OFFSET(p[hi]);
-		int lolen = BB_LEN(p[lo]);
-		int hilen = BB_LEN(p[hi]);
-		int newlen = lolen + hilen - (s - a);
+		sector_t a = BB_OFFSET(p[lo]);
+		int newlen = max(s, BB_OFFSET(p[hi]) + BB_LEN(p[hi])) - a;
 
-		if (s >= a && newlen < BB_MAX_LEN) {
+		if (s >= BB_OFFSET(p[hi]) && newlen < BB_MAX_LEN) {
 			/* yes, we can combine them */
 			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
 
-			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+			p[lo] = BB_MAKE(a, newlen, ack);
 			memmove(p + hi, p + hi + 1,
 				(bb->count - hi - 1) * 8);
 			bb->count--;
+			changed = 1;
 		}
 	}
 	while (sectors) {
@@ -300,14 +301,18 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
 			sectors -= this_sectors;
 			s += this_sectors;
+			hi++;
+			changed = 1;
 		}
 	}
 
-	bb->changed = 1;
-	if (!acknowledged)
-		bb->unacked_exist = 1;
-	else
-		badblocks_update_acked(bb);
+	if (changed) {
+		bb->changed = changed;
+		if (!acknowledged)
+			bb->unacked_exist = 1;
+		else
+			badblocks_update_acked(bb);
+	}
 	write_sequnlock_irqrestore(&bb->lock, flags);
 
 	return rv;
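Why the old merge math in badblocks_set() could lose sectors is easiest to see numerically. The sketch below is a userspace model, assuming the usual entry layout from include/linux/badblocks.h (bit 63 = ack, bits 62..9 = start sector, bits 8..0 = length minus 1, BB_MAX_LEN = 512); it replays the lo/hi combine step for a 'hi' range that is already contained inside 'lo'.

/*
 * Userspace model of the lo/hi merge in badblocks_set().
 * Build: cc -o bb_merge bb_merge.c && ./bb_merge
 */
#include <stdint.h>
#include <stdio.h>

#define BB_MAX_LEN 512
#define BB_OFFSET(x) (((x) & ~(1ULL << 63)) >> 9)
#define BB_LEN(x)    (((x) & 511) + 1)
#define BB_MAKE(a, l, ack) \
	(((uint64_t)(a) << 9) | ((l) - 1) | ((uint64_t)!!(ack) << 63))

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }

int main(void)
{
	/* lo spans sectors [0, 512), hi spans [10, 20): hi is contained */
	uint64_t lo = BB_MAKE(0, 512, 1);
	uint64_t hi = BB_MAKE(10, 10, 1);
	uint64_t s = BB_OFFSET(lo) + BB_LEN(lo);	/* 512, end of lo */

	/* old math: lolen + hilen - (s - BB_OFFSET(hi)) = 512+10-502 = 20 */
	int old_newlen = BB_LEN(lo) + BB_LEN(hi) - (int)(s - BB_OFFSET(hi));

	/* new math: union of both ranges measured from lo's start = 512 */
	int new_newlen = (int)(max64(s, BB_OFFSET(hi) + BB_LEN(hi))
			       - BB_OFFSET(lo));

	printf("old newlen = %d, new newlen = %d\n", old_newlen, new_newlen);
	return 0;
}

With the old formula the two entries would have been combined into a 20-sector record, silently dropping sectors 20..511 from the table; the rewritten computation takes the union of the two ranges, and the new 'changed' flag ensures bb->changed and the unacked/acked bookkeeping are only touched when the table was actually modified.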
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index c87320fa221e6d6c112352c570306780974f8186..81dd3b02b36ad8a29afb0ec25e4de74a1846734c 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2414,6 +2414,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
 	u32 hwi, adj_step;
 	s64 margin;
 	u64 cost, new_inuse;
+	unsigned long flags;
 
 	current_hweight(iocg, NULL, &hwi);
 	old_hwi = hwi;
@@ -2432,11 +2433,11 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
 	    iocg->inuse == iocg->active)
 		return cost;
 
-	spin_lock_irq(&ioc->lock);
+	spin_lock_irqsave(&ioc->lock, flags);
 
 	/* we own inuse only when @iocg is in the normal active state */
 	if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
-		spin_unlock_irq(&ioc->lock);
+		spin_unlock_irqrestore(&ioc->lock, flags);
 		return cost;
 	}
 
@@ -2457,7 +2458,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
 	} while (time_after64(vtime + cost, now->vnow) &&
 		 iocg->inuse != iocg->active);
 
-	spin_unlock_irq(&ioc->lock);
+	spin_unlock_irqrestore(&ioc->lock, flags);
 
 	TRACE_IOCG_PATH(inuse_adjust, iocg, now,
 			old_inuse, iocg->inuse, old_hwi, hwi);
@@ -2873,15 +2874,21 @@ static int blk_iocost_init(struct request_queue *q)
 	 * called before policy activation completion, can't assume that the
 	 * target bio has an iocg associated and need to test for NULL iocg.
 	 */
-	rq_qos_add(q, rqos);
+	ret = rq_qos_add(q, rqos);
+	if (ret)
+		goto err_free_ioc;
+
 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
-	if (ret) {
-		rq_qos_del(q, rqos);
-		free_percpu(ioc->pcpu_stat);
-		kfree(ioc);
-		return ret;
-	}
+	if (ret)
+		goto err_del_qos;
 	return 0;
+
+err_del_qos:
+	rq_qos_del(q, rqos);
+err_free_ioc:
+	free_percpu(ioc->pcpu_stat);
+	kfree(ioc);
+	return ret;
 }
 
 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
@@ -3166,6 +3173,10 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 	disk = blkcg_conf_get_disk(&input);
 	if (IS_ERR(disk))
 		return PTR_ERR(disk);
+	if (!queue_is_mq(disk->queue)) {
+		ret = -EOPNOTSUPP;
+		goto err;
+	}
 
 	ioc = q_to_ioc(disk->queue);
 	if (!ioc) {
@@ -3333,6 +3344,10 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	disk = blkcg_conf_get_disk(&input);
 	if (IS_ERR(disk))
 		return PTR_ERR(disk);
+	if (!queue_is_mq(disk->queue)) {
+		ret = -EOPNOTSUPP;
+		goto err;
+	}
 
 	ioc = q_to_ioc(disk->queue);
 	if (!ioc) {
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 74511a060d598513433f043cdbceb0588a2fb027..9811ee74b69f51253000091a912e6df4b9641c48 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -772,19 +772,23 @@ int blk_iolatency_init(struct request_queue *q)
 	rqos->ops = &blkcg_iolatency_ops;
 	rqos->q = q;
 
-	rq_qos_add(q, rqos);
-
+	ret = rq_qos_add(q, rqos);
+	if (ret)
+		goto err_free;
 	ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
-	if (ret) {
-		rq_qos_del(q, rqos);
-		kfree(blkiolat);
-		return ret;
-	}
+	if (ret)
+		goto err_qos_del;
 
 	timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
 	INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
 
 	return 0;
+
+err_qos_del:
+	rq_qos_del(q, rqos);
+err_free:
+	kfree(blkiolat);
+	return ret;
 }
 
 static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 2bcb3495e376b5b63c56b4a0501532fec80996fe..37c59d7d6ba7f2333ed37ca7d2653173c79d7945 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -98,7 +98,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
 	init_waitqueue_head(&rq_wait->wait);
 }
 
-static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
 {
 	/*
 	 * No IO can be in-flight when adding rqos, so freeze queue, which
@@ -110,6 +110,8 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
 	blk_mq_freeze_queue(q);
 
 	spin_lock_irq(&q->queue_lock);
+	if (rq_qos_id(q, rqos->id))
+		goto ebusy;
 	rqos->next = q->rq_qos;
 	q->rq_qos = rqos;
 	spin_unlock_irq(&q->queue_lock);
@@ -118,6 +120,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
 
 	if (rqos->ops->debugfs_attrs)
 		blk_mq_debugfs_register_rqos(rqos);
+
+	return 0;
+ebusy:
+	spin_unlock_irq(&q->queue_lock);
+	blk_mq_unfreeze_queue(q);
+	return -EBUSY;
+
 }
 
 static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
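All three callers (iocost, iolatency, and wbt below) now unwind through goto labels when rq_qos_add() reports a duplicate policy, instead of blindly linking a second rq_qos of the same id onto the queue. The toy below models just that check-then-link step; the types are simplified stand-ins, and in the real function the list walk runs under q->queue_lock with the queue frozen.

/* Toy model of rq_qos_add()'s duplicate check: one list per "queue",
 * ids must be unique, and the add either links the node or reports
 * -EBUSY so the caller can unwind. Names here are illustrative only. */
#include <errno.h>
#include <stdio.h>

struct rq_qos {
	int id;
	struct rq_qos *next;
};

struct request_queue {
	struct rq_qos *rq_qos;	/* head of the singly linked policy list */
};

static struct rq_qos *rq_qos_id(struct request_queue *q, int id)
{
	struct rq_qos *rqos;

	for (rqos = q->rq_qos; rqos; rqos = rqos->next)
		if (rqos->id == id)
			return rqos;
	return NULL;
}

static int rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
{
	/* in the kernel this runs under q->queue_lock, queue frozen */
	if (rq_qos_id(q, rqos->id))
		return -EBUSY;
	rqos->next = q->rq_qos;
	q->rq_qos = rqos;
	return 0;
}

int main(void)
{
	struct request_queue q = { 0 };
	struct rq_qos wbt = { .id = 1 }, wbt_again = { .id = 1 };

	printf("first add:  %d\n", rq_qos_add(&q, &wbt));	/* 0 */
	printf("second add: %d\n", rq_qos_add(&q, &wbt_again));	/* -EBUSY */
	return 0;
}

Returning -EBUSY pushes the "already registered" decision to the caller, which is what lets the init functions free their half-constructed state instead of leaking it.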
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 6f63920f073c6b1bda014ad22f1895e1d7f57578..28eb25b947cd2008d736428423c4dd7875960ad3 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -818,6 +818,7 @@ int wbt_init(struct request_queue *q)
 {
 	struct rq_wb *rwb;
 	int i;
+	int ret;
 
 	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
 	if (!rwb)
@@ -847,8 +848,17 @@ int wbt_init(struct request_queue *q)
 	/*
 	 * Assign rwb and add the stats callback.
 	 */
-	rq_qos_add(q, &rwb->rqos);
+	ret = rq_qos_add(q, &rwb->rqos);
+	if (ret)
+		goto err_free;
+
 	blk_stat_add_callback(q, rwb->cb);
 
 	return 0;
+
+err_free:
+	blk_stat_free_callback(rwb->cb);
+	kfree(rwb);
+	return ret;
+
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 61f68689ddfda994cedb8a2c7dcc9e652ed7ad68..4563ef0df4c5f0b171331fcff1bdee4a8f54f5ba 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2402,8 +2402,9 @@ EXPORT_SYMBOL(md_integrity_add_rdev);
 
 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 {
-	char b[BDEVNAME_SIZE];
+	char b[BDEVNAME_SIZE + 4];
 	struct kobject *ko;
+	struct kernfs_node *sysfs_rdev;
 	int err;
 
 	/* prevent duplicates */
@@ -2454,7 +2455,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 			mdname(mddev), mddev->max_disks);
 		return -EBUSY;
 	}
-	bdevname(rdev->bdev,b);
+	memcpy(b, "dev-", 4);
+	bdevname(rdev->bdev, b + 4);
 	strreplace(b, '/', '!');
 
 	rdev->mddev = mddev;
@@ -2463,7 +2465,15 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	if (mddev->raid_disks)
 		mddev_create_serial_pool(mddev, rdev, false);
 
-	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
+	sysfs_rdev = sysfs_get_dirent_safe(mddev->kobj.sd, b);
+	if (sysfs_rdev) {
+		sysfs_put(sysfs_rdev);
+		err = -EBUSY;
+		goto fail;
+	}
+
+	err = kobject_add(&rdev->kobj, &mddev->kobj, b);
+	if (err)
 		goto fail;
 
 	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
@@ -2484,7 +2494,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	return 0;
 
  fail:
-	pr_warn("md: failed to register dev-%s for %s\n",
+	pr_warn("md: failed to register %s for %s\n",
 		b, mdname(mddev));
 	return err;
 }
@@ -4592,20 +4602,6 @@ null_show(struct mddev *mddev, char *page)
 	return -EINVAL;
 }
 
-/* need to ensure rdev_delayed_delete() has completed */
-static void flush_rdev_wq(struct mddev *mddev)
-{
-	struct md_rdev *rdev;
-
-	rcu_read_lock();
-	rdev_for_each_rcu(rdev, mddev)
-		if (work_pending(&rdev->del_work)) {
-			flush_workqueue(md_rdev_misc_wq);
-			break;
-		}
-	rcu_read_unlock();
-}
-
 static ssize_t
 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 {
@@ -4633,7 +4629,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 	    minor != MINOR(dev))
 		return -EOVERFLOW;
 
-	flush_rdev_wq(mddev);
+	flush_workqueue(md_rdev_misc_wq);
 	err = mddev_lock(mddev);
 	if (err)
 		return err;
@@ -5743,6 +5739,7 @@ static int md_alloc(dev_t dev, char *name)
 	 * completely removed (mddev_delayed_delete).
 	 */
 	flush_workqueue(md_misc_wq);
+	flush_workqueue(md_rdev_misc_wq);
 
 	mutex_lock(&disks_mutex);
 	error = -EEXIST;
@@ -7646,7 +7643,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 
 	if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
-		flush_rdev_wq(mddev);
+		flush_workqueue(md_rdev_misc_wq);
 
 	if (cmd == HOT_REMOVE_DISK)
 		/* need to ensure recovery thread has run */
@@ -9581,12 +9578,13 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 {
 	struct mddev *mddev = rdev->mddev;
 	int rv;
+
 	if (is_new)
 		s += rdev->new_data_offset;
 	else
 		s += rdev->data_offset;
 	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
-	if (rv == 0) {
+	if (rdev->badblocks.changed) {
 		/* Make sure they get written out promptly */
 		if (test_bit(ExternalBbl, &rdev->flags))
 			sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
@@ -9594,9 +9592,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		set_mask_bits(&mddev->sb_flags, 0,
 			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
 		md_wakeup_thread(rdev->mddev->thread);
-		return 1;
-	} else
-		return 0;
+	}
+	return !rv;
 }
 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8780c95f9b86ecddd24b038d286c182fc18f34c2..00915e6ec4106ab8bbd70d507465439020ad0ff1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3158,6 +3158,7 @@ static int raid1_run(struct mddev *mddev)
 	 * RAID1 needs at least one disk in active
 	 */
 	if (conf->raid_disks - mddev->degraded < 1) {
+		md_unregister_thread(&conf->thread);
 		ret = -EINVAL;
 		goto abort;
 	}
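The rdev_set_badblocks() rewrite decouples "the sectors are recorded" (the return value, now !rv) from "the table changed" (rdev->badblocks.changed), so re-recording an already-known range succeeds without dirtying the superblock or waking the md thread. A compact model of that contract, with simplified stand-in types:

/* Sketch of the caller-visible contract after the change above:
 * badblocks_set() returns 0 on success and only sets ->changed when
 * the table was really modified. Types and values are stand-ins. */
#include <stdio.h>

struct badblocks { int changed; };

/* pretend the range was already present: success, but nothing changed */
static int badblocks_set(struct badblocks *bb, long s, int sectors, int ack)
{
	(void)s; (void)sectors; (void)ack;
	bb->changed = 0;	/* a genuinely new range would set this to 1 */
	return 0;		/* 0 = recorded (or already present) */
}

static int rdev_set_badblocks(struct badblocks *bb, long s, int sectors)
{
	int rv = badblocks_set(bb, s, sectors, 0);

	if (bb->changed) {
		/* only now: notify sysfs, dirty superblock, wake thread */
		printf("superblock marked dirty\n");
	}
	return !rv;	/* 1 = sectors are recorded as bad, 0 = failure */
}

int main(void)
{
	struct badblocks bb = { 0 };

	printf("rv = %d\n", rdev_set_badblocks(&bb, 100, 8));
	return 0;
}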
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 04869394e3458e038a7993c54e327bef9c576536..babc5d29f0e36131b0a202efcdb6bde09a96894b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -441,47 +441,50 @@ static void raid10_end_write_request(struct bio *bio)
 	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
-	if (repl)
-		rdev = conf->mirrors[dev].replacement;
-	if (!rdev) {
-		smp_rmb();
-		repl = 0;
-		rdev = conf->mirrors[dev].rdev;
+	if (repl) {
+		rdev = r10_bio->devs[slot].replacement;
+		if (rdev == conf->mirrors[dev].replacement) {
+			if (bio->bi_status && !discard_error) {
+				/*
+				 * Never record new bad blocks to replacement,
+				 * just fail it.
+				 */
+				md_error(rdev->mddev, rdev);
+				goto out;
+			}
+		}
+	} else {
+		rdev = r10_bio->devs[slot].rdev;
 	}
 
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
 	if (bio->bi_status && !discard_error) {
-		if (repl)
-			/* Never record new bad blocks to replacement,
-			 * just fail it.
-			 */
-			md_error(rdev->mddev, rdev);
-		else {
-			set_bit(WriteErrorSeen, &rdev->flags);
-			if (!test_and_set_bit(WantReplacement, &rdev->flags))
-				set_bit(MD_RECOVERY_NEEDED,
-					&rdev->mddev->recovery);
+		set_bit(WriteErrorSeen, &rdev->flags);
+		if (!test_and_set_bit(WantReplacement, &rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED,
+				&rdev->mddev->recovery);
 
-			dec_rdev = 0;
-			if (test_bit(FailFast, &rdev->flags) &&
-			    (bio->bi_opf & MD_FAILFAST)) {
-				md_error(rdev->mddev, rdev);
-			}
+		dec_rdev = 0;
+		if (test_bit(FailFast, &rdev->flags) &&
+		    (bio->bi_opf & MD_FAILFAST))
+			md_error(rdev->mddev, rdev);
 
-			/*
-			 * When the device is faulty, it is not necessary to
-			 * handle write error.
-			 */
-			if (!test_bit(Faulty, &rdev->flags))
-				set_bit(R10BIO_WriteError, &r10_bio->state);
-			else {
-				/* Fail the request */
-				set_bit(R10BIO_Degraded, &r10_bio->state);
+		/*
+		 * When the device is faulty, it is not necessary to
+		 * handle write error.
+		 */
+		if (!test_bit(Faulty, &rdev->flags)) {
+			set_bit(R10BIO_WriteError, &r10_bio->state);
+		} else {
+			/* Fail the request */
+			set_bit(R10BIO_Degraded, &r10_bio->state);
+			if (repl)
+				r10_bio->devs[slot].repl_bio = NULL;
+			else
 				r10_bio->devs[slot].bio = NULL;
-				to_put = bio;
-				dec_rdev = 1;
-			}
+			to_put = bio;
+			dec_rdev = 1;
 		}
 	} else {
 		/*
@@ -513,16 +516,17 @@ static void raid10_end_write_request(struct bio *bio)
 				r10_bio->devs[slot].addr,
 				r10_bio->sectors,
 				&first_bad, &bad_sectors) && !discard_error) {
-			bio_put(bio);
 			if (repl)
 				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
 			else
 				r10_bio->devs[slot].bio = IO_MADE_GOOD;
+			bio_put(bio);
 			dec_rdev = 0;
 			set_bit(R10BIO_MadeGood, &r10_bio->state);
 		}
 	}
 
+out:
 	/*
	 *
 	 * Let's see if all mirrored write operations have finished
@@ -753,9 +757,19 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 		disk = r10_bio->devs[slot].devnum;
 		rdev = rcu_dereference(conf->mirrors[disk].replacement);
 		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
-		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+		    test_bit(WantRemove, &rdev->flags) ||
+		    r10_bio->devs[slot].addr + sectors >
+		    rdev->recovery_offset) {
+			/*
+			 * Read the replacement first, to prevent reading
+			 * both rdev and replacement as NULL while the
+			 * replacement is replacing rdev
+			 */
+			smp_mb();
 			rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		}
 		if (rdev == NULL ||
+		    test_bit(WantRemove, &rdev->flags) ||
 		    test_bit(Faulty, &rdev->flags))
 			continue;
 		if (!test_bit(In_sync, &rdev->flags) &&
@@ -896,6 +910,7 @@ static void flush_pending_writes(struct r10conf *conf)
 			else
 				submit_bio_noacct(bio);
 			bio = next;
+			cond_resched();
 		}
 		blk_finish_plug(&plug);
 	} else
@@ -952,36 +967,45 @@ static void lower_barrier(struct r10conf *conf)
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
+static bool stop_waiting_barrier(struct r10conf *conf)
+{
+	struct bio_list *bio_list = current->bio_list;
+
+	/* barrier is dropped */
+	if (!conf->barrier)
+		return true;
+
+	/*
+	 * If there are already pending requests (preventing the barrier from
+	 * rising completely), and the pre-process bio queue isn't empty, then
+	 * don't wait, as we need to empty that queue to get the nr_pending
+	 * count down.
+	 */
+	if (atomic_read(&conf->nr_pending) && bio_list &&
+	    (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
+		return true;
+
+	/*
+	 * move on if the io is issued from raid10d(): nr_pending is not
+	 * released by the original io (see handle_read_error()), and
+	 * raise_barrier() is blocked until this io is done.
+	 */
+	if (conf->mddev->thread->tsk == current) {
+		WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
+		return true;
+	}
+
+	return false;
+}
 
 static void wait_barrier(struct r10conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
-		struct bio_list *bio_list = current->bio_list;
 		conf->nr_waiting++;
-		/* Wait for the barrier to drop.
-		 * However if there are already pending
-		 * requests (preventing the barrier from
-		 * rising completely), and the
-		 * pre-process bio queue isn't empty,
-		 * then don't wait, as we need to empty
-		 * that queue to get the nr_pending
-		 * count down.
-		 */
 		raid10_log(conf->mddev, "wait barrier");
 		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (atomic_read(&conf->nr_pending) &&
-				     bio_list &&
-				     (!bio_list_empty(&bio_list[0]) ||
-				      !bio_list_empty(&bio_list[1]))) ||
-				    /* move on if recovery thread is
-				     * blocked by us
-				     */
-				    (conf->mddev->thread->tsk == current &&
-				     test_bit(MD_RECOVERY_RUNNING,
-					      &conf->mddev->recovery) &&
-				     conf->nr_queued > 0),
+				    stop_waiting_barrier(conf),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
@@ -1089,6 +1113,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		else
 			submit_bio_noacct(bio);
 		bio = next;
+		cond_resched();
 	}
 	kfree(plug);
 }
@@ -1227,29 +1252,21 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	int devnum = r10_bio->devs[n_copy].devnum;
 	struct bio *mbio;
 
-	if (replacement) {
-		rdev = conf->mirrors[devnum].replacement;
-		if (rdev == NULL) {
-			/* Replacement just got moved to main 'rdev' */
-			smp_mb();
-			rdev = conf->mirrors[devnum].rdev;
-		}
-	} else
-		rdev = conf->mirrors[devnum].rdev;
-
 	mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
-	if (replacement)
+	if (replacement) {
 		r10_bio->devs[n_copy].repl_bio = mbio;
-	else
+		rdev = r10_bio->devs[n_copy].replacement;
+	} else {
 		r10_bio->devs[n_copy].bio = mbio;
+		rdev = r10_bio->devs[n_copy].rdev;
+	}
 
 	mbio->bi_iter.bi_sector	= (r10_bio->devs[n_copy].addr +
 				   choose_data_offset(r10_bio, rdev));
 	bio_set_dev(mbio, rdev->bdev);
 	mbio->bi_end_io	= raid10_end_write_request;
 	bio_set_op_attrs(mbio, op, do_sync | do_fua);
-	if (!replacement && test_bit(FailFast,
-				     &conf->mirrors[devnum].rdev->flags)
+	if (!replacement && test_bit(FailFast, &rdev->flags)
 	    && enough(conf, devnum))
 		mbio->bi_opf |= MD_FAILFAST;
 	mbio->bi_private = r10_bio;
@@ -1350,9 +1367,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 
 	for (i = 0;  i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[d].replacement);
+		struct md_rdev *rrdev, *rdev;
+
+		rrdev = rcu_dereference(conf->mirrors[d].replacement);
+		/*
+		 * Read the replacement first, to prevent reading both rdev
+		 * and replacement as NULL while the replacement is replacing rdev.
+		 */
+		smp_mb();
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
 		if (rdev == rrdev)
 			rrdev = NULL;
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
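The smp_mb() added before re-reading conf->mirrors[d].rdev pairs with the barrier in the path that promotes a replacement into the main rdev slot: the writer publishes ->rdev before clearing ->replacement, and the reader loads ->replacement before ->rdev, so the two can never both be observed as NULL. The userspace sketch below shows only that ordering skeleton, with C11 seq_cst fences standing in for smp_mb()/rcu_dereference(); all names are illustrative.

/* C11 model of the replacement -> rdev publish/observe ordering. */
#include <stdatomic.h>
#include <stdio.h>

struct mirror_info {
	_Atomic(int *) rdev;
	_Atomic(int *) replacement;
};

static int disk;

static void promote_replacement(struct mirror_info *m)
{
	int *repl = atomic_load(&m->replacement);

	atomic_store(&m->rdev, repl);		   /* publish first */
	atomic_thread_fence(memory_order_seq_cst); /* kernel: smp_mb() */
	atomic_store(&m->replacement, NULL);	   /* then retire the slot */
}

static int *pick_rdev(struct mirror_info *m)
{
	int *rdev = atomic_load(&m->replacement); /* replacement first */

	if (!rdev) {
		atomic_thread_fence(memory_order_seq_cst);
		rdev = atomic_load(&m->rdev);
	}
	/* if replacement was seen as NULL, the fence pairing guarantees
	 * the earlier rdev store is visible, so rdev is non-NULL here */
	return rdev;
}

int main(void)
{
	struct mirror_info m;

	atomic_init(&m.rdev, NULL);
	atomic_init(&m.replacement, &disk);
	promote_replacement(&m);
	printf("picked %p\n", (void *)pick_rdev(&m));
	return 0;
}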
@@ -1365,9 +1388,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 			blocked_rdev = rrdev;
 			break;
 		}
-		if (rdev && (test_bit(Faulty, &rdev->flags)))
+		if (rdev && (test_bit(Faulty, &rdev->flags) ||
+			     test_bit(WantRemove, &rdev->flags)))
 			rdev = NULL;
-		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
+		if (rrdev && (test_bit(Faulty, &rrdev->flags) ||
+			      test_bit(WantRemove, &rrdev->flags)))
 			rrdev = NULL;
 
 		r10_bio->devs[i].bio = NULL;
@@ -1420,10 +1445,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		}
 		if (rdev) {
 			r10_bio->devs[i].bio = bio;
+			r10_bio->devs[i].rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
 		if (rrdev) {
 			r10_bio->devs[i].repl_bio = bio;
+			r10_bio->devs[i].replacement = rrdev;
 			atomic_inc(&rrdev->nr_pending);
 		}
 	}
@@ -1432,24 +1459,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	if (unlikely(blocked_rdev)) {
 		/* Have to wait for this device to get unblocked, then retry */
 		int j;
-		int d;
 
 		for (j = 0; j < i; j++) {
-			if (r10_bio->devs[j].bio) {
-				d = r10_bio->devs[j].devnum;
-				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
-			}
-			if (r10_bio->devs[j].repl_bio) {
-				struct md_rdev *rdev;
-				d = r10_bio->devs[j].devnum;
-				rdev = conf->mirrors[d].replacement;
-				if (!rdev) {
-					/* Race with remove_disk */
-					smp_mb();
-					rdev = conf->mirrors[d].rdev;
-				}
-				rdev_dec_pending(rdev, mddev);
-			}
+			if (r10_bio->devs[j].bio)
+				rdev_dec_pending(r10_bio->devs[j].rdev, mddev);
+			if (r10_bio->devs[j].repl_bio)
+				rdev_dec_pending(r10_bio->devs[j].replacement, mddev);
 		}
 		allow_barrier(conf);
 		raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
@@ -1745,9 +1760,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r10conf *conf = mddev->private;
 	int err = -EEXIST;
-	int mirror;
+	int mirror, repl_slot = -1;
 	int first = 0;
 	int last = conf->geo.raid_disks - 1;
+	struct raid10_info *p;
 
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
@@ -1770,23 +1786,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	else
 		mirror = first;
 	for ( ; mirror <= last ; mirror++) {
-		struct raid10_info *p = &conf->mirrors[mirror];
+		p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
 		if (p->rdev) {
-			if (!test_bit(WantReplacement, &p->rdev->flags) ||
-			    p->replacement != NULL)
-				continue;
-			clear_bit(In_sync, &rdev->flags);
-			set_bit(Replacement, &rdev->flags);
-			rdev->raid_disk = mirror;
-			err = 0;
-			if (mddev->gendisk)
-				disk_stack_limits(mddev->gendisk, rdev->bdev,
-						  rdev->data_offset << 9);
-			conf->fullsync = 1;
-			rcu_assign_pointer(p->replacement, rdev);
-			break;
+			if (test_bit(WantReplacement, &p->rdev->flags) &&
+			    p->replacement == NULL && repl_slot < 0)
+				repl_slot = mirror;
+			continue;
 		}
 
 		if (mddev->gendisk)
@@ -1796,12 +1803,28 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		p->head_position = 0;
 		p->recovery_disabled = mddev->recovery_disabled - 1;
 		rdev->raid_disk = mirror;
+		clear_bit(WantRemove, &rdev->flags);
 		err = 0;
 		if (rdev->saved_raid_disk != mirror)
 			conf->fullsync = 1;
 		rcu_assign_pointer(p->rdev, rdev);
 		break;
 	}
+
+	if (err && repl_slot >= 0) {
+		p = &conf->mirrors[repl_slot];
+		clear_bit(In_sync, &rdev->flags);
+		set_bit(Replacement, &rdev->flags);
+		clear_bit(WantRemove, &rdev->flags);
+		rdev->raid_disk = repl_slot;
+		err = 0;
+		if (mddev->gendisk)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+		conf->fullsync = 1;
+		rcu_assign_pointer(p->replacement, rdev);
+	}
+
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
@@ -1844,16 +1867,22 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		err = -EBUSY;
 		goto abort;
 	}
-	*rdevp = NULL;
+	/*
+	 * Before setting p->rdev to NULL, set the WantRemove bit, to avoid
+	 * a race between rdev removal and bio submission that could lead to
+	 * a NULL pointer dereference of rdev via conf->mirrors[i].rdev.
+	 */
+	set_bit(WantRemove, &rdev->flags);
 	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
 		synchronize_rcu();
 		if (atomic_read(&rdev->nr_pending)) {
 			/* lost the race, try later */
 			err = -EBUSY;
-			*rdevp = rdev;
+			md_error(rdev->mddev, rdev);
 			goto abort;
 		}
 	}
+	*rdevp = NULL;
 	if (p->replacement) {
 		/* We must have just cleared 'rdev' */
 		p->rdev = p->replacement;
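WantRemove (an rdev flag introduced elsewhere in this patchset; its bit definition is not part of this excerpt) turns removal into a three-step handshake: mark the rdev, wait out a grace period so new submitters see the mark, then check nr_pending before clearing the slot. A condensed, single-threaded model of that sequence, with the RCU machinery stubbed out and illustrative names throughout:

/* Condensed model of the raid10_remove_disk() ordering above. */
#include <errno.h>
#include <stdio.h>

struct md_rdev {
	int want_remove;	/* kernel: test_bit(WantRemove, &rdev->flags) */
	int nr_pending;		/* kernel: atomic_t nr_pending */
};

struct mirror_info { struct md_rdev *rdev; };

static void synchronize_grace_period(void)
{
	/* kernel: synchronize_rcu(); new submitters now see want_remove */
}

static int remove_disk(struct mirror_info *p)
{
	struct md_rdev *rdev = p->rdev;

	rdev->want_remove = 1;		/* 1: refuse new submitters */
	synchronize_grace_period();	/* 2: wait out racing readers */
	if (rdev->nr_pending)		/* 3: lost the race, try later */
		return -EBUSY;
	p->rdev = NULL;			/* 4: now safe to clear the slot */
	return 0;
}

int main(void)
{
	struct md_rdev rdev = { .nr_pending = 1 };
	struct mirror_info p = { .rdev = &rdev };

	printf("busy remove:  %d\n", remove_disk(&p));	/* -EBUSY */
	rdev.nr_pending = 0;
	printf("clean remove: %d\n", remove_disk(&p));	/* 0 */
	return 0;
}

Note that in the patch itself the lost-race path now calls md_error() rather than restoring *rdevp, so a device that keeps taking io while being removed is failed instead of silently reinstated.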
@@ -2598,9 +2627,13 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 		md_error(mddev, rdev);
 
 	rdev_dec_pending(rdev, mddev);
-	allow_barrier(conf);
 	r10_bio->state = 0;
 	raid10_read_request(mddev, r10_bio->master_bio, r10_bio, true);
+	/*
+	 * Call allow_barrier() after re-submitting, to ensure no sync io
+	 * can be issued while regular io is still pending.
+	 */
+	allow_barrier(conf);
 }
 
 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
@@ -2656,9 +2689,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 	} else {
 		bool fail = false;
 		for (m = 0; m < conf->copies; m++) {
-			int dev = r10_bio->devs[m].devnum;
 			struct bio *bio = r10_bio->devs[m].bio;
-			rdev = conf->mirrors[dev].rdev;
+			rdev = r10_bio->devs[m].rdev;
 			if (bio == IO_MADE_GOOD) {
 				rdev_clear_badblocks(
 					rdev,
@@ -2675,7 +2707,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 			bio = r10_bio->devs[m].repl_bio;
-			rdev = conf->mirrors[dev].replacement;
+			rdev = r10_bio->devs[m].replacement;
 			if (rdev && bio == IO_MADE_GOOD) {
 				rdev_clear_badblocks(
 					rdev,
@@ -2908,10 +2940,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	int chunks_skipped = 0;
 	sector_t chunk_mask = conf->geo.chunk_mask;
 	int page_idx = 0;
-
-	if (!mempool_initialized(&conf->r10buf_pool))
-		if (init_resync(conf))
-			return 0;
+	int error_disk = -1;
 
 	/*
 	 * Allow skipping a full rebuild for incremental assembly
@@ -2928,6 +2957,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		return mddev->dev_sectors - sector_nr;
 	}
 
+	if (!mempool_initialized(&conf->r10buf_pool))
+		if (init_resync(conf))
+			return 0;
+
  skipped:
 	max_sector = mddev->dev_sectors;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -2991,7 +3024,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		return reshape_request(mddev, sector_nr, skipped);
 
 	if (chunks_skipped >= conf->geo.raid_disks) {
-		/* if there has been nothing to do on any drive,
+		pr_err("md/raid10:%s: %s fail\n", mdname(mddev),
+		       test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery");
+		if (error_disk >= 0 && !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			/*
+			 * Recovery failed: set mirrors.recovery_disabled so
+			 * the device won't be re-added there.
+			 */
+			conf->mirrors[error_disk].recovery_disabled = mddev->recovery_disabled;
+			return 0;
+		}
+		/*
+		 * if there has been nothing to do on any drive,
 		 * then there is nothing to do at all..
 		 */
 		*skipped = 1;
@@ -3058,6 +3102,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			if (mreplace != NULL &&
 			    !test_bit(Faulty, &mreplace->flags))
 				need_replace = 1;
+			else
+				mreplace = NULL;
 
 			if (!need_recover && !need_replace) {
 				rcu_read_unlock();
@@ -3075,8 +3121,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				rcu_read_unlock();
 				continue;
 			}
-			if (mreplace && test_bit(Faulty, &mreplace->flags))
-				mreplace = NULL;
 			/* Unless we are doing a full sync, or a replacement
 			 * we only need to recover the block if it is set in
 			 * the bitmap
@@ -3248,6 +3292,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 					       mdname(mddev));
 					mirror->recovery_disabled
 						= mddev->recovery_disabled;
+				} else {
+					error_disk = i;
 				}
 				put_buf(r10_bio);
 				if (rb2)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 5420250d4bd63a59bfd40eacaf7f0baeaa7e21f4..73d243e12363fbb97c8bfd142d94ec80dda0fc7e 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -146,12 +146,12 @@ struct r10bio {
 	 */
 	struct r10dev {
 		struct bio	*bio;
-		union {
-			struct bio	*repl_bio; /* used for resync and
-						    * writes */
-			struct md_rdev	*rdev;	   /* used for reads
-						    * (read_slot >= 0) */
-		};
+		/* Currently just used for normal reads and writes */
+		struct md_rdev	*rdev;
+		/* used for resync and writes */
+		struct bio	*repl_bio;
+		/* Currently just used for normal writes */
+		struct md_rdev	*replacement;
 		sector_t	addr;
 		int		devnum;
 	} devs[];
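The raid10.h change is what makes the raid10.c conversions above safe: repl_bio and rdev used to share storage in a union, so any path that cached devs[i].rdev for the life of the r10bio would clobber a live repl_bio. A plain C demonstration of the aliasing hazard and the new layout (trimmed-down stand-in types):

/* Why struct r10dev could not keep the union: writing devs[i].rdev
 * and devs[i].repl_bio must not alias once the write path caches both
 * the rdev and the replacement for the life of the r10bio. */
#include <stdio.h>

struct bio { int dummy; };
struct md_rdev { int dummy; };

struct r10dev_old {
	struct bio *bio;
	union {			/* one slot, two meanings */
		struct bio *repl_bio;
		struct md_rdev *rdev;
	};
};

struct r10dev_new {
	struct bio *bio;
	struct md_rdev *rdev;		/* normal reads and writes */
	struct bio *repl_bio;		/* resync and writes */
	struct md_rdev *replacement;	/* normal writes */
};

int main(void)
{
	struct bio rbio;
	struct md_rdev rdev;
	struct r10dev_old old = { 0 };

	old.repl_bio = &rbio;
	old.rdev = &rdev;	/* silently overwrites repl_bio */
	printf("old union: repl_bio now %p (was %p)\n",
	       (void *)old.repl_bio, (void *)&rbio);

	struct r10dev_new new = { .repl_bio = &rbio, .rdev = &rdev };
	printf("new struct: repl_bio %p and rdev %p coexist\n",
	       (void *)new.repl_bio, (void *)new.rdev);
	return 0;
}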