Unverified commit 4ac8d141, authored by openeuler-ci-bot, committed by Gitee

!903 backport block bugfix

Merge Pull Request from: @zhangjialin11 
 
This patch series fixes block layer bugs.
Three patches fix iocost bugs; the other patches fix raid10 and badblocks bugs.
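
The common thread in the iocost, iolatency and wbt changes below is that rq_qos_add() now returns -EBUSY when a QoS policy with the same id is already attached to the queue, and every caller checks the return value and unwinds its partial setup instead of assuming success. The following is a minimal, self-contained sketch of that error-handling pattern; it is a simplified userspace model for illustration only, and the structure members and helper names here are stand-ins, not the kernel definitions.

```c
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* Stand-ins for the kernel structures; not the real definitions. */
struct rq_qos { int id; struct rq_qos *next; };
struct request_queue { struct rq_qos *rq_qos; };

/* Sketch of the reworked rq_qos_add(): refuse to attach the same id twice. */
static int rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
{
	for (struct rq_qos *cur = q->rq_qos; cur; cur = cur->next)
		if (cur->id == rqos->id)
			return -EBUSY;
	rqos->next = q->rq_qos;
	q->rq_qos = rqos;
	return 0;
}

/*
 * Caller pattern used by blk_iocost_init()/blk_iolatency_init()/wbt_init()
 * in this series: check the return value and unwind via goto labels.
 */
static int policy_init(struct request_queue *q, int id)
{
	struct rq_qos *rqos = calloc(1, sizeof(*rqos));
	int ret;

	if (!rqos)
		return -ENOMEM;
	rqos->id = id;

	ret = rq_qos_add(q, rqos);
	if (ret)
		goto err_free;
	return 0;

err_free:
	free(rqos);
	return ret;
}

int main(void)
{
	struct request_queue q = { 0 };

	printf("first add: %d\n", policy_init(&q, 1));  /* 0 */
	printf("second add: %d\n", policy_init(&q, 1)); /* -EBUSY */
	return 0;
}
```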
 
 
Link: https://gitee.com/openeuler/kernel/pulls/903

Reviewed-by: Zheng Zengkai <zhengzengkai@huawei.com> 
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> 
...@@ -165,7 +165,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors, ...@@ -165,7 +165,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
{ {
u64 *p; u64 *p;
int lo, hi; int lo, hi;
int rv = 0; int rv = 0, changed = 0;
unsigned long flags; unsigned long flags;
if (bb->shift < 0) if (bb->shift < 0)
...@@ -230,6 +230,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors, ...@@ -230,6 +230,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
s = a + BB_MAX_LEN; s = a + BB_MAX_LEN;
} }
sectors = e - s; sectors = e - s;
changed = 1;
} }
} }
if (sectors && hi < bb->count) { if (sectors && hi < bb->count) {
...@@ -260,24 +261,24 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors, ...@@ -260,24 +261,24 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
sectors = e - s; sectors = e - s;
lo = hi; lo = hi;
hi++; hi++;
changed = 1;
} }
} }
if (sectors == 0 && hi < bb->count) { if (sectors == 0 && hi < bb->count) {
/* we might be able to combine lo and hi */ /* we might be able to combine lo and hi */
/* Note: 's' is at the end of 'lo' */ /* Note: 's' is at the end of 'lo' */
sector_t a = BB_OFFSET(p[hi]); sector_t a = BB_OFFSET(p[lo]);
int lolen = BB_LEN(p[lo]); int newlen = max(s, BB_OFFSET(p[hi]) + BB_LEN(p[hi])) - a;
int hilen = BB_LEN(p[hi]);
int newlen = lolen + hilen - (s - a);
if (s >= a && newlen < BB_MAX_LEN) { if (s >= BB_OFFSET(p[hi]) && newlen < BB_MAX_LEN) {
/* yes, we can combine them */ /* yes, we can combine them */
int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); p[lo] = BB_MAKE(a, newlen, ack);
memmove(p + hi, p + hi + 1, memmove(p + hi, p + hi + 1,
(bb->count - hi - 1) * 8); (bb->count - hi - 1) * 8);
bb->count--; bb->count--;
changed = 1;
} }
} }
while (sectors) { while (sectors) {
...@@ -300,14 +301,18 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors, ...@@ -300,14 +301,18 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
p[hi] = BB_MAKE(s, this_sectors, acknowledged); p[hi] = BB_MAKE(s, this_sectors, acknowledged);
sectors -= this_sectors; sectors -= this_sectors;
s += this_sectors; s += this_sectors;
hi++;
changed = 1;
} }
} }
bb->changed = 1; if (changed) {
if (!acknowledged) bb->changed = changed;
bb->unacked_exist = 1; if (!acknowledged)
else bb->unacked_exist = 1;
badblocks_update_acked(bb); else
badblocks_update_acked(bb);
}
write_sequnlock_irqrestore(&bb->lock, flags); write_sequnlock_irqrestore(&bb->lock, flags);
return rv; return rv;
......
...@@ -2414,6 +2414,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, ...@@ -2414,6 +2414,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
u32 hwi, adj_step; u32 hwi, adj_step;
s64 margin; s64 margin;
u64 cost, new_inuse; u64 cost, new_inuse;
unsigned long flags;
current_hweight(iocg, NULL, &hwi); current_hweight(iocg, NULL, &hwi);
old_hwi = hwi; old_hwi = hwi;
...@@ -2432,11 +2433,11 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, ...@@ -2432,11 +2433,11 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
iocg->inuse == iocg->active) iocg->inuse == iocg->active)
return cost; return cost;
spin_lock_irq(&ioc->lock); spin_lock_irqsave(&ioc->lock, flags);
/* we own inuse only when @iocg is in the normal active state */ /* we own inuse only when @iocg is in the normal active state */
if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
spin_unlock_irq(&ioc->lock); spin_unlock_irqrestore(&ioc->lock, flags);
return cost; return cost;
} }
...@@ -2457,7 +2458,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, ...@@ -2457,7 +2458,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
} while (time_after64(vtime + cost, now->vnow) && } while (time_after64(vtime + cost, now->vnow) &&
iocg->inuse != iocg->active); iocg->inuse != iocg->active);
spin_unlock_irq(&ioc->lock); spin_unlock_irqrestore(&ioc->lock, flags);
TRACE_IOCG_PATH(inuse_adjust, iocg, now, TRACE_IOCG_PATH(inuse_adjust, iocg, now,
old_inuse, iocg->inuse, old_hwi, hwi); old_inuse, iocg->inuse, old_hwi, hwi);
...@@ -2873,15 +2874,21 @@ static int blk_iocost_init(struct request_queue *q) ...@@ -2873,15 +2874,21 @@ static int blk_iocost_init(struct request_queue *q)
* called before policy activation completion, can't assume that the * called before policy activation completion, can't assume that the
* target bio has an iocg associated and need to test for NULL iocg. * target bio has an iocg associated and need to test for NULL iocg.
*/ */
rq_qos_add(q, rqos); ret = rq_qos_add(q, rqos);
if (ret)
goto err_free_ioc;
ret = blkcg_activate_policy(q, &blkcg_policy_iocost); ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
if (ret) { if (ret)
rq_qos_del(q, rqos); goto err_del_qos;
free_percpu(ioc->pcpu_stat);
kfree(ioc);
return ret;
}
return 0; return 0;
err_del_qos:
rq_qos_del(q, rqos);
err_free_ioc:
free_percpu(ioc->pcpu_stat);
kfree(ioc);
return ret;
} }
static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
...@@ -3166,6 +3173,10 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ...@@ -3166,6 +3173,10 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
disk = blkcg_conf_get_disk(&input); disk = blkcg_conf_get_disk(&input);
if (IS_ERR(disk)) if (IS_ERR(disk))
return PTR_ERR(disk); return PTR_ERR(disk);
if (!queue_is_mq(disk->queue)) {
ret = -EOPNOTSUPP;
goto err;
}
ioc = q_to_ioc(disk->queue); ioc = q_to_ioc(disk->queue);
if (!ioc) { if (!ioc) {
...@@ -3333,6 +3344,10 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ...@@ -3333,6 +3344,10 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
disk = blkcg_conf_get_disk(&input); disk = blkcg_conf_get_disk(&input);
if (IS_ERR(disk)) if (IS_ERR(disk))
return PTR_ERR(disk); return PTR_ERR(disk);
if (!queue_is_mq(disk->queue)) {
ret = -EOPNOTSUPP;
goto err;
}
ioc = q_to_ioc(disk->queue); ioc = q_to_ioc(disk->queue);
if (!ioc) { if (!ioc) {
......
...@@ -772,19 +772,23 @@ int blk_iolatency_init(struct request_queue *q) ...@@ -772,19 +772,23 @@ int blk_iolatency_init(struct request_queue *q)
rqos->ops = &blkcg_iolatency_ops; rqos->ops = &blkcg_iolatency_ops;
rqos->q = q; rqos->q = q;
rq_qos_add(q, rqos); ret = rq_qos_add(q, rqos);
if (ret)
goto err_free;
ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
if (ret) { if (ret)
rq_qos_del(q, rqos); goto err_qos_del;
kfree(blkiolat);
return ret;
}
timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
return 0; return 0;
err_qos_del:
rq_qos_del(q, rqos);
err_free:
kfree(blkiolat);
return ret;
} }
static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
......
...@@ -98,7 +98,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait) ...@@ -98,7 +98,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
init_waitqueue_head(&rq_wait->wait); init_waitqueue_head(&rq_wait->wait);
} }
static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
{ {
/* /*
* No IO can be in-flight when adding rqos, so freeze queue, which * No IO can be in-flight when adding rqos, so freeze queue, which
...@@ -110,6 +110,8 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) ...@@ -110,6 +110,8 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
blk_mq_freeze_queue(q); blk_mq_freeze_queue(q);
spin_lock_irq(&q->queue_lock); spin_lock_irq(&q->queue_lock);
if (rq_qos_id(q, rqos->id))
goto ebusy;
rqos->next = q->rq_qos; rqos->next = q->rq_qos;
q->rq_qos = rqos; q->rq_qos = rqos;
spin_unlock_irq(&q->queue_lock); spin_unlock_irq(&q->queue_lock);
...@@ -118,6 +120,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) ...@@ -118,6 +120,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
if (rqos->ops->debugfs_attrs) if (rqos->ops->debugfs_attrs)
blk_mq_debugfs_register_rqos(rqos); blk_mq_debugfs_register_rqos(rqos);
return 0;
ebusy:
spin_unlock_irq(&q->queue_lock);
blk_mq_unfreeze_queue(q);
return -EBUSY;
} }
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
......
...@@ -818,6 +818,7 @@ int wbt_init(struct request_queue *q) ...@@ -818,6 +818,7 @@ int wbt_init(struct request_queue *q)
{ {
struct rq_wb *rwb; struct rq_wb *rwb;
int i; int i;
int ret;
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb) if (!rwb)
...@@ -847,8 +848,17 @@ int wbt_init(struct request_queue *q) ...@@ -847,8 +848,17 @@ int wbt_init(struct request_queue *q)
/* /*
* Assign rwb and add the stats callback. * Assign rwb and add the stats callback.
*/ */
rq_qos_add(q, &rwb->rqos); ret = rq_qos_add(q, &rwb->rqos);
if (ret)
goto err_free;
blk_stat_add_callback(q, rwb->cb); blk_stat_add_callback(q, rwb->cb);
return 0; return 0;
err_free:
blk_stat_free_callback(rwb->cb);
kfree(rwb);
return ret;
} }
...@@ -2402,8 +2402,9 @@ EXPORT_SYMBOL(md_integrity_add_rdev); ...@@ -2402,8 +2402,9 @@ EXPORT_SYMBOL(md_integrity_add_rdev);
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{ {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE + 4];
struct kobject *ko; struct kobject *ko;
struct kernfs_node *sysfs_rdev;
int err; int err;
/* prevent duplicates */ /* prevent duplicates */
...@@ -2454,7 +2455,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) ...@@ -2454,7 +2455,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
mdname(mddev), mddev->max_disks); mdname(mddev), mddev->max_disks);
return -EBUSY; return -EBUSY;
} }
bdevname(rdev->bdev,b); memcpy(b, "dev-", 4);
bdevname(rdev->bdev, b + 4);
strreplace(b, '/', '!'); strreplace(b, '/', '!');
rdev->mddev = mddev; rdev->mddev = mddev;
...@@ -2463,7 +2465,15 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) ...@@ -2463,7 +2465,15 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
if (mddev->raid_disks) if (mddev->raid_disks)
mddev_create_serial_pool(mddev, rdev, false); mddev_create_serial_pool(mddev, rdev, false);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) sysfs_rdev = sysfs_get_dirent_safe(mddev->kobj.sd, b);
if (sysfs_rdev) {
sysfs_put(sysfs_rdev);
err = -EBUSY;
goto fail;
}
err = kobject_add(&rdev->kobj, &mddev->kobj, b);
if (err)
goto fail; goto fail;
ko = &part_to_dev(rdev->bdev->bd_part)->kobj; ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
...@@ -2484,7 +2494,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) ...@@ -2484,7 +2494,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return 0; return 0;
fail: fail:
pr_warn("md: failed to register dev-%s for %s\n", pr_warn("md: failed to register %s for %s\n",
b, mdname(mddev)); b, mdname(mddev));
return err; return err;
} }
...@@ -4592,20 +4602,6 @@ null_show(struct mddev *mddev, char *page) ...@@ -4592,20 +4602,6 @@ null_show(struct mddev *mddev, char *page)
return -EINVAL; return -EINVAL;
} }
/* need to ensure rdev_delayed_delete() has completed */
static void flush_rdev_wq(struct mddev *mddev)
{
struct md_rdev *rdev;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (work_pending(&rdev->del_work)) {
flush_workqueue(md_rdev_misc_wq);
break;
}
rcu_read_unlock();
}
static ssize_t static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len) new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{ {
...@@ -4633,7 +4629,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -4633,7 +4629,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev)) minor != MINOR(dev))
return -EOVERFLOW; return -EOVERFLOW;
flush_rdev_wq(mddev); flush_workqueue(md_rdev_misc_wq);
err = mddev_lock(mddev); err = mddev_lock(mddev);
if (err) if (err)
return err; return err;
...@@ -5743,6 +5739,7 @@ static int md_alloc(dev_t dev, char *name) ...@@ -5743,6 +5739,7 @@ static int md_alloc(dev_t dev, char *name)
* completely removed (mddev_delayed_delete). * completely removed (mddev_delayed_delete).
*/ */
flush_workqueue(md_misc_wq); flush_workqueue(md_misc_wq);
flush_workqueue(md_rdev_misc_wq);
mutex_lock(&disks_mutex); mutex_lock(&disks_mutex);
error = -EEXIST; error = -EEXIST;
...@@ -7646,7 +7643,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, ...@@ -7646,7 +7643,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
} }
if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
flush_rdev_wq(mddev); flush_workqueue(md_rdev_misc_wq);
if (cmd == HOT_REMOVE_DISK) if (cmd == HOT_REMOVE_DISK)
/* need to ensure recovery thread has run */ /* need to ensure recovery thread has run */
...@@ -9581,12 +9578,13 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, ...@@ -9581,12 +9578,13 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
{ {
struct mddev *mddev = rdev->mddev; struct mddev *mddev = rdev->mddev;
int rv; int rv;
if (is_new) if (is_new)
s += rdev->new_data_offset; s += rdev->new_data_offset;
else else
s += rdev->data_offset; s += rdev->data_offset;
rv = badblocks_set(&rdev->badblocks, s, sectors, 0); rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
if (rv == 0) { if (rdev->badblocks.changed) {
/* Make sure they get written out promptly */ /* Make sure they get written out promptly */
if (test_bit(ExternalBbl, &rdev->flags)) if (test_bit(ExternalBbl, &rdev->flags))
sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
...@@ -9594,9 +9592,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, ...@@ -9594,9 +9592,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
set_mask_bits(&mddev->sb_flags, 0, set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
md_wakeup_thread(rdev->mddev->thread); md_wakeup_thread(rdev->mddev->thread);
return 1; }
} else return !rv;
return 0;
} }
EXPORT_SYMBOL_GPL(rdev_set_badblocks); EXPORT_SYMBOL_GPL(rdev_set_badblocks);
......
...@@ -3158,6 +3158,7 @@ static int raid1_run(struct mddev *mddev) ...@@ -3158,6 +3158,7 @@ static int raid1_run(struct mddev *mddev)
* RAID1 needs at least one disk in active * RAID1 needs at least one disk in active
*/ */
if (conf->raid_disks - mddev->degraded < 1) { if (conf->raid_disks - mddev->degraded < 1) {
md_unregister_thread(&conf->thread);
ret = -EINVAL; ret = -EINVAL;
goto abort; goto abort;
} }
......
...@@ -441,47 +441,50 @@ static void raid10_end_write_request(struct bio *bio) ...@@ -441,47 +441,50 @@ static void raid10_end_write_request(struct bio *bio)
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl) if (repl) {
rdev = conf->mirrors[dev].replacement; rdev = r10_bio->devs[slot].replacement;
if (!rdev) { if (rdev == conf->mirrors[dev].replacement) {
smp_rmb(); if (bio->bi_status && !discard_error) {
repl = 0; /*
rdev = conf->mirrors[dev].rdev; * Never record new bad blocks to replacement,
* just fail it.
*/
md_error(rdev->mddev, rdev);
goto out;
}
}
} else {
rdev = r10_bio->devs[slot].rdev;
} }
/* /*
* this branch is our 'one mirror IO has finished' event handler: * this branch is our 'one mirror IO has finished' event handler:
*/ */
if (bio->bi_status && !discard_error) { if (bio->bi_status && !discard_error) {
if (repl) set_bit(WriteErrorSeen, &rdev->flags);
/* Never record new bad blocks to replacement, if (!test_and_set_bit(WantReplacement, &rdev->flags))
* just fail it. set_bit(MD_RECOVERY_NEEDED,
*/ &rdev->mddev->recovery);
md_error(rdev->mddev, rdev);
else {
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
dec_rdev = 0; dec_rdev = 0;
if (test_bit(FailFast, &rdev->flags) && if (test_bit(FailFast, &rdev->flags) &&
(bio->bi_opf & MD_FAILFAST)) { (bio->bi_opf & MD_FAILFAST))
md_error(rdev->mddev, rdev); md_error(rdev->mddev, rdev);
}
/* /*
* When the device is faulty, it is not necessary to * When the device is faulty, it is not necessary to
* handle write error. * handle write error.
*/ */
if (!test_bit(Faulty, &rdev->flags)) if (!test_bit(Faulty, &rdev->flags)) {
set_bit(R10BIO_WriteError, &r10_bio->state); set_bit(R10BIO_WriteError, &r10_bio->state);
else { } else {
/* Fail the request */ /* Fail the request */
set_bit(R10BIO_Degraded, &r10_bio->state); set_bit(R10BIO_Degraded, &r10_bio->state);
if (repl)
r10_bio->devs[slot].repl_bio = NULL;
else
r10_bio->devs[slot].bio = NULL; r10_bio->devs[slot].bio = NULL;
to_put = bio; to_put = bio;
dec_rdev = 1; dec_rdev = 1;
}
} }
} else { } else {
/* /*
...@@ -513,16 +516,17 @@ static void raid10_end_write_request(struct bio *bio) ...@@ -513,16 +516,17 @@ static void raid10_end_write_request(struct bio *bio)
r10_bio->devs[slot].addr, r10_bio->devs[slot].addr,
r10_bio->sectors, r10_bio->sectors,
&first_bad, &bad_sectors) && !discard_error) { &first_bad, &bad_sectors) && !discard_error) {
bio_put(bio);
if (repl) if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
else else
r10_bio->devs[slot].bio = IO_MADE_GOOD; r10_bio->devs[slot].bio = IO_MADE_GOOD;
bio_put(bio);
dec_rdev = 0; dec_rdev = 0;
set_bit(R10BIO_MadeGood, &r10_bio->state); set_bit(R10BIO_MadeGood, &r10_bio->state);
} }
} }
out:
/* /*
* *
* Let's see if all mirrored write operations have finished * Let's see if all mirrored write operations have finished
...@@ -753,9 +757,19 @@ static struct md_rdev *read_balance(struct r10conf *conf, ...@@ -753,9 +757,19 @@ static struct md_rdev *read_balance(struct r10conf *conf,
disk = r10_bio->devs[slot].devnum; disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement); rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) || if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) test_bit(WantRemove, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors >
rdev->recovery_offset) {
/*
* Read replacement first to prevent reading both rdev
* and replacement as NULL during replacement replace
* rdev
*/
smp_mb();
rdev = rcu_dereference(conf->mirrors[disk].rdev); rdev = rcu_dereference(conf->mirrors[disk].rdev);
}
if (rdev == NULL || if (rdev == NULL ||
test_bit(WantRemove, &rdev->flags) ||
test_bit(Faulty, &rdev->flags)) test_bit(Faulty, &rdev->flags))
continue; continue;
if (!test_bit(In_sync, &rdev->flags) && if (!test_bit(In_sync, &rdev->flags) &&
...@@ -896,6 +910,7 @@ static void flush_pending_writes(struct r10conf *conf) ...@@ -896,6 +910,7 @@ static void flush_pending_writes(struct r10conf *conf)
else else
submit_bio_noacct(bio); submit_bio_noacct(bio);
bio = next; bio = next;
cond_resched();
} }
blk_finish_plug(&plug); blk_finish_plug(&plug);
} else } else
...@@ -952,36 +967,45 @@ static void lower_barrier(struct r10conf *conf) ...@@ -952,36 +967,45 @@ static void lower_barrier(struct r10conf *conf)
spin_unlock_irqrestore(&conf->resync_lock, flags); spin_unlock_irqrestore(&conf->resync_lock, flags);
wake_up(&conf->wait_barrier); wake_up(&conf->wait_barrier);
} }
static bool stop_waiting_barrier(struct r10conf *conf)
{
struct bio_list *bio_list = current->bio_list;
/* barrier is dropped */
if (!conf->barrier)
return true;
/*
* If there are already pending requests (preventing the barrier from
* rising completely), and the pre-process bio queue isn't empty, then
* don't wait, as we need to empty that queue to get the nr_pending
* count down.
*/
if (atomic_read(&conf->nr_pending) && bio_list &&
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
return true;
/*
* move on if io is issued from raid10d(), nr_pending is not released
* from original io(see handle_read_error()). All raise barrier is
* blocked until this io is done.
*/
if (conf->mddev->thread->tsk == current) {
WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
return true;
}
return false;
}
static void wait_barrier(struct r10conf *conf) static void wait_barrier(struct r10conf *conf)
{ {
spin_lock_irq(&conf->resync_lock); spin_lock_irq(&conf->resync_lock);
if (conf->barrier) { if (conf->barrier) {
struct bio_list *bio_list = current->bio_list;
conf->nr_waiting++; conf->nr_waiting++;
/* Wait for the barrier to drop.
* However if there are already pending
* requests (preventing the barrier from
* rising completely), and the
* pre-process bio queue isn't empty,
* then don't wait, as we need to empty
* that queue to get the nr_pending
* count down.
*/
raid10_log(conf->mddev, "wait barrier"); raid10_log(conf->mddev, "wait barrier");
wait_event_lock_irq(conf->wait_barrier, wait_event_lock_irq(conf->wait_barrier,
!conf->barrier || stop_waiting_barrier(conf),
(atomic_read(&conf->nr_pending) &&
bio_list &&
(!bio_list_empty(&bio_list[0]) ||
!bio_list_empty(&bio_list[1]))) ||
/* move on if recovery thread is
* blocked by us
*/
(conf->mddev->thread->tsk == current &&
test_bit(MD_RECOVERY_RUNNING,
&conf->mddev->recovery) &&
conf->nr_queued > 0),
conf->resync_lock); conf->resync_lock);
conf->nr_waiting--; conf->nr_waiting--;
if (!conf->nr_waiting) if (!conf->nr_waiting)
...@@ -1089,6 +1113,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) ...@@ -1089,6 +1113,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
else else
submit_bio_noacct(bio); submit_bio_noacct(bio);
bio = next; bio = next;
cond_resched();
} }
kfree(plug); kfree(plug);
} }
...@@ -1227,29 +1252,21 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, ...@@ -1227,29 +1252,21 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
int devnum = r10_bio->devs[n_copy].devnum; int devnum = r10_bio->devs[n_copy].devnum;
struct bio *mbio; struct bio *mbio;
if (replacement) {
rdev = conf->mirrors[devnum].replacement;
if (rdev == NULL) {
/* Replacement just got moved to main 'rdev' */
smp_mb();
rdev = conf->mirrors[devnum].rdev;
}
} else
rdev = conf->mirrors[devnum].rdev;
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (replacement) if (replacement) {
r10_bio->devs[n_copy].repl_bio = mbio; r10_bio->devs[n_copy].repl_bio = mbio;
else rdev = r10_bio->devs[n_copy].replacement;
} else {
r10_bio->devs[n_copy].bio = mbio; r10_bio->devs[n_copy].bio = mbio;
rdev = r10_bio->devs[n_copy].rdev;
}
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
choose_data_offset(r10_bio, rdev)); choose_data_offset(r10_bio, rdev));
bio_set_dev(mbio, rdev->bdev); bio_set_dev(mbio, rdev->bdev);
mbio->bi_end_io = raid10_end_write_request; mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua); bio_set_op_attrs(mbio, op, do_sync | do_fua);
if (!replacement && test_bit(FailFast, if (!replacement && test_bit(FailFast, &rdev->flags)
&conf->mirrors[devnum].rdev->flags)
&& enough(conf, devnum)) && enough(conf, devnum))
mbio->bi_opf |= MD_FAILFAST; mbio->bi_opf |= MD_FAILFAST;
mbio->bi_private = r10_bio; mbio->bi_private = r10_bio;
...@@ -1350,9 +1367,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, ...@@ -1350,9 +1367,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
for (i = 0; i < conf->copies; i++) { for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum; int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); struct md_rdev *rrdev, *rdev;
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[d].replacement); rrdev = rcu_dereference(conf->mirrors[d].replacement);
/*
* Read replacement first to Prevent reading both rdev and
* replacement as NULL during replacement replace rdev.
*/
smp_mb();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev == rrdev) if (rdev == rrdev)
rrdev = NULL; rrdev = NULL;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
...@@ -1365,9 +1388,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, ...@@ -1365,9 +1388,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
blocked_rdev = rrdev; blocked_rdev = rrdev;
break; break;
} }
if (rdev && (test_bit(Faulty, &rdev->flags))) if (rdev && (test_bit(Faulty, &rdev->flags) ||
test_bit(WantRemove, &rdev->flags)))
rdev = NULL; rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags))) if (rrdev && (test_bit(Faulty, &rrdev->flags) ||
test_bit(WantRemove, &rrdev->flags)))
rrdev = NULL; rrdev = NULL;
r10_bio->devs[i].bio = NULL; r10_bio->devs[i].bio = NULL;
...@@ -1420,10 +1445,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, ...@@ -1420,10 +1445,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
} }
if (rdev) { if (rdev) {
r10_bio->devs[i].bio = bio; r10_bio->devs[i].bio = bio;
r10_bio->devs[i].rdev = rdev;
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
} }
if (rrdev) { if (rrdev) {
r10_bio->devs[i].repl_bio = bio; r10_bio->devs[i].repl_bio = bio;
r10_bio->devs[i].replacement = rrdev;
atomic_inc(&rrdev->nr_pending); atomic_inc(&rrdev->nr_pending);
} }
} }
...@@ -1432,24 +1459,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, ...@@ -1432,24 +1459,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
if (unlikely(blocked_rdev)) { if (unlikely(blocked_rdev)) {
/* Have to wait for this device to get unblocked, then retry */ /* Have to wait for this device to get unblocked, then retry */
int j; int j;
int d;
for (j = 0; j < i; j++) { for (j = 0; j < i; j++) {
if (r10_bio->devs[j].bio) { if (r10_bio->devs[j].bio)
d = r10_bio->devs[j].devnum; rdev_dec_pending(r10_bio->devs[j].rdev, mddev);
rdev_dec_pending(conf->mirrors[d].rdev, mddev); if (r10_bio->devs[j].repl_bio)
} rdev_dec_pending(r10_bio->devs[j].replacement, mddev);
if (r10_bio->devs[j].repl_bio) {
struct md_rdev *rdev;
d = r10_bio->devs[j].devnum;
rdev = conf->mirrors[d].replacement;
if (!rdev) {
/* Race with remove_disk */
smp_mb();
rdev = conf->mirrors[d].rdev;
}
rdev_dec_pending(rdev, mddev);
}
} }
allow_barrier(conf); allow_barrier(conf);
raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
...@@ -1745,9 +1760,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1745,9 +1760,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
int err = -EEXIST; int err = -EEXIST;
int mirror; int mirror, repl_slot = -1;
int first = 0; int first = 0;
int last = conf->geo.raid_disks - 1; int last = conf->geo.raid_disks - 1;
struct raid10_info *p;
if (mddev->recovery_cp < MaxSector) if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is /* only hot-add to in-sync arrays, as recovery is
...@@ -1770,23 +1786,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1770,23 +1786,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
else else
mirror = first; mirror = first;
for ( ; mirror <= last ; mirror++) { for ( ; mirror <= last ; mirror++) {
struct raid10_info *p = &conf->mirrors[mirror]; p = &conf->mirrors[mirror];
if (p->recovery_disabled == mddev->recovery_disabled) if (p->recovery_disabled == mddev->recovery_disabled)
continue; continue;
if (p->rdev) { if (p->rdev) {
if (!test_bit(WantReplacement, &p->rdev->flags) || if (test_bit(WantReplacement, &p->rdev->flags) &&
p->replacement != NULL) p->replacement == NULL && repl_slot < 0)
continue; repl_slot = mirror;
clear_bit(In_sync, &rdev->flags); continue;
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = mirror;
err = 0;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
break;
} }
if (mddev->gendisk) if (mddev->gendisk)
...@@ -1796,12 +1803,28 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1796,12 +1803,28 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
p->head_position = 0; p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1; p->recovery_disabled = mddev->recovery_disabled - 1;
rdev->raid_disk = mirror; rdev->raid_disk = mirror;
clear_bit(WantRemove, &rdev->flags);
err = 0; err = 0;
if (rdev->saved_raid_disk != mirror) if (rdev->saved_raid_disk != mirror)
conf->fullsync = 1; conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev); rcu_assign_pointer(p->rdev, rdev);
break; break;
} }
if (err && repl_slot >= 0) {
p = &conf->mirrors[repl_slot];
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
clear_bit(WantRemove, &rdev->flags);
rdev->raid_disk = repl_slot;
err = 0;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
}
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
...@@ -1844,16 +1867,22 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1844,16 +1867,22 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
*rdevp = NULL; /*
* Before set p->rdev = NULL, we set WantRemove bit avoiding
* race between rdev remove and issue bio, which can cause
* NULL pointer deference of rdev by conf->mirrors[i].rdev.
*/
set_bit(WantRemove, &rdev->flags);
if (!test_bit(RemoveSynchronized, &rdev->flags)) { if (!test_bit(RemoveSynchronized, &rdev->flags)) {
synchronize_rcu(); synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) { if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */ /* lost the race, try later */
err = -EBUSY; err = -EBUSY;
*rdevp = rdev; md_error(rdev->mddev, rdev);
goto abort; goto abort;
} }
} }
*rdevp = NULL;
if (p->replacement) { if (p->replacement) {
/* We must have just cleared 'rdev' */ /* We must have just cleared 'rdev' */
p->rdev = p->replacement; p->rdev = p->replacement;
...@@ -2598,9 +2627,13 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -2598,9 +2627,13 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
md_error(mddev, rdev); md_error(mddev, rdev);
rdev_dec_pending(rdev, mddev); rdev_dec_pending(rdev, mddev);
allow_barrier(conf);
r10_bio->state = 0; r10_bio->state = 0;
raid10_read_request(mddev, r10_bio->master_bio, r10_bio, true); raid10_read_request(mddev, r10_bio->master_bio, r10_bio, true);
/*
* allow_barrier after re-submit to ensure no sync io
* can be issued while regular io pending.
*/
allow_barrier(conf);
} }
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
...@@ -2656,9 +2689,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2656,9 +2689,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
} else { } else {
bool fail = false; bool fail = false;
for (m = 0; m < conf->copies; m++) { for (m = 0; m < conf->copies; m++) {
int dev = r10_bio->devs[m].devnum;
struct bio *bio = r10_bio->devs[m].bio; struct bio *bio = r10_bio->devs[m].bio;
rdev = conf->mirrors[dev].rdev; rdev = r10_bio->devs[m].rdev;
if (bio == IO_MADE_GOOD) { if (bio == IO_MADE_GOOD) {
rdev_clear_badblocks( rdev_clear_badblocks(
rdev, rdev,
...@@ -2675,7 +2707,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2675,7 +2707,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} }
bio = r10_bio->devs[m].repl_bio; bio = r10_bio->devs[m].repl_bio;
rdev = conf->mirrors[dev].replacement; rdev = r10_bio->devs[m].replacement;
if (rdev && bio == IO_MADE_GOOD) { if (rdev && bio == IO_MADE_GOOD) {
rdev_clear_badblocks( rdev_clear_badblocks(
rdev, rdev,
...@@ -2908,10 +2940,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2908,10 +2940,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
int chunks_skipped = 0; int chunks_skipped = 0;
sector_t chunk_mask = conf->geo.chunk_mask; sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0; int page_idx = 0;
int error_disk = -1;
if (!mempool_initialized(&conf->r10buf_pool))
if (init_resync(conf))
return 0;
/* /*
* Allow skipping a full rebuild for incremental assembly * Allow skipping a full rebuild for incremental assembly
...@@ -2928,6 +2957,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2928,6 +2957,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return mddev->dev_sectors - sector_nr; return mddev->dev_sectors - sector_nr;
} }
if (!mempool_initialized(&conf->r10buf_pool))
if (init_resync(conf))
return 0;
skipped: skipped:
max_sector = mddev->dev_sectors; max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
...@@ -2991,7 +3024,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2991,7 +3024,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return reshape_request(mddev, sector_nr, skipped); return reshape_request(mddev, sector_nr, skipped);
if (chunks_skipped >= conf->geo.raid_disks) { if (chunks_skipped >= conf->geo.raid_disks) {
/* if there has been nothing to do on any drive, pr_err("md/raid10:%s: %s fail\n", mdname(mddev),
test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery");
if (error_disk >= 0 && !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/*
* recovery fail, set mirrors.recovory_disabled,
* device shouldn't be added to there.
*/
conf->mirrors[error_disk].recovery_disabled = mddev->recovery_disabled;
return 0;
}
/*
* if there has been nothing to do on any drive,
* then there is nothing to do at all.. * then there is nothing to do at all..
*/ */
*skipped = 1; *skipped = 1;
...@@ -3058,6 +3102,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -3058,6 +3102,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mreplace != NULL && if (mreplace != NULL &&
!test_bit(Faulty, &mreplace->flags)) !test_bit(Faulty, &mreplace->flags))
need_replace = 1; need_replace = 1;
else
mreplace = NULL;
if (!need_recover && !need_replace) { if (!need_recover && !need_replace) {
rcu_read_unlock(); rcu_read_unlock();
...@@ -3075,8 +3121,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -3075,8 +3121,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
rcu_read_unlock(); rcu_read_unlock();
continue; continue;
} }
if (mreplace && test_bit(Faulty, &mreplace->flags))
mreplace = NULL;
/* Unless we are doing a full sync, or a replacement /* Unless we are doing a full sync, or a replacement
* we only need to recover the block if it is set in * we only need to recover the block if it is set in
* the bitmap * the bitmap
...@@ -3248,6 +3292,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -3248,6 +3292,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
mdname(mddev)); mdname(mddev));
mirror->recovery_disabled mirror->recovery_disabled
= mddev->recovery_disabled; = mddev->recovery_disabled;
} else {
error_disk = i;
} }
put_buf(r10_bio); put_buf(r10_bio);
if (rb2) if (rb2)
......
...@@ -146,12 +146,12 @@ struct r10bio { ...@@ -146,12 +146,12 @@ struct r10bio {
*/ */
struct r10dev { struct r10dev {
struct bio *bio; struct bio *bio;
union { /* Currently just used for normal reads and writes */
struct bio *repl_bio; /* used for resync and struct md_rdev *rdev;
* writes */ /* used for resync and writes */
struct md_rdev *rdev; /* used for reads struct bio *repl_bio;
* (read_slot >= 0) */ /* Currently just used for normal writes */
}; struct md_rdev *replacement;
sector_t addr; sector_t addr;
int devnum; int devnum;
} devs[]; } devs[];
......