Unverified commit 4ac8d141, authored by openeuler-ci-bot, committed by Gitee

!903 backport block bugfix

Merge Pull Request from: @zhangjialin11 
 
This patch series fixes block layer bugs: three patches fix iocost bugs, and the remaining patches fix raid10 and badblocks bugs.
 
 
Link: https://gitee.com/openeuler/kernel/pulls/903

Reviewed-by: Zheng Zengkai <zhengzengkai@huawei.com> 
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> 
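
For readers skimming the diff below: a recurring pattern in the iocost/iolatency/wbt hunks is that rq_qos_add() changes from void to an int return (failing with -EBUSY when a policy with the same id is already registered), and every caller (blk_iocost_init(), blk_iolatency_init(), wbt_init()) now unwinds with goto labels on failure. The following is a minimal userspace C sketch of that caller-side pattern, not the kernel code; struct queue, struct qos, qos_add(), qos_del() and policy_init() are simplified stand-in names invented for illustration.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-ins for request_queue and rq_qos (illustrative only). */
struct qos {
    int id;
    struct qos *next;
};

struct queue {
    struct qos *qos_list;
};

/* Registration can now fail: refuse a second policy with the same id. */
static int qos_add(struct queue *q, struct qos *rqos)
{
    struct qos *cur;

    for (cur = q->qos_list; cur; cur = cur->next)
        if (cur->id == rqos->id)
            return -EBUSY;          /* already registered */

    rqos->next = q->qos_list;
    q->qos_list = rqos;
    return 0;
}

static void qos_del(struct queue *q, struct qos *rqos)
{
    struct qos **pp;

    for (pp = &q->qos_list; *pp; pp = &(*pp)->next) {
        if (*pp == rqos) {
            *pp = rqos->next;
            return;
        }
    }
}

/* Caller mirrors the goto-based unwind style used in the init hunks. */
static int policy_init(struct queue *q, int id, int activate_ok)
{
    struct qos *rqos;
    int ret;

    rqos = calloc(1, sizeof(*rqos));
    if (!rqos)
        return -ENOMEM;
    rqos->id = id;

    ret = qos_add(q, rqos);
    if (ret)
        goto err_free;

    /* Stand-in for the later activation step; on failure undo the add. */
    if (!activate_ok) {
        ret = -ENOMEM;
        goto err_del;
    }
    return 0;

err_del:
    qos_del(q, rqos);
err_free:
    free(rqos);
    return ret;
}

int main(void)
{
    struct queue q = { 0 };

    printf("first add:      %d\n", policy_init(&q, 1, 1)); /* 0 */
    printf("duplicate add:  %d\n", policy_init(&q, 1, 1)); /* -EBUSY */
    printf("activate fails: %d\n", policy_init(&q, 2, 0)); /* -ENOMEM, fully unwound */
    return 0;
}

The real kernel hunks additionally freeze/unfreeze the queue and take queue_lock around the list update; the sketch omits that locking and only illustrates the fallible-registration and unwind structure.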
......@@ -165,7 +165,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
{
u64 *p;
int lo, hi;
int rv = 0;
int rv = 0, changed = 0;
unsigned long flags;
if (bb->shift < 0)
......@@ -230,6 +230,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
s = a + BB_MAX_LEN;
}
sectors = e - s;
changed = 1;
}
}
if (sectors && hi < bb->count) {
......@@ -260,24 +261,24 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
sectors = e - s;
lo = hi;
hi++;
changed = 1;
}
}
if (sectors == 0 && hi < bb->count) {
/* we might be able to combine lo and hi */
/* Note: 's' is at the end of 'lo' */
sector_t a = BB_OFFSET(p[hi]);
int lolen = BB_LEN(p[lo]);
int hilen = BB_LEN(p[hi]);
int newlen = lolen + hilen - (s - a);
sector_t a = BB_OFFSET(p[lo]);
int newlen = max(s, BB_OFFSET(p[hi]) + BB_LEN(p[hi])) - a;
if (s >= a && newlen < BB_MAX_LEN) {
if (s >= BB_OFFSET(p[hi]) && newlen < BB_MAX_LEN) {
/* yes, we can combine them */
int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
p[lo] = BB_MAKE(a, newlen, ack);
memmove(p + hi, p + hi + 1,
(bb->count - hi - 1) * 8);
bb->count--;
changed = 1;
}
}
while (sectors) {
......@@ -300,14 +301,18 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
p[hi] = BB_MAKE(s, this_sectors, acknowledged);
sectors -= this_sectors;
s += this_sectors;
hi++;
changed = 1;
}
}
bb->changed = 1;
if (!acknowledged)
bb->unacked_exist = 1;
else
badblocks_update_acked(bb);
if (changed) {
bb->changed = changed;
if (!acknowledged)
bb->unacked_exist = 1;
else
badblocks_update_acked(bb);
}
write_sequnlock_irqrestore(&bb->lock, flags);
return rv;
......
......@@ -2414,6 +2414,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
u32 hwi, adj_step;
s64 margin;
u64 cost, new_inuse;
unsigned long flags;
current_hweight(iocg, NULL, &hwi);
old_hwi = hwi;
......@@ -2432,11 +2433,11 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
iocg->inuse == iocg->active)
return cost;
spin_lock_irq(&ioc->lock);
spin_lock_irqsave(&ioc->lock, flags);
/* we own inuse only when @iocg is in the normal active state */
if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
spin_unlock_irq(&ioc->lock);
spin_unlock_irqrestore(&ioc->lock, flags);
return cost;
}
......@@ -2457,7 +2458,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
} while (time_after64(vtime + cost, now->vnow) &&
iocg->inuse != iocg->active);
spin_unlock_irq(&ioc->lock);
spin_unlock_irqrestore(&ioc->lock, flags);
TRACE_IOCG_PATH(inuse_adjust, iocg, now,
old_inuse, iocg->inuse, old_hwi, hwi);
......@@ -2873,15 +2874,21 @@ static int blk_iocost_init(struct request_queue *q)
* called before policy activation completion, can't assume that the
* target bio has an iocg associated and need to test for NULL iocg.
*/
rq_qos_add(q, rqos);
ret = rq_qos_add(q, rqos);
if (ret)
goto err_free_ioc;
ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
if (ret) {
rq_qos_del(q, rqos);
free_percpu(ioc->pcpu_stat);
kfree(ioc);
return ret;
}
if (ret)
goto err_del_qos;
return 0;
err_del_qos:
rq_qos_del(q, rqos);
err_free_ioc:
free_percpu(ioc->pcpu_stat);
kfree(ioc);
return ret;
}
static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
......@@ -3166,6 +3173,10 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
disk = blkcg_conf_get_disk(&input);
if (IS_ERR(disk))
return PTR_ERR(disk);
if (!queue_is_mq(disk->queue)) {
ret = -EOPNOTSUPP;
goto err;
}
ioc = q_to_ioc(disk->queue);
if (!ioc) {
......@@ -3333,6 +3344,10 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
disk = blkcg_conf_get_disk(&input);
if (IS_ERR(disk))
return PTR_ERR(disk);
if (!queue_is_mq(disk->queue)) {
ret = -EOPNOTSUPP;
goto err;
}
ioc = q_to_ioc(disk->queue);
if (!ioc) {
......
......@@ -772,19 +772,23 @@ int blk_iolatency_init(struct request_queue *q)
rqos->ops = &blkcg_iolatency_ops;
rqos->q = q;
rq_qos_add(q, rqos);
ret = rq_qos_add(q, rqos);
if (ret)
goto err_free;
ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
if (ret) {
rq_qos_del(q, rqos);
kfree(blkiolat);
return ret;
}
if (ret)
goto err_qos_del;
timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
return 0;
err_qos_del:
rq_qos_del(q, rqos);
err_free:
kfree(blkiolat);
return ret;
}
static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
......
......@@ -98,7 +98,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
init_waitqueue_head(&rq_wait->wait);
}
static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
{
/*
* No IO can be in-flight when adding rqos, so freeze queue, which
......@@ -110,6 +110,8 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
blk_mq_freeze_queue(q);
spin_lock_irq(&q->queue_lock);
if (rq_qos_id(q, rqos->id))
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
spin_unlock_irq(&q->queue_lock);
......@@ -118,6 +120,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
if (rqos->ops->debugfs_attrs)
blk_mq_debugfs_register_rqos(rqos);
return 0;
ebusy:
spin_unlock_irq(&q->queue_lock);
blk_mq_unfreeze_queue(q);
return -EBUSY;
}
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
......
......@@ -818,6 +818,7 @@ int wbt_init(struct request_queue *q)
{
struct rq_wb *rwb;
int i;
int ret;
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
......@@ -847,8 +848,17 @@ int wbt_init(struct request_queue *q)
/*
* Assign rwb and add the stats callback.
*/
rq_qos_add(q, &rwb->rqos);
ret = rq_qos_add(q, &rwb->rqos);
if (ret)
goto err_free;
blk_stat_add_callback(q, rwb->cb);
return 0;
err_free:
blk_stat_free_callback(rwb->cb);
kfree(rwb);
return ret;
}
......@@ -2402,8 +2402,9 @@ EXPORT_SYMBOL(md_integrity_add_rdev);
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
char b[BDEVNAME_SIZE];
char b[BDEVNAME_SIZE + 4];
struct kobject *ko;
struct kernfs_node *sysfs_rdev;
int err;
/* prevent duplicates */
......@@ -2454,7 +2455,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
mdname(mddev), mddev->max_disks);
return -EBUSY;
}
bdevname(rdev->bdev,b);
memcpy(b, "dev-", 4);
bdevname(rdev->bdev, b + 4);
strreplace(b, '/', '!');
rdev->mddev = mddev;
......@@ -2463,7 +2465,15 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
if (mddev->raid_disks)
mddev_create_serial_pool(mddev, rdev, false);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
sysfs_rdev = sysfs_get_dirent_safe(mddev->kobj.sd, b);
if (sysfs_rdev) {
sysfs_put(sysfs_rdev);
err = -EBUSY;
goto fail;
}
err = kobject_add(&rdev->kobj, &mddev->kobj, b);
if (err)
goto fail;
ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
......@@ -2484,7 +2494,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return 0;
fail:
pr_warn("md: failed to register dev-%s for %s\n",
pr_warn("md: failed to register %s for %s\n",
b, mdname(mddev));
return err;
}
......@@ -4592,20 +4602,6 @@ null_show(struct mddev *mddev, char *page)
return -EINVAL;
}
/* need to ensure rdev_delayed_delete() has completed */
static void flush_rdev_wq(struct mddev *mddev)
{
struct md_rdev *rdev;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (work_pending(&rdev->del_work)) {
flush_workqueue(md_rdev_misc_wq);
break;
}
rcu_read_unlock();
}
static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
......@@ -4633,7 +4629,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
flush_rdev_wq(mddev);
flush_workqueue(md_rdev_misc_wq);
err = mddev_lock(mddev);
if (err)
return err;
......@@ -5743,6 +5739,7 @@ static int md_alloc(dev_t dev, char *name)
* completely removed (mddev_delayed_delete).
*/
flush_workqueue(md_misc_wq);
flush_workqueue(md_rdev_misc_wq);
mutex_lock(&disks_mutex);
error = -EEXIST;
......@@ -7646,7 +7643,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
}
if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
flush_rdev_wq(mddev);
flush_workqueue(md_rdev_misc_wq);
if (cmd == HOT_REMOVE_DISK)
/* need to ensure recovery thread has run */
......@@ -9581,12 +9578,13 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
{
struct mddev *mddev = rdev->mddev;
int rv;
if (is_new)
s += rdev->new_data_offset;
else
s += rdev->data_offset;
rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
if (rv == 0) {
if (rdev->badblocks.changed) {
/* Make sure they get written out promptly */
if (test_bit(ExternalBbl, &rdev->flags))
sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
......@@ -9594,9 +9592,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
md_wakeup_thread(rdev->mddev->thread);
return 1;
} else
return 0;
}
return !rv;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
......
......@@ -3158,6 +3158,7 @@ static int raid1_run(struct mddev *mddev)
* RAID1 needs at least one disk in active
*/
if (conf->raid_disks - mddev->degraded < 1) {
md_unregister_thread(&conf->thread);
ret = -EINVAL;
goto abort;
}
......
......@@ -441,47 +441,50 @@ static void raid10_end_write_request(struct bio *bio)
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[dev].replacement;
if (!rdev) {
smp_rmb();
repl = 0;
rdev = conf->mirrors[dev].rdev;
if (repl) {
rdev = r10_bio->devs[slot].replacement;
if (rdev == conf->mirrors[dev].replacement) {
if (bio->bi_status && !discard_error) {
/*
* Never record new bad blocks to replacement,
* just fail it.
*/
md_error(rdev->mddev, rdev);
goto out;
}
}
} else {
rdev = r10_bio->devs[slot].rdev;
}
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
if (bio->bi_status && !discard_error) {
if (repl)
/* Never record new bad blocks to replacement,
* just fail it.
*/
md_error(rdev->mddev, rdev);
else {
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
dec_rdev = 0;
if (test_bit(FailFast, &rdev->flags) &&
(bio->bi_opf & MD_FAILFAST)) {
md_error(rdev->mddev, rdev);
}
dec_rdev = 0;
if (test_bit(FailFast, &rdev->flags) &&
(bio->bi_opf & MD_FAILFAST))
md_error(rdev->mddev, rdev);
/*
* When the device is faulty, it is not necessary to
* handle write error.
*/
if (!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_WriteError, &r10_bio->state);
else {
/* Fail the request */
set_bit(R10BIO_Degraded, &r10_bio->state);
/*
* When the device is faulty, it is not necessary to
* handle write error.
*/
if (!test_bit(Faulty, &rdev->flags)) {
set_bit(R10BIO_WriteError, &r10_bio->state);
} else {
/* Fail the request */
set_bit(R10BIO_Degraded, &r10_bio->state);
if (repl)
r10_bio->devs[slot].repl_bio = NULL;
else
r10_bio->devs[slot].bio = NULL;
to_put = bio;
dec_rdev = 1;
}
to_put = bio;
dec_rdev = 1;
}
} else {
/*
......@@ -513,16 +516,17 @@ static void raid10_end_write_request(struct bio *bio)
r10_bio->devs[slot].addr,
r10_bio->sectors,
&first_bad, &bad_sectors) && !discard_error) {
bio_put(bio);
if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
else
r10_bio->devs[slot].bio = IO_MADE_GOOD;
bio_put(bio);
dec_rdev = 0;
set_bit(R10BIO_MadeGood, &r10_bio->state);
}
}
out:
/*
*
* Let's see if all mirrored write operations have finished
......@@ -753,9 +757,19 @@ static struct md_rdev *read_balance(struct r10conf *conf,
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
test_bit(WantRemove, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors >
rdev->recovery_offset) {
/*
* Read replacement first to prevent reading both rdev
* and replacement as NULL during replacement replace
* rdev
*/
smp_mb();
rdev = rcu_dereference(conf->mirrors[disk].rdev);
}
if (rdev == NULL ||
test_bit(WantRemove, &rdev->flags) ||
test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
......@@ -896,6 +910,7 @@ static void flush_pending_writes(struct r10conf *conf)
else
submit_bio_noacct(bio);
bio = next;
cond_resched();
}
blk_finish_plug(&plug);
} else
......@@ -952,36 +967,45 @@ static void lower_barrier(struct r10conf *conf)
spin_unlock_irqrestore(&conf->resync_lock, flags);
wake_up(&conf->wait_barrier);
}
static bool stop_waiting_barrier(struct r10conf *conf)
{
struct bio_list *bio_list = current->bio_list;
/* barrier is dropped */
if (!conf->barrier)
return true;
/*
* If there are already pending requests (preventing the barrier from
* rising completely), and the pre-process bio queue isn't empty, then
* don't wait, as we need to empty that queue to get the nr_pending
* count down.
*/
if (atomic_read(&conf->nr_pending) && bio_list &&
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
return true;
/*
* move on if io is issued from raid10d(), nr_pending is not released
* from original io(see handle_read_error()). All raise barrier is
* blocked until this io is done.
*/
if (conf->mddev->thread->tsk == current) {
WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
return true;
}
return false;
}
static void wait_barrier(struct r10conf *conf)
{
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
struct bio_list *bio_list = current->bio_list;
conf->nr_waiting++;
/* Wait for the barrier to drop.
* However if there are already pending
* requests (preventing the barrier from
* rising completely), and the
* pre-process bio queue isn't empty,
* then don't wait, as we need to empty
* that queue to get the nr_pending
* count down.
*/
raid10_log(conf->mddev, "wait barrier");
wait_event_lock_irq(conf->wait_barrier,
!conf->barrier ||
(atomic_read(&conf->nr_pending) &&
bio_list &&
(!bio_list_empty(&bio_list[0]) ||
!bio_list_empty(&bio_list[1]))) ||
/* move on if recovery thread is
* blocked by us
*/
(conf->mddev->thread->tsk == current &&
test_bit(MD_RECOVERY_RUNNING,
&conf->mddev->recovery) &&
conf->nr_queued > 0),
stop_waiting_barrier(conf),
conf->resync_lock);
conf->nr_waiting--;
if (!conf->nr_waiting)
......@@ -1089,6 +1113,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
else
submit_bio_noacct(bio);
bio = next;
cond_resched();
}
kfree(plug);
}
......@@ -1227,29 +1252,21 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
int devnum = r10_bio->devs[n_copy].devnum;
struct bio *mbio;
if (replacement) {
rdev = conf->mirrors[devnum].replacement;
if (rdev == NULL) {
/* Replacement just got moved to main 'rdev' */
smp_mb();
rdev = conf->mirrors[devnum].rdev;
}
} else
rdev = conf->mirrors[devnum].rdev;
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (replacement)
if (replacement) {
r10_bio->devs[n_copy].repl_bio = mbio;
else
rdev = r10_bio->devs[n_copy].replacement;
} else {
r10_bio->devs[n_copy].bio = mbio;
rdev = r10_bio->devs[n_copy].rdev;
}
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
choose_data_offset(r10_bio, rdev));
bio_set_dev(mbio, rdev->bdev);
mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua);
if (!replacement && test_bit(FailFast,
&conf->mirrors[devnum].rdev->flags)
if (!replacement && test_bit(FailFast, &rdev->flags)
&& enough(conf, devnum))
mbio->bi_opf |= MD_FAILFAST;
mbio->bi_private = r10_bio;
......@@ -1350,9 +1367,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[d].replacement);
struct md_rdev *rrdev, *rdev;
rrdev = rcu_dereference(conf->mirrors[d].replacement);
/*
* Read replacement first to Prevent reading both rdev and
* replacement as NULL during replacement replace rdev.
*/
smp_mb();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev == rrdev)
rrdev = NULL;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
......@@ -1365,9 +1388,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
blocked_rdev = rrdev;
break;
}
if (rdev && (test_bit(Faulty, &rdev->flags)))
if (rdev && (test_bit(Faulty, &rdev->flags) ||
test_bit(WantRemove, &rdev->flags)))
rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
if (rrdev && (test_bit(Faulty, &rrdev->flags) ||
test_bit(WantRemove, &rrdev->flags)))
rrdev = NULL;
r10_bio->devs[i].bio = NULL;
......@@ -1420,10 +1445,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
}
if (rdev) {
r10_bio->devs[i].bio = bio;
r10_bio->devs[i].rdev = rdev;
atomic_inc(&rdev->nr_pending);
}
if (rrdev) {
r10_bio->devs[i].repl_bio = bio;
r10_bio->devs[i].replacement = rrdev;
atomic_inc(&rrdev->nr_pending);
}
}
......@@ -1432,24 +1459,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
if (unlikely(blocked_rdev)) {
/* Have to wait for this device to get unblocked, then retry */
int j;
int d;
for (j = 0; j < i; j++) {
if (r10_bio->devs[j].bio) {
d = r10_bio->devs[j].devnum;
rdev_dec_pending(conf->mirrors[d].rdev, mddev);
}
if (r10_bio->devs[j].repl_bio) {
struct md_rdev *rdev;
d = r10_bio->devs[j].devnum;
rdev = conf->mirrors[d].replacement;
if (!rdev) {
/* Race with remove_disk */
smp_mb();
rdev = conf->mirrors[d].rdev;
}
rdev_dec_pending(rdev, mddev);
}
if (r10_bio->devs[j].bio)
rdev_dec_pending(r10_bio->devs[j].rdev, mddev);
if (r10_bio->devs[j].repl_bio)
rdev_dec_pending(r10_bio->devs[j].replacement, mddev);
}
allow_barrier(conf);
raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
......@@ -1745,9 +1760,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r10conf *conf = mddev->private;
int err = -EEXIST;
int mirror;
int mirror, repl_slot = -1;
int first = 0;
int last = conf->geo.raid_disks - 1;
struct raid10_info *p;
if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
......@@ -1770,23 +1786,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
else
mirror = first;
for ( ; mirror <= last ; mirror++) {
struct raid10_info *p = &conf->mirrors[mirror];
p = &conf->mirrors[mirror];
if (p->recovery_disabled == mddev->recovery_disabled)
continue;
if (p->rdev) {
if (!test_bit(WantReplacement, &p->rdev->flags) ||
p->replacement != NULL)
continue;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = mirror;
err = 0;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
break;
if (test_bit(WantReplacement, &p->rdev->flags) &&
p->replacement == NULL && repl_slot < 0)
repl_slot = mirror;
continue;
}
if (mddev->gendisk)
......@@ -1796,12 +1803,28 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1;
rdev->raid_disk = mirror;
clear_bit(WantRemove, &rdev->flags);
err = 0;
if (rdev->saved_raid_disk != mirror)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
break;
}
if (err && repl_slot >= 0) {
p = &conf->mirrors[repl_slot];
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
clear_bit(WantRemove, &rdev->flags);
rdev->raid_disk = repl_slot;
err = 0;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
}
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
......@@ -1844,16 +1867,22 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY;
goto abort;
}
*rdevp = NULL;
/*
* Before set p->rdev = NULL, we set WantRemove bit avoiding
* race between rdev remove and issue bio, which can cause
* NULL pointer deference of rdev by conf->mirrors[i].rdev.
*/
set_bit(WantRemove, &rdev->flags);
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */
err = -EBUSY;
*rdevp = rdev;
md_error(rdev->mddev, rdev);
goto abort;
}
}
*rdevp = NULL;
if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
......@@ -2598,9 +2627,13 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
md_error(mddev, rdev);
rdev_dec_pending(rdev, mddev);
allow_barrier(conf);
r10_bio->state = 0;
raid10_read_request(mddev, r10_bio->master_bio, r10_bio, true);
/*
* allow_barrier after re-submit to ensure no sync io
* can be issued while regular io pending.
*/
allow_barrier(conf);
}
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
......@@ -2656,9 +2689,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
} else {
bool fail = false;
for (m = 0; m < conf->copies; m++) {
int dev = r10_bio->devs[m].devnum;
struct bio *bio = r10_bio->devs[m].bio;
rdev = conf->mirrors[dev].rdev;
rdev = r10_bio->devs[m].rdev;
if (bio == IO_MADE_GOOD) {
rdev_clear_badblocks(
rdev,
......@@ -2675,7 +2707,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_dec_pending(rdev, conf->mddev);
}
bio = r10_bio->devs[m].repl_bio;
rdev = conf->mirrors[dev].replacement;
rdev = r10_bio->devs[m].replacement;
if (rdev && bio == IO_MADE_GOOD) {
rdev_clear_badblocks(
rdev,
......@@ -2908,10 +2940,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
int chunks_skipped = 0;
sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0;
if (!mempool_initialized(&conf->r10buf_pool))
if (init_resync(conf))
return 0;
int error_disk = -1;
/*
* Allow skipping a full rebuild for incremental assembly
......@@ -2928,6 +2957,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return mddev->dev_sectors - sector_nr;
}
if (!mempool_initialized(&conf->r10buf_pool))
if (init_resync(conf))
return 0;
skipped:
max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
......@@ -2991,7 +3024,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return reshape_request(mddev, sector_nr, skipped);
if (chunks_skipped >= conf->geo.raid_disks) {
/* if there has been nothing to do on any drive,
pr_err("md/raid10:%s: %s fail\n", mdname(mddev),
test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery");
if (error_disk >= 0 && !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/*
* recovery fail, set mirrors.recovory_disabled,
* device shouldn't be added to there.
*/
conf->mirrors[error_disk].recovery_disabled = mddev->recovery_disabled;
return 0;
}
/*
* if there has been nothing to do on any drive,
* then there is nothing to do at all..
*/
*skipped = 1;
......@@ -3058,6 +3102,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mreplace != NULL &&
!test_bit(Faulty, &mreplace->flags))
need_replace = 1;
else
mreplace = NULL;
if (!need_recover && !need_replace) {
rcu_read_unlock();
......@@ -3075,8 +3121,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
rcu_read_unlock();
continue;
}
if (mreplace && test_bit(Faulty, &mreplace->flags))
mreplace = NULL;
/* Unless we are doing a full sync, or a replacement
* we only need to recover the block if it is set in
* the bitmap
......@@ -3248,6 +3292,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
mdname(mddev));
mirror->recovery_disabled
= mddev->recovery_disabled;
} else {
error_disk = i;
}
put_buf(r10_bio);
if (rb2)
......
......@@ -146,12 +146,12 @@ struct r10bio {
*/
struct r10dev {
struct bio *bio;
union {
struct bio *repl_bio; /* used for resync and
* writes */
struct md_rdev *rdev; /* used for reads
* (read_slot >= 0) */
};
/* Currently just used for normal reads and writes */
struct md_rdev *rdev;
/* used for resync and writes */
struct bio *repl_bio;
/* Currently just used for normal writes */
struct md_rdev *replacement;
sector_t addr;
int devnum;
} devs[];
......