提交 c230e7e5 编写于 作者: N NeilBrown 提交者: Shaohua Li

md/raid1: simplify the splitting of requests.

raid1 currently splits requests in two different ways for
two different reasons.

First, bio_split() is used to ensure the bio fits within a
resync accounting region.
Second, multiple r1bios are allocated for each bio to handle
the possibility of known bad blocks on some devices.

This can be simplified to just use bio_split() once, and not
use multiple r1bios.
We delay the split until we know a maximum bio size that can
be handled with a single r1bio, and then split the bio and
queue the remainder for later handling.

This avoids all loops inside raid1.c request handling.  Just
a single read, or a single set of writes, is submitted to
lower-level devices for each bio that comes from
generic_make_request().

When the bio needs to be split, generic_make_request() will
do the necessary looping and call md_make_request() multiple
times.

raid1_make_request() no longer queues requests for raid1d to handle,
so we can remove that branch from the 'if'.

This patch also creates a new private bio_set
(conf->bio_split) for splitting bios.  Using fs_bio_set
is wrong, as it is meant to be used by filesystems, not
block devices.  Using it inside md can lead to deadlocks
under high memory pressure.

Delete unused variable in raid1_write_request() (Shaohua)
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
上级 ae1713e2
...@@ -1202,7 +1202,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) ...@@ -1202,7 +1202,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
return r1_bio; return r1_bio;
} }
static void raid1_read_request(struct mddev *mddev, struct bio *bio) static void raid1_read_request(struct mddev *mddev, struct bio *bio,
int max_read_sectors)
{ {
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
struct raid1_info *mirror; struct raid1_info *mirror;
...@@ -1211,7 +1212,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio) ...@@ -1211,7 +1212,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
struct bitmap *bitmap = mddev->bitmap; struct bitmap *bitmap = mddev->bitmap;
const int op = bio_op(bio); const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
int sectors_handled;
int max_sectors; int max_sectors;
int rdisk; int rdisk;
...@@ -1222,12 +1222,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio) ...@@ -1222,12 +1222,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
wait_read_barrier(conf, bio->bi_iter.bi_sector); wait_read_barrier(conf, bio->bi_iter.bi_sector);
r1_bio = alloc_r1bio(mddev, bio, 0); r1_bio = alloc_r1bio(mddev, bio, 0);
r1_bio->sectors = max_read_sectors;
/* /*
* make_request() can abort the operation when read-ahead is being * make_request() can abort the operation when read-ahead is being
* used and no empty request is available. * used and no empty request is available.
*/ */
read_again:
rdisk = read_balance(conf, r1_bio, &max_sectors); rdisk = read_balance(conf, r1_bio, &max_sectors);
if (rdisk < 0) { if (rdisk < 0) {
...@@ -1247,11 +1247,20 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio) ...@@ -1247,11 +1247,20 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
wait_event(bitmap->behind_wait, wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0); atomic_read(&bitmap->behind_writes) == 0);
} }
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
GFP_NOIO, conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
}
r1_bio->read_disk = rdisk; r1_bio->read_disk = rdisk;
read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
max_sectors);
r1_bio->bios[rdisk] = read_bio; r1_bio->bios[rdisk] = read_bio;
...@@ -1270,30 +1279,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio) ...@@ -1270,30 +1279,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
read_bio, disk_devt(mddev->gendisk), read_bio, disk_devt(mddev->gendisk),
r1_bio->sector); r1_bio->sector);
if (max_sectors < r1_bio->sectors) { generic_make_request(read_bio);
/*
* could not read all from this device, so we will need another
* r1_bio.
*/
sectors_handled = (r1_bio->sector + max_sectors
- bio->bi_iter.bi_sector);
r1_bio->sectors = max_sectors;
bio_inc_remaining(bio);
/*
* Cannot call generic_make_request directly as that will be
* queued in __make_request and subsequent mempool_alloc might
* block waiting for it. So hand bio over to raid1d.
*/
reschedule_retry(r1_bio);
r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
goto read_again;
} else
generic_make_request(read_bio);
} }
static void raid1_write_request(struct mddev *mddev, struct bio *bio) static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int max_write_sectors)
{ {
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
struct r1bio *r1_bio; struct r1bio *r1_bio;
...@@ -1304,9 +1294,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) ...@@ -1304,9 +1294,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
struct blk_plug_cb *cb; struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL; struct raid1_plug_cb *plug = NULL;
int first_clone; int first_clone;
int sectors_handled;
int max_sectors; int max_sectors;
sector_t offset;
/* /*
* Register the new request and wait if the reconstruction * Register the new request and wait if the reconstruction
...@@ -1345,6 +1333,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) ...@@ -1345,6 +1333,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
wait_barrier(conf, bio->bi_iter.bi_sector); wait_barrier(conf, bio->bi_iter.bi_sector);
r1_bio = alloc_r1bio(mddev, bio, 0); r1_bio = alloc_r1bio(mddev, bio, 0);
r1_bio->sectors = max_write_sectors;
if (conf->pending_count >= max_queued_requests) { if (conf->pending_count >= max_queued_requests) {
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
...@@ -1443,17 +1432,21 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) ...@@ -1443,17 +1432,21 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
goto retry_write; goto retry_write;
} }
if (max_sectors < r1_bio->sectors) if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
GFP_NOIO, conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors; r1_bio->sectors = max_sectors;
}
sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0); atomic_set(&r1_bio->behind_remaining, 0);
first_clone = 1; first_clone = 1;
offset = r1_bio->sector - bio->bi_iter.bi_sector;
for (i = 0; i < disks; i++) { for (i = 0; i < disks; i++) {
struct bio *mbio = NULL; struct bio *mbio = NULL;
if (!r1_bio->bios[i]) if (!r1_bio->bios[i])
...@@ -1470,7 +1463,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) ...@@ -1470,7 +1463,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
< mddev->bitmap_info.max_write_behind) && < mddev->bitmap_info.max_write_behind) &&
!waitqueue_active(&bitmap->behind_wait)) { !waitqueue_active(&bitmap->behind_wait)) {
mbio = alloc_behind_master_bio(r1_bio, bio, mbio = alloc_behind_master_bio(r1_bio, bio,
offset << 9, 0,
max_sectors << 9); max_sectors << 9);
} }
...@@ -1486,10 +1479,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) ...@@ -1486,10 +1479,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
mbio = bio_clone_fast(r1_bio->behind_master_bio, mbio = bio_clone_fast(r1_bio->behind_master_bio,
GFP_NOIO, GFP_NOIO,
mddev->bio_set); mddev->bio_set);
else { else
mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
bio_trim(mbio, offset, max_sectors);
}
} }
if (r1_bio->behind_master_bio) { if (r1_bio->behind_master_bio) {
...@@ -1536,19 +1527,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) ...@@ -1536,19 +1527,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
if (!plug) if (!plug)
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
} }
/* Mustn't call r1_bio_write_done before this next test,
* as it could result in the bio being freed.
*/
if (sectors_handled < bio_sectors(bio)) {
/* We need another r1_bio, which must be counted */
sector_t sect = bio->bi_iter.bi_sector + sectors_handled;
inc_pending(conf, sect);
bio_inc_remaining(bio);
r1_bio_write_done(r1_bio);
r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
goto retry_write;
}
r1_bio_write_done(r1_bio); r1_bio_write_done(r1_bio);
...@@ -1558,7 +1536,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) ...@@ -1558,7 +1536,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
static void raid1_make_request(struct mddev *mddev, struct bio *bio) static void raid1_make_request(struct mddev *mddev, struct bio *bio)
{ {
struct bio *split;
sector_t sectors; sector_t sectors;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
...@@ -1566,43 +1543,20 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio) ...@@ -1566,43 +1543,20 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
return; return;
} }
/* if bio exceeds barrier unit boundary, split it */ /*
do { * There is a limit to the maximum size, but
sectors = align_to_barrier_unit_end( * the read/write handler might find a lower limit
bio->bi_iter.bi_sector, bio_sectors(bio)); * due to bad blocks. To avoid multiple splits,
if (sectors < bio_sectors(bio)) { * we pass the maximum number of sectors down
split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); * and let the lower level perform the split.
bio_chain(split, bio); */
} else { sectors = align_to_barrier_unit_end(
split = bio; bio->bi_iter.bi_sector, bio_sectors(bio));
}
if (bio_data_dir(split) == READ) {
raid1_read_request(mddev, split);
/* if (bio_data_dir(bio) == READ)
* If a bio is splitted, the first part of bio will raid1_read_request(mddev, bio, sectors);
* pass barrier but the bio is queued in else
* current->bio_list (see generic_make_request). If raid1_write_request(mddev, bio, sectors);
* there is a raise_barrier() called here, the second
* part of bio can't pass barrier. But since the first
* part bio isn't dispatched to underlaying disks yet,
* the barrier is never released, hence raise_barrier
* will alays wait. We have a deadlock.
* Note, this only happens in read path. For write
* path, the first part of bio is dispatched in a
* schedule() call (because of blk plug) or offloaded
* to raid10d.
* Quitting from the function immediately can change
* the bio order queued in bio_list and avoid the deadlock.
*/
if (split != bio) {
generic_make_request(bio);
break;
}
} else
raid1_write_request(mddev, split);
} while (split != bio);
} }
static void raid1_status(struct seq_file *seq, struct mddev *mddev) static void raid1_status(struct seq_file *seq, struct mddev *mddev)
...@@ -2647,10 +2601,7 @@ static void raid1d(struct md_thread *thread) ...@@ -2647,10 +2601,7 @@ static void raid1d(struct md_thread *thread)
else if (test_bit(R1BIO_ReadError, &r1_bio->state)) else if (test_bit(R1BIO_ReadError, &r1_bio->state))
handle_read_error(conf, r1_bio); handle_read_error(conf, r1_bio);
else else
/* just a partial read to be scheduled from separate WARN_ON_ONCE(1);
* context
*/
generic_make_request(r1_bio->bios[r1_bio->read_disk]);
cond_resched(); cond_resched();
if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
...@@ -3038,6 +2989,10 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -3038,6 +2989,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->r1bio_pool) if (!conf->r1bio_pool)
goto abort; goto abort;
conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
if (!conf->bio_split)
goto abort;
conf->poolinfo->mddev = mddev; conf->poolinfo->mddev = mddev;
err = -EINVAL; err = -EINVAL;
...@@ -3119,6 +3074,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -3119,6 +3074,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
kfree(conf->nr_waiting); kfree(conf->nr_waiting);
kfree(conf->nr_queued); kfree(conf->nr_queued);
kfree(conf->barrier); kfree(conf->barrier);
if (conf->bio_split)
bioset_free(conf->bio_split);
kfree(conf); kfree(conf);
} }
return ERR_PTR(err); return ERR_PTR(err);
...@@ -3224,6 +3181,8 @@ static void raid1_free(struct mddev *mddev, void *priv) ...@@ -3224,6 +3181,8 @@ static void raid1_free(struct mddev *mddev, void *priv)
kfree(conf->nr_waiting); kfree(conf->nr_waiting);
kfree(conf->nr_queued); kfree(conf->nr_queued);
kfree(conf->barrier); kfree(conf->barrier);
if (conf->bio_split)
bioset_free(conf->bio_split);
kfree(conf); kfree(conf);
} }
......
...@@ -107,6 +107,8 @@ struct r1conf { ...@@ -107,6 +107,8 @@ struct r1conf {
mempool_t *r1bio_pool; mempool_t *r1bio_pool;
mempool_t *r1buf_pool; mempool_t *r1buf_pool;
struct bio_set *bio_split;
/* temporary buffer to synchronous IO when attempting to repair /* temporary buffer to synchronous IO when attempting to repair
* a read error. * a read error.
*/ */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册