Commit fcff06c4 authored by Linus Torvalds

Merge branch 'for-next' of git://neil.brown.name/md

Pull md updates from NeilBrown.

* 'for-next' of git://neil.brown.name/md:
  DM RAID: Add support for MD RAID10
  md/RAID1: Add missing case for attempting to repair known bad blocks.
  md/raid5: For odirect-write performance, do not set STRIPE_PREREAD_ACTIVE.
  md/raid1: don't abort a resync on the first badblock.
  md: remove duplicated test on ->openers when calling do_md_stop()
  raid5: Add R5_ReadNoMerge flag which prevent bio from merging at block layer
  md/raid1: prevent merging too large request
  md/raid1: read balance chooses idlest disk for SSD
  md/raid1: make sequential read detection per disk based
  MD RAID10: Export md_raid10_congested
  MD: Move macros from raid1*.h to raid1*.c
  MD RAID1: rename mirror_info structure
  MD RAID10: rename mirror_info structure
  MD RAID10: Fix compiler warning.
  raid5: add a per-stripe lock
  raid5: remove unnecessary bitmap write optimization
  raid5: lockless access raid5 overrided bi_phys_segments
  raid5: reduce chance release_stripe() taking device_lock
...@@ -27,6 +27,10 @@ The target is named "raid" and it accepts the following parameters:
- rotating parity N (right-to-left) with data restart
raid6_nc RAID6 N continue
- rotating parity N (right-to-left) with data continuation
raid10 Various RAID10 inspired algorithms chosen by additional params
- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
- RAID1E: Integrated Adjacent Stripe Mirroring
- and other similar RAID10 variants
Reference: Chapter 4 of
http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
...@@ -59,6 +63,28 @@ The target is named "raid" and it accepts the following parameters:
logical size of the array. The bitmap records the device
synchronisation state for each region.
[raid10_copies <# copies>]
[raid10_format near]
These two options are used to alter the default layout of
a RAID10 configuration. The number of copies can be
specified, but the default is 2. There are other variations
to how the copies are laid down - the default and only current
option is "near". Near copies are what most people think of
with respect to mirroring. If these options are left
unspecified, or 'raid10_copies 2' and/or 'raid10_format near'
are given, then the layouts for 2, 3 and 4 devices are:
2 drives 3 drives 4 drives
-------- ---------- --------------
A1 A1 A1 A1 A2 A1 A1 A2 A2
A2 A2 A2 A3 A3 A3 A3 A4 A4
A3 A3 A4 A4 A5 A5 A5 A6 A6
A4 A4 A5 A6 A6 A7 A7 A8 A8
.. .. .. .. .. .. .. .. ..
The 2-device layout is equivalent to 2-way RAID1. The 4-device
layout is what a traditional RAID10 would look like. The
3-device layout is what might be called a 'RAID1E - Integrated
Adjacent Stripe Mirroring'.
<#raid_devs>: The number of devices composing the array.
Each device consists of two entries. The first is the device
containing the metadata (if any); the second is the one containing the
......
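For illustration only (this example is not part of the commit; the device pairs, target length and chunk size below are hypothetical), a dm table line using the new raid10 type with both optional arguments spelled out might look like:

0 1960893648 raid raid10 5 64 raid10_copies 2 raid10_format near 4 - 8:17 - 8:33 - 8:49 - 8:65

Here 5 is the raid-parameter count (the 64-sector chunk size plus the two optional raid10 arguments), 4 is the number of raid devices, and '-' means no metadata device for that member.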
...@@ -11,6 +11,7 @@
#include "md.h"
#include "raid1.h"
#include "raid5.h"
#include "raid10.h"
#include "bitmap.h" #include "bitmap.h"
#include <linux/device-mapper.h> #include <linux/device-mapper.h>
...@@ -52,7 +53,10 @@ struct raid_dev { ...@@ -52,7 +53,10 @@ struct raid_dev {
#define DMPF_MAX_RECOVERY_RATE 0x20 #define DMPF_MAX_RECOVERY_RATE 0x20
#define DMPF_MAX_WRITE_BEHIND 0x40 #define DMPF_MAX_WRITE_BEHIND 0x40
#define DMPF_STRIPE_CACHE 0x80 #define DMPF_STRIPE_CACHE 0x80
#define DMPF_REGION_SIZE 0X100 #define DMPF_REGION_SIZE 0x100
#define DMPF_RAID10_COPIES 0x200
#define DMPF_RAID10_FORMAT 0x400
struct raid_set {
struct dm_target *ti;
...@@ -76,6 +80,7 @@ static struct raid_type {
const unsigned algorithm; /* RAID algorithm. */
} raid_types[] = {
{"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
{"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
{"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
{"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
{"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
...@@ -86,6 +91,17 @@ static struct raid_type { ...@@ -86,6 +91,17 @@ static struct raid_type {
{"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
}; };
static unsigned raid10_md_layout_to_copies(int layout)
{
return layout & 0xFF;
}
static int raid10_format_to_md_layout(char *format, unsigned copies)
{
/* 1 "far" copy, and 'copies' "near" copies */
return (1 << 8) | (copies & 0xFF);
}
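A stand-alone sketch of the layout word these two helpers encode (user-space illustration, not part of the patch): per the comment above, the low byte carries the number of "near" copies and the next byte carries one "far" copy, so "near" with 2 copies becomes 0x102.

#include <assert.h>

/* Stand-alone copies of the two helpers above, for illustration only. */
static unsigned layout_to_copies(int layout)  { return layout & 0xFF; }
static int copies_to_layout(unsigned copies)  { return (1 << 8) | (copies & 0xFF); }

int main(void)
{
	int layout = copies_to_layout(2);	/* "near" format, 2 copies */
	assert(layout == 0x102);		/* one "far" copy, two "near" copies */
	assert(layout_to_copies(layout) == 2);
	return 0;
}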
static struct raid_type *get_raid_type(char *name)
{
int i;
...@@ -339,10 +355,16 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
* [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
* [stripe_cache <sectors>] Stripe cache size for higher RAIDs
* [region_size <sectors>] Defines granularity of bitmap
*
* RAID10-only options:
* [raid10_copies <# copies>] Number of copies. (Default: 2)
* [raid10_format <near>] Layout algorithm. (Default: near)
*/
static int parse_raid_params(struct raid_set *rs, char **argv,
unsigned num_raid_params)
{
char *raid10_format = "near";
unsigned raid10_copies = 2;
unsigned i, rebuild_cnt = 0;
unsigned long value, region_size = 0;
sector_t sectors_per_dev = rs->ti->len;
...@@ -416,11 +438,28 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
}
key = argv[i++];
/* Parameters that take a string value are checked here. */
if (!strcasecmp(key, "raid10_format")) {
if (rs->raid_type->level != 10) {
rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
return -EINVAL;
}
if (strcmp("near", argv[i])) {
rs->ti->error = "Invalid 'raid10_format' value given";
return -EINVAL;
}
raid10_format = argv[i];
rs->print_flags |= DMPF_RAID10_FORMAT;
continue;
}
if (strict_strtoul(argv[i], 10, &value) < 0) {
rs->ti->error = "Bad numerical argument given in raid params";
return -EINVAL;
}
/* Parameters that take a numeric value are checked here */
if (!strcasecmp(key, "rebuild")) {
rebuild_cnt++;
...@@ -439,6 +478,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
break;
case 10:
default:
DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
rs->ti->error = "Rebuild not supported for this RAID type";
...@@ -495,7 +535,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
*/
value /= 2;
if ((rs->raid_type->level != 5) &&
(rs->raid_type->level != 6)) {
rs->ti->error = "Inappropriate argument: stripe_cache";
return -EINVAL;
}
...@@ -520,6 +561,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
} else if (!strcasecmp(key, "region_size")) {
rs->print_flags |= DMPF_REGION_SIZE;
region_size = value;
} else if (!strcasecmp(key, "raid10_copies") &&
(rs->raid_type->level == 10)) {
if ((value < 2) || (value > 0xFF)) {
rs->ti->error = "Bad value for 'raid10_copies'";
return -EINVAL;
}
rs->print_flags |= DMPF_RAID10_COPIES;
raid10_copies = value;
} else {
DMERR("Unable to parse RAID parameter: %s", key);
rs->ti->error = "Unable to parse RAID parameters";
...@@ -538,8 +587,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
if (dm_set_target_max_io_len(rs->ti, max_io_len))
return -EINVAL;
if (rs->raid_type->level == 10) {
if (raid10_copies > rs->md.raid_disks) {
rs->ti->error = "Not enough devices to satisfy specification";
return -EINVAL;
}
/* (Len * #mirrors) / #devices */
sectors_per_dev = rs->ti->len * raid10_copies;
sector_div(sectors_per_dev, rs->md.raid_disks);
rs->md.layout = raid10_format_to_md_layout(raid10_format,
raid10_copies);
rs->md.new_layout = rs->md.layout;
} else if ((rs->raid_type->level > 1) &&
sector_div(sectors_per_dev,
(rs->md.raid_disks - rs->raid_type->parity_devs))) {
rs->ti->error = "Target length not divisible by number of data devices"; rs->ti->error = "Target length not divisible by number of data devices";
return -EINVAL; return -EINVAL;
} }
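To make the raid10 branch above concrete (a worked illustration with hypothetical numbers, not taken from the commit): each member must hold (target length * copies) / raid_disks sectors.

#include <stdio.h>

/* Stand-alone sketch of the "(Len * #mirrors) / #devices" computation above;
 * all numbers are hypothetical.
 */
int main(void)
{
	unsigned long long ti_len = 1000000;	/* target length in sectors */
	unsigned raid10_copies = 2;
	unsigned raid_disks = 4;
	unsigned long long sectors_per_dev = ti_len * raid10_copies / raid_disks;

	printf("%llu sectors per member device\n", sectors_per_dev);	/* 500000 */
	return 0;
}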
...@@ -566,6 +629,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
if (rs->raid_type->level == 1)
return md_raid1_congested(&rs->md, bits);
if (rs->raid_type->level == 10)
return md_raid10_congested(&rs->md, bits);
return md_raid5_congested(&rs->md, bits);
}
...@@ -884,6 +950,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
case 6:
redundancy = rs->raid_type->parity_devs;
break;
case 10:
redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
break;
default:
ti->error = "Unknown RAID type";
return -EINVAL;
...@@ -1049,12 +1118,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
if (ti->len != rs->md.array_sectors) {
ti->error = "Array size does not match requested target length";
ret = -EINVAL;
goto size_mismatch;
}
rs->callbacks.congested_fn = raid_is_congested;
dm_table_add_target_callbacks(ti->table, &rs->callbacks);
mddev_suspend(&rs->md);
return 0;
size_mismatch:
md_stop(&rs->md);
bad:
context_free(rs);
...@@ -1203,6 +1279,13 @@ static int raid_status(struct dm_target *ti, status_type_t type,
DMEMIT(" region_size %lu",
rs->md.bitmap_info.chunksize >> 9);
if (rs->print_flags & DMPF_RAID10_COPIES)
DMEMIT(" raid10_copies %u",
raid10_md_layout_to_copies(rs->md.layout));
if (rs->print_flags & DMPF_RAID10_FORMAT)
DMEMIT(" raid10_format near");
DMEMIT(" %d", rs->md.raid_disks); DMEMIT(" %d", rs->md.raid_disks);
for (i = 0; i < rs->md.raid_disks; i++) { for (i = 0; i < rs->md.raid_disks; i++) {
if (rs->dev[i].meta_dev) if (rs->dev[i].meta_dev)
...@@ -1277,7 +1360,7 @@ static void raid_resume(struct dm_target *ti) ...@@ -1277,7 +1360,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = { static struct target_type raid_target = {
.name = "raid", .name = "raid",
.version = {1, 2, 0}, .version = {1, 3, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = raid_ctr, .ctr = raid_ctr,
.dtr = raid_dtr, .dtr = raid_dtr,
...@@ -1304,6 +1387,8 @@ module_init(dm_raid_init); ...@@ -1304,6 +1387,8 @@ module_init(dm_raid_init);
module_exit(dm_raid_exit); module_exit(dm_raid_exit);
MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
MODULE_ALIAS("dm-raid1");
MODULE_ALIAS("dm-raid10");
MODULE_ALIAS("dm-raid4"); MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5"); MODULE_ALIAS("dm-raid5");
MODULE_ALIAS("dm-raid6"); MODULE_ALIAS("dm-raid6");
......
...@@ -3942,17 +3942,13 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case clear:
/* stopping an active array */
if (atomic_read(&mddev->openers) > 0)
return -EBUSY;
err = do_md_stop(mddev, 0, NULL);
break;
case inactive:
/* stopping an active array */
if (mddev->pers)
if (atomic_read(&mddev->openers) > 0)
return -EBUSY;
err = do_md_stop(mddev, 2, NULL);
else
err = 0; /* already inactive */
break;
case suspended:
......
...@@ -46,6 +46,20 @@
*/
#define NR_RAID1_BIOS 256
/* when we get a read error on a read-only array, we redirect to another
* device without failing the first device, or trying to over-write to
* correct the read error. To keep track of bad blocks on a per-bio
* level, we store IO_BLOCKED in the appropriate 'bios' pointer
*/
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad-block, we need to remove the
* bad-block marking which must be done from process context. So we record
* the success by setting devs[n].bio to IO_MADE_GOOD
*/
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* When there are this many requests queue to be written by
* the raid1 thread, we become 'congested' to provide back-pressure
* for writeback.
...@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
const sector_t this_sector = r1_bio->sector;
int sectors;
int best_good_sectors;
int best_disk, best_dist_disk, best_pending_disk;
int has_nonrot_disk;
int disk;
sector_t best_dist;
unsigned int min_pending;
struct md_rdev *rdev;
int choose_first;
int choose_next_idle;
rcu_read_lock();
/*
...@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
retry:
sectors = r1_bio->sectors;
best_disk = -1;
best_dist_disk = -1;
best_dist = MaxSector;
best_pending_disk = -1;
min_pending = UINT_MAX;
best_good_sectors = 0;
has_nonrot_disk = 0;
choose_next_idle = 0;
if (conf->mddev->recovery_cp < MaxSector &&
(this_sector + sectors >= conf->next_resync))
choose_first = 1;
else
} else {
choose_first = 0;
start_disk = conf->last_used;
}
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
sector_t dist;
sector_t first_bad;
int bad_sectors;
unsigned int pending;
bool nonrot;
if (disk >= conf->raid_disks * 2)
disk -= conf->raid_disks * 2;
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
...@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
} else
best_good_sectors = sectors;
nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending);
dist = abs(this_sector - conf->mirrors[disk].head_position);
if (choose_first) {
best_disk = disk;
break;
}
/* Don't change to another disk for sequential reads */
if (conf->mirrors[disk].next_seq_sect == this_sector
|| dist == 0) {
int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
struct raid1_info *mirror = &conf->mirrors[disk];
best_disk = disk;
/*
* If buffered sequential IO size exceeds optimal
* iosize, check if there is idle disk. If yes, choose
* the idle disk. read_balance could already choose an
* idle disk before noticing it's a sequential IO in
* this disk. This doesn't matter because this disk
* will idle, next time it will be utilized after the
* first disk has IO size exceeds optimal iosize. In
* this way, iosize of the first disk will be optimal
* iosize at least. iosize of the second disk might be
* small, but not a big deal since when the second disk
* starts IO, the first disk is likely still busy.
*/
if (nonrot && opt_iosize > 0 &&
mirror->seq_start != MaxSector &&
mirror->next_seq_sect > opt_iosize &&
mirror->next_seq_sect - opt_iosize >=
mirror->seq_start) {
choose_next_idle = 1;
continue;
}
break;
}
/* If device is idle, use it */
if (pending == 0) {
best_disk = disk;
break;
}
if (choose_next_idle)
continue;
if (min_pending > pending) {
min_pending = pending;
best_pending_disk = disk;
}
if (dist < best_dist) {
best_dist = dist;
best_dist_disk = disk;
}
}
/*
* If all disks are rotational, choose the closest disk. If any disk is
* non-rotational, choose the disk with less pending request even the
* disk is rotational, which might/might not be optimal for raids with
* mixed rotational/non-rotational disks depending on workload.
*/
if (best_disk == -1) {
if (has_nonrot_disk)
best_disk = best_pending_disk;
else
best_disk = best_dist_disk;
}
if (best_disk >= 0) {
rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
if (!rdev)
...@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
goto retry;
}
sectors = best_good_sectors;
conf->next_seq_sect = this_sector + sectors;
if (conf->mirrors[best_disk].next_seq_sect != this_sector)
conf->mirrors[best_disk].seq_start = this_sector;
conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
}
rcu_read_unlock();
*max_sectors = sectors;
...@@ -873,7 +947,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
static void make_request(struct mddev *mddev, struct bio * bio)
{
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
struct r1bio *r1_bio;
struct bio *read_bio;
int i, disks;
...@@ -1364,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r1conf *conf = mddev->private;
int err = -EEXIST;
int mirror = 0;
struct raid1_info *p;
int first = 0;
int last = conf->raid_disks - 1;
struct request_queue *q = bdev_get_queue(rdev->bdev);
...@@ -1433,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r1conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
struct raid1_info *p = conf->mirrors + number;
if (rdev != p->rdev)
p = conf->mirrors + conf->raid_disks + number;
...@@ -2371,6 +2445,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bio->bi_rw = READ;
bio->bi_end_io = end_sync_read;
read_targets++;
} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
/*
* The device is suitable for reading (InSync),
* but has bad block(s) here. Let's try to correct them,
* if we are doing resync or repair. Otherwise, leave
* this device alone for this sync request.
*/
bio->bi_rw = WRITE;
bio->bi_end_io = end_sync_write;
write_targets++;
}
}
if (bio->bi_end_io) {
...@@ -2428,7 +2514,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
/* There is nowhere to write, so all non-sync
* drives must be failed - so we are finished
*/
sector_t rv;
if (min_bad > 0)
max_sector = sector_nr + min_bad;
rv = max_sector - sector_nr;
*skipped = 1;
put_buf(r1_bio);
return rv;
...@@ -2521,7 +2610,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
{
struct r1conf *conf;
int i;
struct raid1_info *disk;
struct md_rdev *rdev;
int err = -ENOMEM;
...@@ -2529,7 +2618,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf)
goto abort;
conf->mirrors = kzalloc(sizeof(struct raid1_info)
* mddev->raid_disks * 2,
GFP_KERNEL);
if (!conf->mirrors)
...@@ -2572,6 +2661,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
mddev->merge_check_needed = 1;
disk->head_position = 0;
disk->seq_start = MaxSector;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
...@@ -2585,7 +2675,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->recovery_disabled = mddev->recovery_disabled - 1;
err = -EIO;
conf->last_used = -1;
for (i = 0; i < conf->raid_disks * 2; i++) {
disk = conf->mirrors + i;
...@@ -2611,19 +2700,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (disk->rdev &&
(disk->rdev->saved_raid_disk < 0))
conf->fullsync = 1;
}
/*
* The first working device is used as a
* starting point to read balancing.
*/
conf->last_used = i;
}
if (conf->last_used < 0) {
printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
mdname(mddev));
goto abort;
}
err = -ENOMEM;
conf->thread = md_register_thread(raid1d, mddev, "raid1");
if (!conf->thread) {
...@@ -2798,7 +2877,7 @@ static int raid1_reshape(struct mddev *mddev)
*/
mempool_t *newpool, *oldpool;
struct pool_info *newpoolinfo;
struct raid1_info *newmirrors;
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
...@@ -2841,7 +2920,7 @@ static int raid1_reshape(struct mddev *mddev)
kfree(newpoolinfo);
return -ENOMEM;
}
newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
GFP_KERNEL);
if (!newmirrors) {
kfree(newpoolinfo);
...@@ -2880,7 +2959,6 @@ static int raid1_reshape(struct mddev *mddev)
conf->raid_disks = mddev->raid_disks = raid_disks;
mddev->delta_disks = 0;
conf->last_used = 0; /* just make sure it is in-range */
lower_barrier(conf);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
......
#ifndef _RAID1_H
#define _RAID1_H
struct raid1_info {
struct md_rdev *rdev;
sector_t head_position;
/* When choose the best device for a read (read_balance())
* we try to keep sequential reads on the same device
*/
sector_t next_seq_sect;
sector_t seq_start;
};
/*
...@@ -24,17 +30,11 @@ struct pool_info {
struct r1conf {
struct mddev *mddev;
struct raid1_info *mirrors; /* twice 'raid_disks' to
* allow for replacements.
*/
int raid_disks;
/* When choose the best device for a read (read_balance())
* we try to keep sequential reads one the same device
* using 'last_used' and 'next_seq_sect'
*/
int last_used;
sector_t next_seq_sect;
/* During resync, read_balancing is only allowed on the part
* of the array that has been resynced. 'next_resync' tells us
* where that is.
...@@ -135,20 +135,6 @@ struct r1bio {
/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
};
/* when we get a read error on a read-only array, we redirect to another
* device without failing the first device, or trying to over-write to
* correct the read error. To keep track of bad blocks on a per-bio
* level, we store IO_BLOCKED in the appropriate 'bios' pointer
*/
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad-block, we need to remove the
* bad-block marking which must be done from process context. So we record
* the success by setting bios[n] to IO_MADE_GOOD
*/
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* bits for r1bio.state */
#define R1BIO_Uptodate 0
#define R1BIO_IsSync 1
......
...@@ -60,7 +60,21 @@
*/
#define NR_RAID10_BIOS 256
/* when we get a read error on a read-only array, we redirect to another
* device without failing the first device, or trying to over-write to
* correct the read error. To keep track of bad blocks on a per-bio
* level, we store IO_BLOCKED in the appropriate 'bios' pointer
*/
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad-block, we need to remove the
* bad-block marking which must be done from process context. So we record
* the success by setting devs[n].bio to IO_MADE_GOOD
*/
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* When there are this many requests queued to be written by
* the raid10 thread, we become 'congested' to provide back-pressure
* for writeback.
*/
...@@ -717,7 +731,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
int sectors = r10_bio->sectors;
int best_good_sectors;
sector_t new_distance, best_dist;
struct md_rdev *best_rdev, *rdev = NULL;
int do_balance;
int best_slot;
struct geom *geo = &conf->geo;
...@@ -839,9 +853,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
return rdev;
}
int md_raid10_congested(struct mddev *mddev, int bits)
{
struct mddev *mddev = data;
struct r10conf *conf = mddev->private;
int i, ret = 0;
...@@ -849,8 +862,6 @@ static int raid10_congested(void *data, int bits)
conf->pending_count >= max_queued_requests)
return 1;
if (mddev_congested(mddev, bits))
return 1;
rcu_read_lock();
for (i = 0;
(i < conf->geo.raid_disks || i < conf->prev.raid_disks)
...@@ -866,6 +877,15 @@ static int raid10_congested(void *data, int bits)
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(md_raid10_congested);
static int raid10_congested(void *data, int bits)
{
struct mddev *mddev = data;
return mddev_congested(mddev, bits) ||
md_raid10_congested(mddev, bits);
}
static void flush_pending_writes(struct r10conf *conf)
{
...@@ -1546,7 +1566,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
static void print_conf(struct r10conf *conf)
{
int i;
struct raid10_info *tmp;
printk(KERN_DEBUG "RAID10 conf printout:\n");
if (!conf) {
...@@ -1580,7 +1600,7 @@ static int raid10_spare_active(struct mddev *mddev)
{
int i;
struct r10conf *conf = mddev->private;
struct raid10_info *tmp;
int count = 0;
unsigned long flags;
...@@ -1655,7 +1675,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
else
mirror = first;
for ( ; mirror <= last ; mirror++) {
struct raid10_info *p = &conf->mirrors[mirror];
if (p->recovery_disabled == mddev->recovery_disabled)
continue;
if (p->rdev) {
...@@ -1709,7 +1729,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
int err = 0;
int number = rdev->raid_disk;
struct md_rdev **rdevp;
struct raid10_info *p = conf->mirrors + number;
print_conf(conf);
if (rdev == p->rdev)
...@@ -2876,7 +2896,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t sect;
int must_sync;
int any_working;
struct raid10_info *mirror = &conf->mirrors[i];
if ((mirror->rdev == NULL ||
test_bit(In_sync, &mirror->rdev->flags))
...@@ -3388,7 +3408,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
goto out;
/* FIXME calc properly */
conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
max(0,mddev->delta_disks)),
GFP_KERNEL);
if (!conf->mirrors)
...@@ -3452,7 +3472,7 @@ static int run(struct mddev *mddev)
{
struct r10conf *conf;
int i, disk_idx, chunk_size;
struct raid10_info *disk;
struct md_rdev *rdev;
sector_t size;
sector_t min_offset_diff = 0;
...@@ -3472,12 +3492,14 @@ static int run(struct mddev *mddev)
conf->thread = NULL;
chunk_size = mddev->chunk_sectors << 9;
if (mddev->queue) {
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
else
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->geo.raid_disks / conf->geo.near_copies));
}
rdev_for_each(rdev, mddev) {
long long diff;
...@@ -3511,8 +3533,9 @@ static int run(struct mddev *mddev)
if (first || diff < min_offset_diff)
min_offset_diff = diff;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk->head_position = 0;
}
...@@ -3575,22 +3598,22 @@ static int run(struct mddev *mddev)
md_set_array_sectors(mddev, size);
mddev->resync_max_sectors = size;
if (mddev->queue) {
mddev->queue->backing_dev_info.congested_data = mddev;
/* Calculate max read-ahead size.
* We need to readahead at least twice a whole stripe....
* maybe...
*/
{
int stripe = conf->geo.raid_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
mddev->queue->backing_dev_info.congested_fn = raid10_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
/* Calculate max read-ahead size.
* We need to readahead at least twice a whole stripe....
* maybe...
*/
stripe /= conf->geo.near_copies;
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
}
blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
if (md_integrity_register(mddev))
goto out_free_conf;
...@@ -3641,7 +3664,10 @@ static int stop(struct mddev *mddev)
lower_barrier(conf);
md_unregister_thread(&mddev->thread);
if (mddev->queue)
/* the unplug fn references 'conf'*/
blk_sync_queue(mddev->queue);
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
...@@ -3805,7 +3831,7 @@ static int raid10_check_reshape(struct mddev *mddev)
if (mddev->delta_disks > 0) {
/* allocate new 'mirrors' list */
conf->mirrors_new = kzalloc(
sizeof(struct raid10_info)
*(mddev->raid_disks +
mddev->delta_disks),
GFP_KERNEL);
...@@ -3930,7 +3956,7 @@ static int raid10_start_reshape(struct mddev *mddev)
spin_lock_irq(&conf->device_lock);
if (conf->mirrors_new) {
memcpy(conf->mirrors_new, conf->mirrors,
sizeof(struct raid10_info)*conf->prev.raid_disks);
smp_mb();
kfree(conf->mirrors_old); /* FIXME and elsewhere */
conf->mirrors_old = conf->mirrors;
......
#ifndef _RAID10_H
#define _RAID10_H
struct raid10_info {
struct md_rdev *rdev, *replacement;
sector_t head_position;
int recovery_disabled; /* matches
...@@ -13,8 +13,8 @@ struct mirror_info {
struct r10conf {
struct mddev *mddev;
struct raid10_info *mirrors;
struct raid10_info *mirrors_new, *mirrors_old;
spinlock_t device_lock;
/* geometry */
...@@ -123,20 +123,6 @@ struct r10bio {
} devs[0];
};
/* when we get a read error on a read-only array, we redirect to another
* device without failing the first device, or trying to over-write to
* correct the read error. To keep track of bad blocks on a per-bio
* level, we store IO_BLOCKED in the appropriate 'bios' pointer
*/
#define IO_BLOCKED ((struct bio*)1)
/* When we successfully write to a known bad-block, we need to remove the
* bad-block marking which must be done from process context. So we record
* the success by setting devs[n].bio to IO_MADE_GOOD
*/
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* bits for r10bio.state */
enum r10bio_state {
R10BIO_Uptodate,
...@@ -159,4 +145,7 @@ enum r10bio_state {
*/
R10BIO_Previous,
};
extern int md_raid10_congested(struct mddev *mddev, int bits);
#endif
...@@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
* We maintain a biased count of active stripes in the bottom 16 bits of
* bi_phys_segments, and a count of processed stripes in the upper 16 bits
*/
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
return (atomic_read(segments) >> 16) & 0xffff;
}
static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
return atomic_sub_return(1, segments) & 0xffff;
}
static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
atomic_inc(segments);
}
static inline void raid5_set_bi_processed_stripes(struct bio *bio,
unsigned int cnt)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
int old, new;
do {
old = atomic_read(segments);
new = (old & 0xffff) | (cnt << 16);
} while (atomic_cmpxchg(segments, old, new) != old);
}
static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
atomic_set(segments, cnt);
}
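A minimal user-space sketch of the packing these helpers implement (illustration only; it mirrors the kernel code above but uses C11 atomics and hypothetical names): the low 16 bits of bi_phys_segments count active stripes and the high 16 bits count processed stripes, both updated atomically in a single word.

#include <assert.h>
#include <stdatomic.h>

/* User-space analogue of the helpers above (illustration only): the low
 * 16 bits count active stripes, the high 16 bits count processed stripes.
 */
static atomic_uint segments;	/* stands in for bio->bi_phys_segments */

static void set_active(unsigned cnt)	{ atomic_store(&segments, cnt); }
static void inc_active(void)		{ atomic_fetch_add(&segments, 1); }
static unsigned dec_active(void)	{ return (atomic_fetch_sub(&segments, 1) - 1) & 0xffff; }
static unsigned processed(void)		{ return (atomic_load(&segments) >> 16) & 0xffff; }

static void set_processed(unsigned cnt)
{
	unsigned oldval, newval;
	do {
		oldval = atomic_load(&segments);
		newval = (oldval & 0xffff) | (cnt << 16);
	} while (!atomic_compare_exchange_weak(&segments, &oldval, newval));
}

int main(void)
{
	set_active(3);			/* three stripes hold a reference to this bio */
	set_processed(2);		/* two stripes have already been handled */
	inc_active();
	assert(processed() == 2);	/* the two counters do not disturb each other */
	assert(dec_active() == 3);	/* three active references remain */
	return 0;
}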
/* Find first data disk in a raid6 stripe */
...@@ -190,49 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh)
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
clear_bit(STRIPE_DELAYED, &sh->state);
clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
}
md_wakeup_thread(conf->mddev->thread);
} else {
BUG_ON(stripe_operations_active(sh));
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
if (atomic_dec_return(&conf->preread_active_stripes)
< IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
atomic_dec(&conf->active_stripes);
if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
list_add_tail(&sh->lru, &conf->inactive_list);
wake_up(&conf->wait_for_stripe);
if (conf->retry_read_aligned)
md_wakeup_thread(conf->mddev->thread);
md_wakeup_thread(conf->mddev->thread);
}
} }
}
}
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
if (atomic_dec_and_test(&sh->count))
do_release_stripe(conf, sh);
}
static void release_stripe(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
unsigned long flags;
local_irq_save(flags);
if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
do_release_stripe(conf, sh);
spin_unlock(&conf->device_lock);
}
local_irq_restore(flags);
}
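The point of the rewritten release_stripe() is that atomic_dec_and_lock() only takes device_lock when the reference count actually drops to zero, so the common non-final release stays lock-free. A rough user-space analogue of that pattern (hypothetical helper, pthreads and C11 atomics instead of kernel primitives, illustration only):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Hypothetical analogue of atomic_dec_and_lock(): decrement the count and
 * return true with the lock held only if the count reached zero.
 */
static bool dec_and_lock(atomic_int *count, pthread_mutex_t *lock)
{
	int old = atomic_load(count);

	/* Fast path: not the last reference, decrement without the lock. */
	while (old > 1)
		if (atomic_compare_exchange_weak(count, &old, old - 1))
			return false;

	/* Slow path: take the lock, then do the final decrement under it. */
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(count, 1) == 1)
		return true;	/* caller does the cleanup, then unlocks */
	pthread_mutex_unlock(lock);
	return false;
}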
static inline void remove_hash(struct stripe_head *sh)
...@@ -640,6 +653,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
else
bi->bi_sector = (sh->sector
+ rdev->data_offset);
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
bi->bi_rw |= REQ_FLUSH;
bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_idx = 0;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
...@@ -749,14 +765,12 @@ static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
struct bio *return_bi = NULL;
struct r5conf *conf = sh->raid_conf;
int i;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
/* clear completed biofills */
spin_lock_irq(&conf->device_lock);
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
...@@ -774,7 +788,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
if (!raid5_dec_bi_active_stripes(rbi)) {
rbi->bi_next = return_bi;
return_bi = rbi;
}
...@@ -782,7 +796,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
}
}
}
spin_unlock_irq(&conf->device_lock);
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
return_io(return_bi);
...@@ -794,7 +807,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
static void ops_run_biofill(struct stripe_head *sh)
{
struct dma_async_tx_descriptor *tx = NULL;
struct r5conf *conf = sh->raid_conf;
struct async_submit_ctl submit;
int i;
...@@ -805,10 +817,10 @@ static void ops_run_biofill(struct stripe_head *sh)
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_Wantfill, &dev->flags)) {
struct bio *rbi;
spin_lock_irq(&sh->stripe_lock);
dev->read = rbi = dev->toread;
dev->toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
while (rbi && rbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
tx = async_copy_data(0, rbi, dev->page,
...@@ -1144,12 +1156,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
struct bio *wbi;
spin_lock_irq(&sh->stripe_lock);
chosen = dev->towrite;
dev->towrite = NULL;
BUG_ON(dev->written);
wbi = dev->written = chosen;
spin_unlock_irq(&sh->stripe_lock);
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
...@@ -1454,6 +1466,8 @@ static int grow_one_stripe(struct r5conf *conf)
init_waitqueue_head(&sh->ops.wait_for_ops);
#endif
spin_lock_init(&sh->stripe_lock);
if (grow_buffers(sh)) {
shrink_buffers(sh);
kmem_cache_free(conf->slab_cache, sh);
...@@ -1739,7 +1753,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
if (atomic_read(&rdev->read_errors))
atomic_set(&rdev->read_errors, 0);
} else {
...@@ -1784,7 +1800,11 @@ static void raid5_end_read_request(struct bio * bi, int error)
else
retry = 1;
if (retry)
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
set_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
} else
set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
else {
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
...@@ -2340,11 +2360,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
(unsigned long long)bi->bi_sector,
(unsigned long long)sh->sector);
/*
* If several bio share a stripe. The bio bi_phys_segments acts as a
* reference count to avoid race. The reference count should already be
* increased before this function is called (for example, in
* make_request()), so other bio sharing this stripe will not free the
* stripe. If a stripe is owned by one stripe, the stripe lock will
* protect it.
*/
spin_lock_irq(&sh->stripe_lock);
if (forwrite) { if (forwrite) {
bip = &sh->dev[dd_idx].towrite; bip = &sh->dev[dd_idx].towrite;
if (*bip == NULL && sh->dev[dd_idx].written == NULL) if (*bip == NULL)
firstwrite = 1; firstwrite = 1;
} else } else
bip = &sh->dev[dd_idx].toread; bip = &sh->dev[dd_idx].toread;
@@ -2360,7 +2387,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	if (*bip)
 		bi->bi_next = *bip;
 	*bip = bi;
-	bi->bi_phys_segments++;
+	raid5_inc_bi_active_stripes(bi);
 
 	if (forwrite) {
 		/* check if page is covered */
@@ -2375,7 +2402,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
-	spin_unlock_irq(&conf->device_lock);
+	spin_unlock_irq(&sh->stripe_lock);
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
 		(unsigned long long)(*bip)->bi_sector,
@@ -2391,7 +2418,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
  overlap:
 	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
-	spin_unlock_irq(&conf->device_lock);
+	spin_unlock_irq(&sh->stripe_lock);
 	return 0;
 }
@@ -2441,10 +2468,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 		}
-		spin_lock_irq(&conf->device_lock);
+		spin_lock_irq(&sh->stripe_lock);
 		/* fail all writes first */
 		bi = sh->dev[i].towrite;
 		sh->dev[i].towrite = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
 		if (bi) {
 			s->to_write--;
 			bitmap_end = 1;
@@ -2457,13 +2485,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			if (!raid5_dec_bi_phys_segments(bi)) {
+			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
 				*return_bi = bi;
 			}
 			bi = nextbi;
 		}
+		if (bitmap_end)
+			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+				STRIPE_SECTORS, 0, 0);
+		bitmap_end = 0;
 		/* and fail all 'written' */
 		bi = sh->dev[i].written;
 		sh->dev[i].written = NULL;
@@ -2472,7 +2504,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			if (!raid5_dec_bi_phys_segments(bi)) {
+			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
 				*return_bi = bi;
@@ -2496,14 +2528,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (!raid5_dec_bi_phys_segments(bi)) {
+				if (!raid5_dec_bi_active_stripes(bi)) {
 					bi->bi_next = *return_bi;
 					*return_bi = bi;
 				}
 				bi = nextbi;
 			}
 		}
-		spin_unlock_irq(&conf->device_lock);
 		if (bitmap_end)
 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 					STRIPE_SECTORS, 0, 0);
@@ -2707,30 +2738,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 			    test_bit(R5_UPTODATE, &dev->flags)) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
-				int bitmap_end = 0;
 				pr_debug("Return write for disc %d\n", i);
-				spin_lock_irq(&conf->device_lock);
 				wbi = dev->written;
 				dev->written = NULL;
 				while (wbi && wbi->bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
-					if (!raid5_dec_bi_phys_segments(wbi)) {
+					if (!raid5_dec_bi_active_stripes(wbi)) {
 						md_write_end(conf->mddev);
 						wbi->bi_next = *return_bi;
 						*return_bi = wbi;
 					}
 					wbi = wbi2;
 				}
-				if (dev->towrite == NULL)
-					bitmap_end = 1;
-				spin_unlock_irq(&conf->device_lock);
-				if (bitmap_end)
-					bitmap_endwrite(conf->mddev->bitmap,
-							sh->sector,
-							STRIPE_SECTORS,
-							!test_bit(STRIPE_DEGRADED, &sh->state),
-							0);
+				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+						STRIPE_SECTORS,
+						!test_bit(STRIPE_DEGRADED, &sh->state),
+						0);
 			}
 		}
@@ -3182,7 +3206,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 	/* Now to look around and see what can be done */
 	rcu_read_lock();
-	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
 		struct md_rdev *rdev;
 		sector_t first_bad;
@@ -3328,7 +3351,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			do_recovery = 1;
 		}
 	}
-	spin_unlock_irq(&conf->device_lock);
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
 		 *     we must be recovering.
@@ -3791,7 +3813,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 		 * this sets the active strip count to 1 and the processed
 		 * strip count to zero (upper 8 bits)
 		 */
-		bi->bi_phys_segments = 1; /* biased count of active stripes */
+		raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
 	}
 
 	return bi;
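The "biased count" mentioned in that comment is the usual extra-reference trick: the submitter keeps one reference of its own while stripes are still being attached, so the bio cannot complete underneath it. A condensed illustration of the pattern as make_request() and retry_aligned_read() are expected to use it (a sketch with the per-stripe work elided, not the full function; helper names match the diff):

	/* Condensed illustration of the submit/complete pattern. */
	static void raid5_submit_sketch(struct bio *bi)
	{
		sector_t sector, last_sector = bi->bi_sector + (bi->bi_size >> 9);
		int remaining;

		raid5_set_bi_stripes(bi, 1);	/* bias: one ref held by the submitter */

		for (sector = bi->bi_sector; sector < last_sector;
		     sector += STRIPE_SECTORS) {
			/* add_stripe_bio() takes one further reference per stripe
			 * via raid5_inc_bi_active_stripes() under sh->stripe_lock */
		}

		remaining = raid5_dec_bi_active_stripes(bi);	/* drop the bias */
		if (remaining == 0)
			bio_endio(bi, 0);	/* every stripe already finished */
	}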
@@ -4113,7 +4135,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			finish_wait(&conf->wait_for_overlap, &w);
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
-			if ((bi->bi_rw & REQ_SYNC) &&
+			if ((bi->bi_rw & REQ_NOIDLE) &&
 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			mddev_check_plugged(mddev);
@@ -4126,9 +4148,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 		}
 	}
 
-	spin_lock_irq(&conf->device_lock);
-	remaining = raid5_dec_bi_phys_segments(bi);
-	spin_unlock_irq(&conf->device_lock);
+	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
 
 		if ( rw == WRITE )
@@ -4484,7 +4504,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	     sector += STRIPE_SECTORS,
 		     scnt++) {
 
-		if (scnt < raid5_bi_hw_segments(raid_bio))
+		if (scnt < raid5_bi_processed_stripes(raid_bio))
 			/* already done this stripe */
 			continue;
@@ -4492,25 +4512,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
-			raid5_set_bi_hw_segments(raid_bio, scnt);
+			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
 			return handled;
 		}
 
 		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
 			release_stripe(sh);
-			raid5_set_bi_hw_segments(raid_bio, scnt);
+			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
 			return handled;
 		}
 
+		set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
 		handle_stripe(sh);
 		release_stripe(sh);
 		handled++;
 	}
-	spin_lock_irq(&conf->device_lock);
-	remaining = raid5_dec_bi_phys_segments(raid_bio);
-	spin_unlock_irq(&conf->device_lock);
+	remaining = raid5_dec_bi_active_stripes(raid_bio);
 	if (remaining == 0)
 		bio_endio(raid_bio, 0);
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
...
@@ -210,6 +210,7 @@ struct stripe_head {
 	int			disks;		/* disks in stripe */
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
+	spinlock_t		stripe_lock;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
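The new stripe_lock protects a single stripe_head's bio lists, so it has to be initialised when the stripe is allocated. A one-line sketch of where that would happen; placing it in grow_one_stripe() is an assumption, since that part of the patch is not shown in this excerpt:

	/* Sketch: during stripe allocation, before the stripe_head becomes
	 * visible to other CPUs. */
	spin_lock_init(&sh->stripe_lock);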
@@ -273,6 +274,7 @@ enum r5dev_flags {
 	R5_Wantwrite,
 	R5_Overlap,	/* There is a pending overlapping request
 			 * on this block */
+	R5_ReadNoMerge, /* prevent bio from merging in block-layer */
 	R5_ReadError,	/* seen a read error here recently */
 	R5_ReWrite,	/* have tried to over-write the readerror */
...