提交 6d6e352c 编写于 作者: L Linus Torvalds

Merge tag 'md/3.13' of git://neil.brown.name/md

Pull md update from Neil Brown:
 "Mostly optimisations and obscure bug fixes.
   - raid5 gets less lock contention
   - raid1 gets less contention between normal-io and resync-io during
     resync"

* tag 'md/3.13' of git://neil.brown.name/md:
  md/raid5: Use conf->device_lock protect changing of multi-thread resources.
  md/raid5: Before freeing old multi-thread worker, it should flush them.
  md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
  UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
  raid1: Rewrite the implementation of iobarrier.
  raid1: Add some macros to make code clearly.
  raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
  raid1: Add a field array_frozen to indicate whether raid in freeze state.
  md: Convert use of typedef ctl_table to struct ctl_table
  md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
  md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
  md: fix some places where mddev_lock return value is not checked.
  raid5: Retry R5_ReadNoMerge flag when hit a read error.
  raid5: relieve lock contention in get_active_stripe()
  raid5: relieve lock contention in get_active_stripe()
  wait: add wait_event_cmd()
  md/raid5.c: add proper locking to error path of raid5_start_reshape.
  md: fix calculation of stacking limits on level change.
  raid5: Use slow_path to release stripe when mddev->thread is null
...@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev) ...@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
static struct ctl_table_header *raid_table_header; static struct ctl_table_header *raid_table_header;
static ctl_table raid_table[] = { static struct ctl_table raid_table[] = {
{ {
.procname = "speed_limit_min", .procname = "speed_limit_min",
.data = &sysctl_speed_limit_min, .data = &sysctl_speed_limit_min,
...@@ -130,7 +130,7 @@ static ctl_table raid_table[] = { ...@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
{ } { }
}; };
static ctl_table raid_dir_table[] = { static struct ctl_table raid_dir_table[] = {
{ {
.procname = "raid", .procname = "raid",
.maxlen = 0, .maxlen = 0,
...@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = { ...@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
{ } { }
}; };
static ctl_table raid_root_table[] = { static struct ctl_table raid_root_table[] = {
{ {
.procname = "dev", .procname = "dev",
.maxlen = 0, .maxlen = 0,
...@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit) ...@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
goto retry; goto retry;
} }
static inline int mddev_lock(struct mddev * mddev) static inline int __must_check mddev_lock(struct mddev * mddev)
{ {
return mutex_lock_interruptible(&mddev->reconfig_mutex); return mutex_lock_interruptible(&mddev->reconfig_mutex);
} }
/* Sometimes we need to take the lock in a situation where
* failure due to interrupts is not acceptable.
*/
static inline void mddev_lock_nointr(struct mddev * mddev)
{
mutex_lock(&mddev->reconfig_mutex);
}
static inline int mddev_is_locked(struct mddev *mddev) static inline int mddev_is_locked(struct mddev *mddev)
{ {
return mutex_is_locked(&mddev->reconfig_mutex); return mutex_is_locked(&mddev->reconfig_mutex);
...@@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
for_each_mddev(mddev, tmp) { for_each_mddev(mddev, tmp) {
struct md_rdev *rdev2; struct md_rdev *rdev2;
mddev_lock(mddev); mddev_lock_nointr(mddev);
rdev_for_each(rdev2, mddev) rdev_for_each(rdev2, mddev)
if (rdev->bdev == rdev2->bdev && if (rdev->bdev == rdev2->bdev &&
rdev != rdev2 && rdev != rdev2 &&
...@@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
break; break;
} }
} }
mddev_lock(my_mddev); mddev_lock_nointr(my_mddev);
if (overlap) { if (overlap) {
/* Someone else could have slipped in a size /* Someone else could have slipped in a size
* change here, but doing so is just silly. * change here, but doing so is just silly.
...@@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->in_sync = 1; mddev->in_sync = 1;
del_timer_sync(&mddev->safemode_timer); del_timer_sync(&mddev->safemode_timer);
} }
blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev); pers->run(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags);
mddev_resume(mddev); mddev_resume(mddev);
...@@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev) ...@@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev)
void md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev)
{ {
mddev_lock(mddev); mddev_lock_nointr(mddev);
__md_stop_writes(mddev); __md_stop_writes(mddev);
mddev_unlock(mddev); mddev_unlock(mddev);
} }
...@@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop); ...@@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
{ {
int err = 0; int err = 0;
int did_freeze = 0;
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
did_freeze = 1;
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
/* Thread might be blocked waiting for metadata update
* which will now never happen */
wake_up_process(mddev->sync_thread->tsk);
}
mddev_unlock(mddev);
wait_event(resync_wait, mddev->sync_thread == NULL);
mddev_lock_nointr(mddev);
mutex_lock(&mddev->open_mutex); mutex_lock(&mddev->open_mutex);
if (atomic_read(&mddev->openers) > !!bdev) { if (atomic_read(&mddev->openers) > !!bdev ||
mddev->sync_thread ||
(bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
printk("md: %s still in use.\n",mdname(mddev)); printk("md: %s still in use.\n",mdname(mddev));
if (did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
err = -EBUSY; err = -EBUSY;
goto out; goto out;
} }
if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
/* Someone opened the device since we flushed it
* so page cache could be dirty and it is too late
* to flush. So abort
*/
mutex_unlock(&mddev->open_mutex);
return -EBUSY;
}
if (mddev->pers) { if (mddev->pers) {
__md_stop_writes(mddev); __md_stop_writes(mddev);
...@@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) ...@@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
set_disk_ro(mddev->gendisk, 1); set_disk_ro(mddev->gendisk, 1);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_state);
err = 0; err = 0;
} }
out: out:
mutex_unlock(&mddev->open_mutex); mutex_unlock(&mddev->open_mutex);
...@@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode, ...@@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
{ {
struct gendisk *disk = mddev->gendisk; struct gendisk *disk = mddev->gendisk;
struct md_rdev *rdev; struct md_rdev *rdev;
int did_freeze = 0;
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
did_freeze = 1;
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
/* Thread might be blocked waiting for metadata update
* which will now never happen */
wake_up_process(mddev->sync_thread->tsk);
}
mddev_unlock(mddev);
wait_event(resync_wait, mddev->sync_thread == NULL);
mddev_lock_nointr(mddev);
mutex_lock(&mddev->open_mutex); mutex_lock(&mddev->open_mutex);
if (atomic_read(&mddev->openers) > !!bdev || if (atomic_read(&mddev->openers) > !!bdev ||
mddev->sysfs_active) { mddev->sysfs_active ||
mddev->sync_thread ||
(bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
printk("md: %s still in use.\n",mdname(mddev)); printk("md: %s still in use.\n",mdname(mddev));
mutex_unlock(&mddev->open_mutex); mutex_unlock(&mddev->open_mutex);
return -EBUSY; if (did_freeze) {
} clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { md_wakeup_thread(mddev->thread);
/* Someone opened the device since we flushed it }
* so page cache could be dirty and it is too late
* to flush. So abort
*/
mutex_unlock(&mddev->open_mutex);
return -EBUSY; return -EBUSY;
} }
if (mddev->pers) { if (mddev->pers) {
...@@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, ...@@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
wait_event(mddev->sb_wait, wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_DEVS, &mddev->flags) && !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags)); !test_bit(MD_CHANGE_PENDING, &mddev->flags));
mddev_lock(mddev); mddev_lock_nointr(mddev);
} }
} else { } else {
err = -EROFS; err = -EROFS;
...@@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread) ...@@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync = 2; mddev->curr_resync = 2;
try_again: try_again:
if (kthread_should_stop())
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto skip; goto skip;
for_each_mddev(mddev2, tmp) { for_each_mddev(mddev2, tmp) {
...@@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread) ...@@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread)
* be caught by 'softlockup' * be caught by 'softlockup'
*/ */
prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
if (!kthread_should_stop() && if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev2->curr_resync >= mddev->curr_resync) { mddev2->curr_resync >= mddev->curr_resync) {
printk(KERN_INFO "md: delaying %s of %s" printk(KERN_INFO "md: delaying %s of %s"
" until %s has finished (they" " until %s has finished (they"
...@@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread) ...@@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread)
last_check = 0; last_check = 0;
if (j>2) { if (j>2) {
printk(KERN_INFO printk(KERN_INFO
"md: resuming %s of %s from checkpoint.\n", "md: resuming %s of %s from checkpoint.\n",
desc, mdname(mddev)); desc, mdname(mddev));
mddev->curr_resync = j; mddev->curr_resync = j;
...@@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread) ...@@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread)
sysfs_notify(&mddev->kobj, NULL, "sync_completed"); sysfs_notify(&mddev->kobj, NULL, "sync_completed");
} }
while (j >= mddev->resync_max && !kthread_should_stop()) { while (j >= mddev->resync_max &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* As this condition is controlled by user-space, /* As this condition is controlled by user-space,
* we can block indefinitely, so use '_interruptible' * we can block indefinitely, so use '_interruptible'
* to avoid triggering warnings. * to avoid triggering warnings.
...@@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread) ...@@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread)
flush_signals(current); /* just in case */ flush_signals(current); /* just in case */
wait_event_interruptible(mddev->recovery_wait, wait_event_interruptible(mddev->recovery_wait,
mddev->resync_max > j mddev->resync_max > j
|| kthread_should_stop()); || test_bit(MD_RECOVERY_INTR,
&mddev->recovery));
} }
if (kthread_should_stop()) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto interrupted; break;
sectors = mddev->pers->sync_request(mddev, j, &skipped, sectors = mddev->pers->sync_request(mddev, j, &skipped,
currspeed < speed_min(mddev)); currspeed < speed_min(mddev));
if (sectors == 0) { if (sectors == 0) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
goto out; break;
} }
if (!skipped) { /* actual IO requested */ if (!skipped) { /* actual IO requested */
...@@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread) ...@@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread)
last_mark = next; last_mark = next;
} }
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
if (kthread_should_stop()) break;
goto interrupted;
/* /*
* this loop exits only if either when we are slower than * this loop exits only if either when we are slower than
...@@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread) ...@@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread)
} }
} }
} }
printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
test_bit(MD_RECOVERY_INTR, &mddev->recovery)
? "interrupted" : "done");
/* /*
* this also signals 'finished resyncing' to md_stop * this also signals 'finished resyncing' to md_stop
*/ */
out:
blk_finish_plug(&plug); blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
...@@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread) ...@@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread)
set_bit(MD_RECOVERY_DONE, &mddev->recovery); set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
return; return;
interrupted:
/*
* got a signal, exit.
*/
printk(KERN_INFO
"md: md_do_sync() got signal ... exiting\n");
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
goto out;
} }
EXPORT_SYMBOL_GPL(md_do_sync); EXPORT_SYMBOL_GPL(md_do_sync);
...@@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev) ...@@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev)
/* resync has finished, collect result */ /* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread); md_unregister_thread(&mddev->sync_thread);
wake_up(&resync_wait);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* success...*/ /* success...*/
......
...@@ -66,7 +66,8 @@ ...@@ -66,7 +66,8 @@
*/ */
static int max_queued_requests = 1024; static int max_queued_requests = 1024;
static void allow_barrier(struct r1conf *conf); static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
sector_t bi_sector);
static void lower_barrier(struct r1conf *conf); static void lower_barrier(struct r1conf *conf);
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
...@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data) ...@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
} }
#define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE #define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{ {
...@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio) ...@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
struct bio *bio = r1_bio->master_bio; struct bio *bio = r1_bio->master_bio;
int done; int done;
struct r1conf *conf = r1_bio->mddev->private; struct r1conf *conf = r1_bio->mddev->private;
sector_t start_next_window = r1_bio->start_next_window;
sector_t bi_sector = bio->bi_sector;
if (bio->bi_phys_segments) { if (bio->bi_phys_segments) {
unsigned long flags; unsigned long flags;
...@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio) ...@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
bio->bi_phys_segments--; bio->bi_phys_segments--;
done = (bio->bi_phys_segments == 0); done = (bio->bi_phys_segments == 0);
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* make_request() might be waiting for
* bi_phys_segments to decrease
*/
wake_up(&conf->wait_barrier);
} else } else
done = 1; done = 1;
...@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio) ...@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
* Wake up any possible resync thread that waits for the device * Wake up any possible resync thread that waits for the device
* to go idle. * to go idle.
*/ */
allow_barrier(conf); allow_barrier(conf, start_next_window, bi_sector);
} }
} }
...@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf) ...@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
* there is no normal IO happening. It must arrange to call * there is no normal IO happening. It must arrange to call
* lower_barrier when the particular background IO completes. * lower_barrier when the particular background IO completes.
*/ */
#define RESYNC_DEPTH 32
static void raise_barrier(struct r1conf *conf) static void raise_barrier(struct r1conf *conf)
{ {
spin_lock_irq(&conf->resync_lock); spin_lock_irq(&conf->resync_lock);
...@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf) ...@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
/* block any new IO from starting */ /* block any new IO from starting */
conf->barrier++; conf->barrier++;
/* Now wait for all pending IO to complete */ /* For these conditions we must wait:
* A: while the array is in the frozen state
* B: while barrier >= RESYNC_DEPTH, meaning resync has reached
* the maximum allowed count.
* C: next_resync + RESYNC_SECTORS > start_next_window, meaning
* the next resync would reach into the window that normal bios
* are handling.
*/
wait_event_lock_irq(conf->wait_barrier, wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH, !conf->array_frozen &&
conf->barrier < RESYNC_DEPTH &&
(conf->start_next_window >=
conf->next_resync + RESYNC_SECTORS),
conf->resync_lock); conf->resync_lock);
spin_unlock_irq(&conf->resync_lock); spin_unlock_irq(&conf->resync_lock);
...@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf) ...@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
wake_up(&conf->wait_barrier); wake_up(&conf->wait_barrier);
} }
static void wait_barrier(struct r1conf *conf) static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
{ {
bool wait = false;
if (conf->array_frozen || !bio)
wait = true;
else if (conf->barrier && bio_data_dir(bio) == WRITE) {
if (conf->next_resync < RESYNC_WINDOW_SECTORS)
wait = true;
else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
>= bio_end_sector(bio)) ||
(conf->next_resync + NEXT_NORMALIO_DISTANCE
<= bio->bi_sector))
wait = false;
else
wait = true;
}
return wait;
}
static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
{
sector_t sector = 0;
spin_lock_irq(&conf->resync_lock); spin_lock_irq(&conf->resync_lock);
if (conf->barrier) { if (need_to_wait_for_sync(conf, bio)) {
conf->nr_waiting++; conf->nr_waiting++;
/* Wait for the barrier to drop. /* Wait for the barrier to drop.
* However if there are already pending * However if there are already pending
...@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf) ...@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
* count down. * count down.
*/ */
wait_event_lock_irq(conf->wait_barrier, wait_event_lock_irq(conf->wait_barrier,
!conf->barrier || !conf->array_frozen &&
(conf->nr_pending && (!conf->barrier ||
((conf->start_next_window <
conf->next_resync + RESYNC_SECTORS) &&
current->bio_list && current->bio_list &&
!bio_list_empty(current->bio_list)), !bio_list_empty(current->bio_list))),
conf->resync_lock); conf->resync_lock);
conf->nr_waiting--; conf->nr_waiting--;
} }
if (bio && bio_data_dir(bio) == WRITE) {
if (conf->next_resync + NEXT_NORMALIO_DISTANCE
<= bio->bi_sector) {
if (conf->start_next_window == MaxSector)
conf->start_next_window =
conf->next_resync +
NEXT_NORMALIO_DISTANCE;
if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
<= bio->bi_sector)
conf->next_window_requests++;
else
conf->current_window_requests++;
}
if (bio->bi_sector >= conf->start_next_window)
sector = conf->start_next_window;
}
conf->nr_pending++; conf->nr_pending++;
spin_unlock_irq(&conf->resync_lock); spin_unlock_irq(&conf->resync_lock);
return sector;
} }
static void allow_barrier(struct r1conf *conf) static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
sector_t bi_sector)
{ {
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&conf->resync_lock, flags); spin_lock_irqsave(&conf->resync_lock, flags);
conf->nr_pending--; conf->nr_pending--;
if (start_next_window) {
if (start_next_window == conf->start_next_window) {
if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
<= bi_sector)
conf->next_window_requests--;
else
conf->current_window_requests--;
} else
conf->current_window_requests--;
if (!conf->current_window_requests) {
if (conf->next_window_requests) {
conf->current_window_requests =
conf->next_window_requests;
conf->next_window_requests = 0;
conf->start_next_window +=
NEXT_NORMALIO_DISTANCE;
} else
conf->start_next_window = MaxSector;
}
}
spin_unlock_irqrestore(&conf->resync_lock, flags); spin_unlock_irqrestore(&conf->resync_lock, flags);
wake_up(&conf->wait_barrier); wake_up(&conf->wait_barrier);
} }
...@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra) ...@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
{ {
/* stop syncio and normal IO and wait for everything to /* stop syncio and normal IO and wait for everything to
* go quiet. * go quiet.
* We increment barrier and nr_waiting, and then * We wait until nr_pending match nr_queued+extra
* wait until nr_pending match nr_queued+extra
* This is called in the context of one normal IO request * This is called in the context of one normal IO request
* that has failed. Thus any sync request that might be pending * that has failed. Thus any sync request that might be pending
* will be blocked by nr_pending, and we need to wait for * will be blocked by nr_pending, and we need to wait for
...@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra) ...@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
* we continue. * we continue.
*/ */
spin_lock_irq(&conf->resync_lock); spin_lock_irq(&conf->resync_lock);
conf->barrier++; conf->array_frozen = 1;
conf->nr_waiting++;
wait_event_lock_irq_cmd(conf->wait_barrier, wait_event_lock_irq_cmd(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+extra, conf->nr_pending == conf->nr_queued+extra,
conf->resync_lock, conf->resync_lock,
...@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf) ...@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
{ {
/* reverse the effect of the freeze */ /* reverse the effect of the freeze */
spin_lock_irq(&conf->resync_lock); spin_lock_irq(&conf->resync_lock);
conf->barrier--; conf->array_frozen = 0;
conf->nr_waiting--;
wake_up(&conf->wait_barrier); wake_up(&conf->wait_barrier);
spin_unlock_irq(&conf->resync_lock); spin_unlock_irq(&conf->resync_lock);
} }
...@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
int first_clone; int first_clone;
int sectors_handled; int sectors_handled;
int max_sectors; int max_sectors;
sector_t start_next_window;
/* /*
* Register the new request and wait if the reconstruction * Register the new request and wait if the reconstruction
...@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
finish_wait(&conf->wait_barrier, &w); finish_wait(&conf->wait_barrier, &w);
} }
wait_barrier(conf); start_next_window = wait_barrier(conf, bio);
bitmap = mddev->bitmap; bitmap = mddev->bitmap;
...@@ -1163,6 +1247,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1163,6 +1247,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
disks = conf->raid_disks * 2; disks = conf->raid_disks * 2;
retry_write: retry_write:
r1_bio->start_next_window = start_next_window;
blocked_rdev = NULL; blocked_rdev = NULL;
rcu_read_lock(); rcu_read_lock();
max_sectors = r1_bio->sectors; max_sectors = r1_bio->sectors;
...@@ -1231,14 +1316,24 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1231,14 +1316,24 @@ static void make_request(struct mddev *mddev, struct bio * bio)
if (unlikely(blocked_rdev)) { if (unlikely(blocked_rdev)) {
/* Wait for this device to become unblocked */ /* Wait for this device to become unblocked */
int j; int j;
sector_t old = start_next_window;
for (j = 0; j < i; j++) for (j = 0; j < i; j++)
if (r1_bio->bios[j]) if (r1_bio->bios[j])
rdev_dec_pending(conf->mirrors[j].rdev, mddev); rdev_dec_pending(conf->mirrors[j].rdev, mddev);
r1_bio->state = 0; r1_bio->state = 0;
allow_barrier(conf); allow_barrier(conf, start_next_window, bio->bi_sector);
md_wait_for_blocked_rdev(blocked_rdev, mddev); md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf); start_next_window = wait_barrier(conf, bio);
/*
* We must make sure the multi r1bios of bio have
* the same value of bi_phys_segments
*/
if (bio->bi_phys_segments && old &&
old != start_next_window)
/* Wait for the former r1bio(s) to complete */
wait_event(conf->wait_barrier,
bio->bi_phys_segments == 1);
goto retry_write; goto retry_write;
} }
...@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf) ...@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
static void close_sync(struct r1conf *conf) static void close_sync(struct r1conf *conf)
{ {
wait_barrier(conf); wait_barrier(conf, NULL);
allow_barrier(conf); allow_barrier(conf, 0, 0);
mempool_destroy(conf->r1buf_pool); mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL; conf->r1buf_pool = NULL;
conf->next_resync = 0;
conf->start_next_window = MaxSector;
} }
static int raid1_spare_active(struct mddev *mddev) static int raid1_spare_active(struct mddev *mddev)
...@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->pending_count = 0; conf->pending_count = 0;
conf->recovery_disabled = mddev->recovery_disabled - 1; conf->recovery_disabled = mddev->recovery_disabled - 1;
conf->start_next_window = MaxSector;
conf->current_window_requests = conf->next_window_requests = 0;
err = -EIO; err = -EIO;
for (i = 0; i < conf->raid_disks * 2; i++) { for (i = 0; i < conf->raid_disks * 2; i++) {
...@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev) ...@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
atomic_read(&bitmap->behind_writes) == 0); atomic_read(&bitmap->behind_writes) == 0);
} }
raise_barrier(conf); freeze_array(conf, 0);
lower_barrier(conf); unfreeze_array(conf);
md_unregister_thread(&mddev->thread); md_unregister_thread(&mddev->thread);
if (conf->r1bio_pool) if (conf->r1bio_pool)
...@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state) ...@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
wake_up(&conf->wait_barrier); wake_up(&conf->wait_barrier);
break; break;
case 1: case 1:
raise_barrier(conf); freeze_array(conf, 0);
break; break;
case 0: case 0:
lower_barrier(conf); unfreeze_array(conf);
break; break;
} }
} }
...@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev) ...@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
mddev->new_chunk_sectors = 0; mddev->new_chunk_sectors = 0;
conf = setup_conf(mddev); conf = setup_conf(mddev);
if (!IS_ERR(conf)) if (!IS_ERR(conf))
conf->barrier = 1; /* Array must appear to be quiesced */
conf->array_frozen = 1;
return conf; return conf;
} }
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
......
...@@ -41,6 +41,19 @@ struct r1conf { ...@@ -41,6 +41,19 @@ struct r1conf {
*/ */
sector_t next_resync; sector_t next_resync;
/* When raid1 starts resync, we divide the array into four partitions:
 * |---------|--------------|---------------------|-------------|
 *        next_resync   start_next_window       end_window
 * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
 * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
 * current_window_requests is the count of normal IO between
 *   start_next_window and end_window.
 * next_window_requests is the count of normal IO after end_window.
 */
sector_t start_next_window;
int current_window_requests;
int next_window_requests;
spinlock_t device_lock; spinlock_t device_lock;
/* list of 'struct r1bio' that need to be processed by raid1d, /* list of 'struct r1bio' that need to be processed by raid1d,
...@@ -65,6 +78,7 @@ struct r1conf { ...@@ -65,6 +78,7 @@ struct r1conf {
int nr_waiting; int nr_waiting;
int nr_queued; int nr_queued;
int barrier; int barrier;
int array_frozen;
/* Set to 1 if a full sync is needed, (fresh device added). /* Set to 1 if a full sync is needed, (fresh device added).
* Cleared when a sync completes. * Cleared when a sync completes.
...@@ -111,6 +125,7 @@ struct r1bio { ...@@ -111,6 +125,7 @@ struct r1bio {
* in this BehindIO request * in this BehindIO request
*/ */
sector_t sector; sector_t sector;
sector_t start_next_window;
int sectors; int sectors;
unsigned long state; unsigned long state;
struct mddev *mddev; struct mddev *mddev;
......
...@@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, ...@@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait, mddev->flags == 0 || wait_event(mddev->sb_wait, mddev->flags == 0 ||
kthread_should_stop()); test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
allow_barrier(conf);
return sectors_done;
}
conf->reshape_safe = mddev->reshape_position; conf->reshape_safe = mddev->reshape_position;
allow_barrier(conf); allow_barrier(conf);
} }
......
此差异已折叠。
...@@ -205,6 +205,7 @@ struct stripe_head { ...@@ -205,6 +205,7 @@ struct stripe_head {
short pd_idx; /* parity disk index */ short pd_idx; /* parity disk index */
short qd_idx; /* 'Q' disk index for raid6 */ short qd_idx; /* 'Q' disk index for raid6 */
short ddf_layout;/* use DDF ordering to calculate Q */ short ddf_layout;/* use DDF ordering to calculate Q */
short hash_lock_index;
unsigned long state; /* state flags */ unsigned long state; /* state flags */
atomic_t count; /* nr of active thread/requests */ atomic_t count; /* nr of active thread/requests */
int bm_seq; /* sequence number for bitmap flushes */ int bm_seq; /* sequence number for bitmap flushes */
...@@ -367,9 +368,18 @@ struct disk_info { ...@@ -367,9 +368,18 @@ struct disk_info {
struct md_rdev *rdev, *replacement; struct md_rdev *rdev, *replacement;
}; };
/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
* This is because we sometimes take all the spinlocks
* and creating that much locking depth can cause
* problems.
*/
#define NR_STRIPE_HASH_LOCKS 8
#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
struct r5worker { struct r5worker {
struct work_struct work; struct work_struct work;
struct r5worker_group *group; struct r5worker_group *group;
struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
bool working; bool working;
}; };
...@@ -382,6 +392,8 @@ struct r5worker_group { ...@@ -382,6 +392,8 @@ struct r5worker_group {
struct r5conf { struct r5conf {
struct hlist_head *stripe_hashtbl; struct hlist_head *stripe_hashtbl;
/* only protect corresponding hash list and inactive_list */
spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
struct mddev *mddev; struct mddev *mddev;
int chunk_sectors; int chunk_sectors;
int level, algorithm; int level, algorithm;
...@@ -462,7 +474,8 @@ struct r5conf { ...@@ -462,7 +474,8 @@ struct r5conf {
* Free stripes pool * Free stripes pool
*/ */
atomic_t active_stripes; atomic_t active_stripes;
struct list_head inactive_list; struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
atomic_t empty_inactive_list_nr;
struct llist_head released_stripes; struct llist_head released_stripes;
wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_stripe;
wait_queue_head_t wait_for_overlap; wait_queue_head_t wait_for_overlap;
...@@ -477,6 +490,7 @@ struct r5conf { ...@@ -477,6 +490,7 @@ struct r5conf {
* the new thread here until we fully activate the array. * the new thread here until we fully activate the array.
*/ */
struct md_thread *thread; struct md_thread *thread;
struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
struct r5worker_group *worker_groups; struct r5worker_group *worker_groups;
int group_cnt; int group_cnt;
int worker_cnt_per_group; int worker_cnt_per_group;
......
...@@ -278,6 +278,31 @@ do { \ ...@@ -278,6 +278,31 @@ do { \
__ret; \ __ret; \
}) })
#define __wait_event_cmd(wq, condition, cmd1, cmd2) \
(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
cmd1; schedule(); cmd2)
/**
* wait_event_cmd - sleep until a condition gets true
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @cmd1: the command to be executed before sleep
* @cmd2: the command to be executed after sleep
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
* the waitqueue @wq is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*/
#define wait_event_cmd(wq, condition, cmd1, cmd2) \
do { \
if (condition) \
break; \
__wait_event_cmd(wq, condition, cmd1, cmd2); \
} while (0)
#define __wait_event_interruptible(wq, condition) \ #define __wait_event_interruptible(wq, condition) \
___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
schedule()) schedule())
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#define _MD_P_H #define _MD_P_H
#include <linux/types.h> #include <linux/types.h>
#include <asm/byteorder.h>
/* /*
* RAID superblock. * RAID superblock.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册