diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 29fc06b47d4e07b4bdd64c025eae6180f9a1f5ca..d247429ee5efe44b449b984f435a2ba3eafab927 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1050,7 +1050,7 @@ static void compute_parity5(struct stripe_head *sh, int method) static void compute_parity6(struct stripe_head *sh, int method) { raid6_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; + int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ void *ptrs[disks]; @@ -1131,8 +1131,7 @@ static void compute_parity6(struct stripe_head *sh, int method) /* Compute one missing block */ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { - raid6_conf_t *conf = sh->raid_conf; - int i, count, disks = conf->raid_disks; + int i, count, disks = sh->disks; void *ptr[MAX_XOR_BLOCKS], *p; int pd_idx = sh->pd_idx; int qd_idx = raid6_next_disk(pd_idx, disks); @@ -1170,8 +1169,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) /* Compute two missing blocks */ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) { - raid6_conf_t *conf = sh->raid_conf; - int i, count, disks = conf->raid_disks; + int i, count, disks = sh->disks; int pd_idx = sh->pd_idx; int qd_idx = raid6_next_disk(pd_idx, disks); int d0_idx = raid6_next_disk(qd_idx, disks); @@ -1887,11 +1885,11 @@ static void handle_stripe5(struct stripe_head *sh) static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { raid6_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks; + int disks = sh->disks; struct bio *return_bi= NULL; struct bio *bi; int i; - int syncing; + int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; int non_overwrite = 0; int failed_num[2] = {0, 0}; @@ -1909,6 +1907,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) clear_bit(STRIPE_DELAYED, &sh->state); syncing = test_bit(STRIPE_SYNCING, &sh->state); + expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -2114,13 +2114,15 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) * parity, or to satisfy requests * or to load a block that is being partially written. */ - if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) { + if (to_read || non_overwrite || (to_write && failed) || + (syncing && (uptodate < disks)) || expanding) { for (i=disks; i--;) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || syncing || + expanding || (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) ) @@ -2355,6 +2357,79 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } } } + + if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Need to write out all blocks after computing P&Q */ + sh->disks = conf->raid_disks; + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, + conf->raid_disks); + compute_parity6(sh, RECONSTRUCT_WRITE); + for (i = conf->raid_disks ; i-- ; ) { + set_bit(R5_LOCKED, &sh->dev[i].flags); + locked++; + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + clear_bit(STRIPE_EXPANDING, &sh->state); + } else if (expanded) { + clear_bit(STRIPE_EXPAND_READY, &sh->state); + atomic_dec(&conf->reshape_stripes); + wake_up(&conf->wait_for_overlap); + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + } + + if (expanding && locked == 0) { + /* We have read all the blocks in this stripe and now we need to + * copy some of them into a target stripe for expand. + */ + clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); + for (i = 0; i < sh->disks ; i++) + if (i != pd_idx && i != qd_idx) { + int dd_idx2, pd_idx2, j; + struct stripe_head *sh2; + + sector_t bn = compute_blocknr(sh, i); + sector_t s = raid5_compute_sector( + bn, conf->raid_disks, + conf->raid_disks - conf->max_degraded, + &dd_idx2, &pd_idx2, conf); + sh2 = get_active_stripe(conf, s, + conf->raid_disks, + pd_idx2, 1); + if (sh2 == NULL) + /* so for only the early blocks of + * this stripe have been requests. + * When later blocks get requests, we + * will try again + */ + continue; + if (!test_bit(STRIPE_EXPANDING, &sh2->state) || + test_bit(R5_Expanded, + &sh2->dev[dd_idx2].flags)) { + /* must have already done this block */ + release_stripe(sh2); + continue; + } + memcpy(page_address(sh2->dev[dd_idx2].page), + page_address(sh->dev[i].page), + STRIPE_SIZE); + set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags); + set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags); + for (j = 0 ; j < conf->raid_disks ; j++) + if (j != sh2->pd_idx && + j != raid6_next_disk(sh2->pd_idx, + sh2->disks) && + !test_bit(R5_Expanded, + &sh2->dev[j].flags)) + break; + if (j == conf->raid_disks) { + set_bit(STRIPE_EXPAND_READY, + &sh2->state); + set_bit(STRIPE_HANDLE, &sh2->state); + } + release_stripe(sh2); + } + } + spin_unlock(&sh->lock); while ((bi=return_bi)) { @@ -2395,7 +2470,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) rcu_read_unlock(); if (rdev) { - if (syncing) + if (syncing || expanding || expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; @@ -2915,8 +2990,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped struct stripe_head *sh; int pd_idx; sector_t first_sector, last_sector; - int raid_disks; - int data_disks; + int raid_disks = conf->previous_raid_disks; + int data_disks = raid_disks - conf->max_degraded; + int new_data_disks = conf->raid_disks - conf->max_degraded; int i; int dd_idx; sector_t writepos, safepos, gap; @@ -2925,7 +3001,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped conf->expand_progress != 0) { /* restarting in the middle, skip the initial sectors */ sector_nr = conf->expand_progress; - sector_div(sector_nr, conf->raid_disks-1); + sector_div(sector_nr, new_data_disks); *skipped = 1; return sector_nr; } @@ -2939,14 +3015,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * to after where expand_lo old_maps to */ writepos = conf->expand_progress + - conf->chunk_size/512*(conf->raid_disks-1); - sector_div(writepos, conf->raid_disks-1); + conf->chunk_size/512*(new_data_disks); + sector_div(writepos, new_data_disks); safepos = conf->expand_lo; - sector_div(safepos, conf->previous_raid_disks-1); + sector_div(safepos, data_disks); gap = conf->expand_progress - conf->expand_lo; if (writepos >= safepos || - gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) { + gap > (new_data_disks)*3000*2 /*3Meg*/) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); @@ -2976,6 +3052,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped sector_t s; if (j == sh->pd_idx) continue; + if (conf->level == 6 && + j == raid6_next_disk(sh->pd_idx, sh->disks)) + continue; s = compute_blocknr(sh, j); if (s < (mddev->array_size<<1)) { skipped = 1; @@ -2999,21 +3078,20 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * The source stripes are determined by mapping the first and last * block on the destination stripes. */ - raid_disks = conf->previous_raid_disks; - data_disks = raid_disks - 1; first_sector = - raid5_compute_sector(sector_nr*(conf->raid_disks-1), + raid5_compute_sector(sector_nr*(new_data_disks), raid_disks, data_disks, &dd_idx, &pd_idx, conf); last_sector = raid5_compute_sector((sector_nr+conf->chunk_size/512) - *(conf->raid_disks-1) -1, + *(new_data_disks) -1, raid_disks, data_disks, &dd_idx, &pd_idx, conf); if (last_sector >= (mddev->size<<1)) last_sector = (mddev->size<<1)-1; while (first_sector <= last_sector) { - pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); + pd_idx = stripe_to_pdidx(first_sector, conf, + conf->previous_raid_disks); sh = get_active_stripe(conf, first_sector, conf->previous_raid_disks, pd_idx, 0); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); @@ -3348,35 +3426,44 @@ static int run(mddev_t *mddev) */ sector_t here_new, here_old; int old_disks; + int max_degraded = (mddev->level == 5 ? 1 : 2); if (mddev->new_level != mddev->level || mddev->new_layout != mddev->layout || mddev->new_chunk != mddev->chunk_size) { - printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n", + printk(KERN_ERR "raid5: %s: unsupported reshape " + "required - aborting.\n", mdname(mddev)); return -EINVAL; } if (mddev->delta_disks <= 0) { - printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n", + printk(KERN_ERR "raid5: %s: unsupported reshape " + "(reduce disks) required - aborting.\n", mdname(mddev)); return -EINVAL; } old_disks = mddev->raid_disks - mddev->delta_disks; /* reshape_position must be on a new-stripe boundary, and one - * further up in new geometry must map after here in old geometry. + * further up in new geometry must map after here in old + * geometry. */ here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) { - printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n"); + if (sector_div(here_new, (mddev->chunk_size>>9)* + (mddev->raid_disks - max_degraded))) { + printk(KERN_ERR "raid5: reshape_position not " + "on a stripe boundary\n"); return -EINVAL; } /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1)); - /* here_old is the first stripe that we might need to read from */ + sector_div(here_old, (mddev->chunk_size>>9)* + (old_disks-max_degraded)); + /* here_old is the first stripe that we might need to read + * from */ if (here_new >= here_old) { /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n"); + printk(KERN_ERR "raid5: reshape_position too early for " + "auto-recovery - aborting.\n"); return -EINVAL; } printk(KERN_INFO "raid5: reshape will continue\n"); @@ -3829,8 +3916,7 @@ static int raid5_start_reshape(mddev_t *mddev) int added_devices = 0; unsigned long flags; - if (mddev->degraded || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; ITERATE_RDEV(mddev, rdev, rtmp) @@ -3838,7 +3924,7 @@ static int raid5_start_reshape(mddev_t *mddev) !test_bit(Faulty, &rdev->flags)) spares++; - if (spares < mddev->delta_disks-1) + if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) /* Not enough devices even to make a degraded array * of that size */ @@ -3901,7 +3987,8 @@ static void end_reshape(raid5_conf_t *conf) struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1); + conf->mddev->array_size = conf->mddev->size * + (conf->raid_disks - conf->max_degraded); set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); conf->mddev->changed = 1; @@ -3974,6 +4061,10 @@ static struct mdk_personality raid6_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, +#ifdef CONFIG_MD_RAID5_RESHAPE + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, +#endif .quiesce = raid5_quiesce, }; static struct mdk_personality raid5_personality =