diff --git a/Documentation/md.txt b/Documentation/md.txt
index 1da9d1b1793f3436b3561a48de7fbcd8c893e74e..4edd39ec7db91abcbcbac352fc3c5e6d3e3c3eca 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -164,15 +164,19 @@ All md devices contain:
   raid_disks
      a text file with a simple number indicating the number of devices
      in a fully functional array.  If this is not yet known, the file
-     will be empty.  If an array is being resized (not currently
-     possible) this will contain the larger of the old and new sizes.
-     Some raid level (RAID1) allow this value to be set while the
-     array is active.  This will reconfigure the array.   Otherwise
-     it can only be set while assembling an array.
+     will be empty.  If an array is being resized, this will contain
+     the new number of devices.
+     Some raid levels allow this value to be set while the array is
+     active.  This will reconfigure the array.  Otherwise it can only
+     be set while assembling an array.
+     A change to this attribute will not be permitted if it would
+     reduce the size of the array.  To reduce the number of drives
+     in e.g. a raid5, the array size must first be reduced by
+     setting the 'array_size' attribute.
 
   chunk_size
-     This is the size if bytes for 'chunks' and is only relevant to
-     raid levels that involve striping (1,4,5,6,10). The address space
+     This is the size in bytes for 'chunks' and is only relevant to
+     raid levels that involve striping (0,4,5,6,10). The address space
      of the array is conceptually divided into chunks and consecutive
      chunks are striped onto neighbouring devices.
      The size should be at least PAGE_SIZE (4k) and should be a power
@@ -183,6 +187,20 @@ All md devices contain:
      simply a number that is interpretted differently by different
      levels.  It can be written while assembling an array.
 
+  array_size
+     This can be used to artificially constrain the available space in
+     the array to be less than is actually available on the combined
+     devices.  Writing a number (in kilobytes) which is less than
+     the available size will set the size.  Any reconfiguration of the
+     array (e.g. adding devices) will not cause the size to change.
+     Writing the word 'default' will cause the effective size of the
+     array to be whatever size is actually available based on
+     'level', 'chunk_size' and 'component_size'.
+
+     This can be used to reduce the size of the array before reducing
+     the number of devices in a raid4/5/6, or to support external
+     metadata formats which mandate such clipping.
+
   reshape_position
      This is either "none" or a sector number within the devices of
      the array where "reshape" is up to.  If this is set, the three
@@ -207,6 +225,11 @@ All md devices contain:
      about the array.  It can be 0.90 (traditional format), 1.0, 1.1,
      1.2 (newer format in varying locations) or "none" indicating that
      the kernel isn't managing metadata at all.
+     Alternatively, it can be "external:" followed by a string which
+     is set by user-space.  This indicates that metadata is managed
+     by a user-space program.  Any device failure or other event that
+     requires a metadata update will cause array activity to be
+     suspended until the event is acknowledged.
 
   resync_start
      The point at which resync should start.  If no resync is needed,
diff --git a/crypto/xor.c b/crypto/xor.c
index b2e6db075e4993eac4fcffb9004b82ee4a0bca71..996b6ee57d9e3fb3ff48251ea9ee01d405b648ad 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -18,8 +18,8 @@
 #define BH_TRACE 0
 #include
-#include
 #include
+#include
 #include
 
 /* The xor routines to use.
*/ diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2281b5098e95455cd1a6d60af7354f7728888185..36e0675be9f72fe5793687c85c3ed0d7db67efae 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -121,6 +121,7 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD + select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR ---help--- @@ -151,34 +152,8 @@ config MD_RAID456 If unsure, say Y. -config MD_RAID5_RESHAPE - bool "Support adding drives to a raid-5 array" - depends on MD_RAID456 - default y - ---help--- - A RAID-5 set can be expanded by adding extra drives. This - requires "restriping" the array which means (almost) every - block must be written to a different place. - - This option allows such restriping to be done while the array - is online. - - You will need mdadm version 2.4.1 or later to use this - feature safely. During the early stage of reshape there is - a critical section where live data is being over-written. A - crash during this time needs extra care for recovery. The - newer mdadm takes a copy of the data in the critical section - and will restore it, if necessary, after a crash. - - The mdadm usage is e.g. - mdadm --grow /dev/md1 --raid-disks=6 - to grow '/dev/md1' to having 6 disks. - - Note: The array can only be expanded, not contracted. - There should be enough spares already present to make the new - array workable. - - If unsure, say Y. +config MD_RAID6_PQ + tristate config MD_MULTIPATH tristate "Multipath I/O support" diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 72880b7e28d9c464655c471a1d4bb85d1df0fab5..45cc5951d9287030df73363a20659482aa55414d 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -2,20 +2,21 @@ # Makefile for the kernel software RAID and LVM drivers. # -dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ +dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o -dm-multipath-objs := dm-path-selector.o dm-mpath.o -dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \ +dm-multipath-y += dm-path-selector.o dm-mpath.o +dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o -dm-mirror-objs := dm-raid1.o -md-mod-objs := md.o bitmap.o -raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ +dm-mirror-y += dm-raid1.o +md-mod-y += md.o bitmap.o +raid456-y += raid5.o +raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6altivec1.o raid6altivec2.o raid6altivec4.o \ raid6altivec8.o \ raid6mmx.o raid6sse1.o raid6sse2.o -hostprogs-y := mktables +hostprogs-y += mktables # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise @@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o +obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 719943763391263c7a4ab98ad0201244ce1d3d62..f8a9f7ab2cb8ac71884b87fb512a281db3c86789 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -16,6 +16,7 @@ * wait if count gets too high, wake when it drops to half. 
*/ +#include #include #include #include @@ -26,8 +27,8 @@ #include #include #include -#include -#include +#include "md.h" +#include "bitmap.h" /* debug macros */ @@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat unsigned char *mappage; if (page >= bitmap->pages) { - printk(KERN_ALERT - "%s: invalid bitmap page request: %lu (> %lu)\n", - bmname(bitmap), page, bitmap->pages-1); + /* This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page. + * It is harmless. + */ return -EINVAL; } @@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) list_for_each_continue_rcu(pos, &mddev->disks) { rdev = list_entry(pos, mdk_rdev_t, same_set); if (rdev->raid_disk >= 0 && - test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ atomic_inc(&rdev->nr_pending); @@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) + size/512 > 0) /* bitmap runs in to metadata */ goto bad_alignment; - if (rdev->data_offset + mddev->size*2 + if (rdev->data_offset + mddev->dev_sectors > rdev->sb_start + bitmap->offset) /* data runs in to bitmap */ goto bad_alignment; @@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) reason = "unrecognized superblock version"; - else if (chunksize < PAGE_SIZE) + else if (chunksize < 512) reason = "bitmap chunksize too small"; else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; @@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); } + if (bitmap->mddev->degraded) + /* Never clear bits or update events_cleared when degraded */ + success = 0; while (sectors) { int blocks; @@ -1345,8 +1349,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto } } -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, - int degraded) +static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, + int degraded) { bitmap_counter_t *bmc; int rv; @@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, return rv; } +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, + int degraded) +{ + /* bitmap_start_sync must always report on multiples of whole + * pages, otherwise resync (which is very PAGE_SIZE based) will + * get confused. + * So call __bitmap_start_sync repeatedly (if needed) until + * At least PAGE_SIZE>>9 blocks are covered. + * Return the 'or' of the result. 
+ */ + int rv = 0; + int blocks1; + + *blocks = 0; + while (*blocks < (PAGE_SIZE>>9)) { + rv |= __bitmap_start_sync(bitmap, offset, + &blocks1, degraded); + offset += blocks1; + *blocks += blocks1; + } + return rv; +} + void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) { bitmap_counter_t *bmc; @@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); + bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; + set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { diff --git a/include/linux/raid/bitmap.h b/drivers/md/bitmap.h similarity index 100% rename from include/linux/raid/bitmap.h rename to drivers/md/bitmap.h diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 86d9adf90e791857efdf674117168398b40735a1..8695809b24b05f049c13a3dc667bf2a02d9b4ba6 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -62,7 +62,10 @@ #define ModeShift 5 #define MaxFault 50 -#include +#include +#include +#include "md.h" +#include static void faulty_fail(struct bio *bio, int error) @@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) return 0; } +static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(raid_disks, + "%s does not support generic reshape\n", __func__); + + if (sectors == 0) + return mddev->dev_sectors; + + return sectors; +} + static int run(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -298,7 +312,7 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) conf->rdev = rdev; - mddev->array_sectors = mddev->size * 2; + md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; reconfig(mddev, mddev->layout, -1); @@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality = .stop = stop, .status = status, .reconfig = reconfig, + .size = faulty_size, }; static int __init raid_init(void) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09658b218474a3a8f676995f02e59fdfa10693b0..7a36e38393a1e9ff24defd03c756138a9f5da903 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,7 +16,11 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include +#include +#include +#include +#include "md.h" +#include "linear.h" /* * find which device holds a particular offset @@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits) return ret; } +static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return conf->array_sectors; +} + static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; @@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) mddev->queue->max_sectors > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - disk->num_sectors = rdev->size * 2; - conf->array_sectors += rdev->size * 2; + disk->num_sectors = rdev->sectors; + conf->array_sectors += rdev->sectors; cnt++; } @@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev) if (!conf) return 1; mddev->private = conf; - mddev->array_sectors = conf->array_sectors; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) newconf->prev = mddev_to_conf(mddev); mddev->private = newconf; mddev->raid_disks++; - mddev->array_sectors = newconf->array_sectors; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); return 0; } @@ -381,6 +395,7 @@ static struct mdk_personality linear_personality = .stop = linear_stop, .status = linear_status, .hot_add_disk = linear_add, + .size = linear_size, }; static int __init linear_init (void) diff --git a/include/linux/raid/linear.h b/drivers/md/linear.h similarity index 95% rename from include/linux/raid/linear.h rename to drivers/md/linear.h index f38b9c586afbc38c2a3187ac479735b14ad5fffb..bf8179587f950b2ace4eba54a73abaf4960463ab 100644 --- a/include/linux/raid/linear.h +++ b/drivers/md/linear.h @@ -1,8 +1,6 @@ #ifndef _LINEAR_H #define _LINEAR_H -#include - struct dev_info { mdk_rdev_t *rdev; sector_t num_sectors; diff --git a/drivers/md/md.c b/drivers/md/md.c index a307f87eb90ee361ea6c36b2cbdd94c8886dfdec..ed5727c089a9403b2ad10bb7e581beae5b0d36ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,9 +33,9 @@ */ #include -#include -#include +#include #include +#include #include /* for invalidate_bdev */ #include #include @@ -45,11 +45,10 @@ #include #include #include - -#define MAJOR_NR MD_MAJOR - -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 +#include +#include +#include "md.h" +#include "bitmap.h" #define DEBUG 0 #define dprintk(x...) ((void)(DEBUG && printk(x))) @@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock); ) -static int md_fail_request(struct request_queue *q, struct bio *bio) +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. 
+ */ +static int md_make_request(struct request_queue *q, struct bio *bio) { - bio_io_error(bio); - return 0; + mddev_t *mddev = q->queuedata; + int rv; + if (mddev == NULL || mddev->pers == NULL) { + bio_io_error(bio); + return 0; + } + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + rv = mddev->pers->make_request(q, bio); + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); + + return rv; +} + +static void mddev_suspend(mddev_t *mddev) +{ + BUG_ON(mddev->suspended); + mddev->suspended = 1; + synchronize_rcu(); + wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + mddev->pers->quiesce(mddev, 1); + md_unregister_thread(mddev->thread); + mddev->thread = NULL; + /* we now know that no code is executing in the personality module, + * except possibly the tail end of a ->bi_end_io function, but that + * is certain to complete before the module has a chance to get + * unloaded + */ +} + +static void mddev_resume(mddev_t *mddev) +{ + mddev->suspended = 0; + wake_up(&mddev->sb_wait); + mddev->pers->quiesce(mddev, 0); } + static inline mddev_t *mddev_get(mddev_t *mddev) { atomic_inc(&mddev->active); @@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit) init_timer(&new->safemode_timer); atomic_set(&new->active, 1); atomic_set(&new->openers, 0); + atomic_set(&new->active_io, 0); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); init_waitqueue_head(&new->recovery_wait); @@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev) return mutex_lock_interruptible(&mddev->reconfig_mutex); } +static inline int mddev_is_locked(mddev_t *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); +} + static inline int mddev_trylock(mddev_t * mddev) { return mutex_trylock(&mddev->reconfig_mutex); @@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev) rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_start = 0; - rdev->size = 0; + rdev->sectors = 0; } } @@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); - if (rdev->size < sb->size && sb->level > 1) + if (rdev->sectors < sb->size * 2 && sb->level > 1) /* "this cannot possibly happen" ... 
*/ ret = -EINVAL; @@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; - mddev->size = sb->size; + mddev->dev_sectors = sb->size * 2; mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = MD_SB_BYTES >> 9; @@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->ctime = mddev->ctime; sb->level = mddev->level; - sb->size = mddev->size; + sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; @@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) static unsigned long long super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_offset) return 0; /* can't move bitmap */ @@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) ret = 0; } if (minor_version) - rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; + rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - + le64_to_cpu(sb->data_offset); else - rdev->size = rdev->sb_start / 2; - if (rdev->size < le64_to_cpu(sb->data_size)/2) + rdev->sectors = rdev->sb_start; + if (rdev->sectors < le64_to_cpu(sb->data_size)) return -EINVAL; - rdev->size = le64_to_cpu(sb->data_size)/2; + rdev->sectors = le64_to_cpu(sb->data_size); if (le32_to_cpu(sb->chunksize)) - rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); + rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); - if (le64_to_cpu(sb->size) > rdev->size*2) + if (le64_to_cpu(sb->size) > rdev->sectors) return -EINVAL; return ret; } @@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); - mddev->size = le64_to_cpu(sb->size)/2; + mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = 1024 >> 9; @@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); - sb->size = cpu_to_le64(mddev->size<<1); + sb->size = cpu_to_le64(mddev->dev_sectors); if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); @@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset > 0) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); - sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + !test_bit(In_sync, &rdev->flags)) { + if (mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + if (rdev->recovery_offset > 0) { + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); + } } if (mddev->reshape_position != MaxSector) { @@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if 
(num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ @@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) sector_t sb_start; sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; + max_sectors = rdev->sectors + sb_start - rdev->sb_start; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; @@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); +static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) +{ + struct mdk_personality *pers = mddev->pers; + struct gendisk *disk = mddev->gendisk; + struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); + struct blk_integrity *bi_mddev = blk_get_integrity(disk); + + /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ + if (pers && pers->level >= 4 && pers->level <= 6) + return; + + /* If rdev is integrity capable, register profile for mddev */ + if (!bi_mddev && bi_rdev) { + if (blk_integrity_register(disk, bi_rdev)) + printk(KERN_ERR "%s: %s Could not register integrity!\n", + __func__, disk->disk_name); + else + printk(KERN_NOTICE "Enabling data integrity on %s\n", + disk->disk_name); + return; + } + + /* Check that mddev and rdev have matching profiles */ + if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { + printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, + disk->disk_name, rdev->bdev->bd_disk->disk_name); + printk(KERN_NOTICE "Disabling data integrity on %s\n", + disk->disk_name); + blk_integrity_unregister(disk); + } +} + static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { char b[BDEVNAME_SIZE]; @@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - /* make sure rdev->size exceeds mddev->size */ - if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { + /* make sure rdev->sectors exceeds mddev->dev_sectors */ + if (rdev->sectors && (mddev->dev_sectors == 0 || + rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (mddev->level > 0) return -ENOSPC; } else - mddev->size = rdev->size; + mddev->dev_sectors = rdev->sectors; } /* Verify rdev->desc_nr is unique. 
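A note on the unit migration visible throughout these md.c hunks: the old rdev->size and mddev->size fields counted 1K blocks, while the new rdev->sectors and mddev->dev_sectors count 512-byte sectors, so every converted site gains or drops a '* 2' or '/ 2'.  Sysfs writes still arrive in kilobytes, which is why a hunk shortly below adds a strict_blocks_to_sectors() helper that doubles the input while refusing values that would overflow.  The sketch below is a standalone userspace illustration of that guard, not kernel code; the name blocks_to_sectors and the fixed 64-bit sector_t are assumptions made for the example (the kernel helper additionally checks the narrowing to a possibly 32-bit sector_t).

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;	/* assumed 64-bit, unlike the kernel's config-dependent type */

/* Convert a count of 1K blocks into 512-byte sectors, rejecting any
 * value whose doubling would wrap.  If the top bit of 'blocks' is set,
 * blocks * 2 cannot be represented, so fail before multiplying. */
static int blocks_to_sectors(unsigned long long blocks, sector_t *sectors)
{
	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
		return -1;		/* sector conversion overflow */
	*sectors = (sector_t)(blocks * 2);
	return 0;
}

int main(void)
{
	sector_t s;

	if (blocks_to_sectors(524288ULL, &s) == 0)	/* 512 MiB worth of 1K blocks */
		printf("%llu sectors\n", (unsigned long long)s);
	if (blocks_to_sectors(~0ULL, &s) != 0)
		printf("overflow rejected\n");
	return 0;
}

Testing the top bit before multiplying lets the helper detect overflow without needing a wider intermediate type.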
@@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
+
+	md_integrity_check(rdev, mddev);
 	return 0;
 
  fail:
@@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
 static void print_rdev(mdk_rdev_t *rdev, int major_version)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
-		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
+		bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
 		test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
 	        rdev->desc_nr);
 	if (rdev->sb_loaded) {
@@ -2153,7 +2255,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
-	if (rdev->size && rdev->mddev->external)
+	if (rdev->sectors && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
 		 * can be sane */
 		return -EBUSY;
@@ -2167,7 +2269,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
 }
 
 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
@@ -2180,34 +2282,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 	return 1;
 }
 
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+	unsigned long long blocks;
+	sector_t new;
+
+	if (strict_strtoull(buf, 10, &blocks) < 0)
+		return -EINVAL;
+
+	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+		return -EINVAL; /* sector conversion overflow */
+
+	new = blocks * 2;
+	if (new != blocks * 2)
+		return -EINVAL; /* unsigned long long to sector_t overflow */
+
+	*sectors = new;
+	return 0;
+}
+
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-	unsigned long long size;
-	unsigned long long oldsize = rdev->size;
 	mddev_t *my_mddev = rdev->mddev;
+	sector_t oldsectors = rdev->sectors;
+	sector_t sectors;
 
-	if (strict_strtoull(buf, 10, &size) < 0)
+	if (strict_blocks_to_sectors(buf, &sectors) < 0)
 		return -EINVAL;
 	if (my_mddev->pers && rdev->raid_disk >= 0) {
 		if (my_mddev->persistent) {
-			size = super_types[my_mddev->major_version].
-				rdev_size_change(rdev, size * 2);
-			if (!size)
+			sectors = super_types[my_mddev->major_version].
+				rdev_size_change(rdev, sectors);
+			if (!sectors)
 				return -EBUSY;
-		} else if (!size) {
-			size = (rdev->bdev->bd_inode->i_size >> 10);
-			size -= rdev->data_offset/2;
-		}
+		} else if (!sectors)
+			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+				rdev->data_offset;
 	}
-	if (size < my_mddev->size)
+	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */
 
-	rdev->size = size;
-	if (size > oldsize && my_mddev->external) {
+	rdev->sectors = sectors;
+	if (sectors > oldsectors && my_mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
 		 * do not overlap.  We need to unlock the mddev to avoid
-		 * a deadlock.  We have already changed rdev->size, and if
+		 * a deadlock.  We have already changed rdev->sectors, and if
 		 * we have to change it back, we will have the lock again.
*/ mddev_t *mddev; @@ -2223,9 +2343,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (test_bit(AllReserved, &rdev2->flags) || (rdev->bdev == rdev2->bdev && rdev != rdev2 && - overlaps(rdev->data_offset, rdev->size * 2, + overlaps(rdev->data_offset, rdev->sectors, rdev2->data_offset, - rdev2->size * 2))) { + rdev2->sectors))) { overlap = 1; break; } @@ -2239,11 +2359,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. - * We put oldsize back because we *know* it is + * We put oldsectors back because we *know* it is * safe, and trust userspace not to race with * itself */ - rdev->size = oldsize; + rdev->sectors = oldsectors; return -EBUSY; } } @@ -2547,18 +2667,101 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { + char level[16]; ssize_t rv = len; - if (mddev->pers) + struct mdk_personality *pers; + void *priv; + + if (mddev->pers == NULL) { + if (len == 0) + return 0; + if (len >= sizeof(mddev->clevel)) + return -ENOSPC; + strncpy(mddev->clevel, buf, len); + if (mddev->clevel[len-1] == '\n') + len--; + mddev->clevel[len] = 0; + mddev->level = LEVEL_NONE; + return rv; + } + + /* request to change the personality. Need to ensure: + * - array is not engaged in resync/recovery/reshape + * - old personality can be suspended + * - new personality will access other array. + */ + + if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') + + if (!mddev->pers->quiesce) { + printk(KERN_WARNING "md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->name); + return -EINVAL; + } + + /* Now find the new personality */ + if (len == 0 || len >= sizeof(level)) + return -EINVAL; + strncpy(level, buf, len); + if (level[len-1] == '\n') len--; - mddev->clevel[len] = 0; - mddev->level = LEVEL_NONE; + level[len] = 0; + + request_module("md-%s", level); + spin_lock(&pers_lock); + pers = find_pers(LEVEL_NONE, level); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + printk(KERN_WARNING "md: personality %s not loaded\n", level); + return -EINVAL; + } + spin_unlock(&pers_lock); + + if (pers == mddev->pers) { + /* Nothing to do! */ + module_put(pers->owner); + return rv; + } + if (!pers->takeover) { + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", + mdname(mddev), level); + return -EINVAL; + } + + /* ->takeover must set new_* and/or delta_disks + * if it succeeds, and may set them when it fails. 
+ */ + priv = pers->takeover(mddev); + if (IS_ERR(priv)) { + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + mddev->raid_disks -= mddev->delta_disks; + mddev->delta_disks = 0; + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s would not accept array\n", + mdname(mddev), level); + return PTR_ERR(priv); + } + + /* Looks like we have a winner */ + mddev_suspend(mddev); + mddev->pers->stop(mddev); + module_put(mddev->pers->owner); + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_size = mddev->new_chunk; + mddev->delta_disks = 0; + pers->run(mddev); + mddev_resume(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); return rv; } @@ -2586,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - if (mddev->reshape_position != MaxSector) + if (mddev->pers) { + int err; + if (mddev->pers->reconfig == NULL) + return -EBUSY; + err = mddev->pers->reconfig(mddev, n, -1); + if (err) + return err; + } else { mddev->new_layout = n; - else - mddev->layout = n; + if (mddev->reshape_position == MaxSector) + mddev->layout = n; + } return len; } static struct md_sysfs_entry md_layout = @@ -2648,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page) static ssize_t chunk_size_store(mddev_t *mddev, const char *buf, size_t len) { - /* can only set chunk_size if array is not yet active */ char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - else if (mddev->reshape_position != MaxSector) + if (mddev->pers) { + int err; + if (mddev->pers->reconfig == NULL) + return -EBUSY; + err = mddev->pers->reconfig(mddev, -1, n); + if (err) + return err; + } else { mddev->new_chunk = n; - else - mddev->chunk_size = n; + if (mddev->reshape_position == MaxSector) + mddev->chunk_size = n; + } return len; } static struct md_sysfs_entry md_chunk_size = @@ -2669,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(mddev_t *mddev, char *page) { + if (mddev->recovery_cp == MaxSector) + return sprintf(page, "none\n"); return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); } @@ -2766,7 +2982,7 @@ array_state_show(mddev_t *mddev, char *page) else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && - mddev->size == 0) + mddev->dev_sectors == 0) st = clear; else st = inactive; @@ -2973,7 +3189,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t size_show(mddev_t *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->size); + return sprintf(page, "%llu\n", + (unsigned long long)mddev->dev_sectors / 2); } static int update_size(mddev_t *mddev, sector_t num_sectors); @@ -2985,20 +3202,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len) * not increase it (except from 0). 
	 * If array is active, we can try an on-line resize
	 */
-	char *e;
-	int err = 0;
-	unsigned long long size = simple_strtoull(buf, &e, 10);
-	if (!*buf || *buf == '\n' ||
-	    (*e && *e != '\n'))
-		return -EINVAL;
+	sector_t sectors;
+	int err = strict_blocks_to_sectors(buf, &sectors);
+	if (err < 0)
+		return err;
 	if (mddev->pers) {
-		err = update_size(mddev, size * 2);
+		err = update_size(mddev, sectors);
 		md_update_sb(mddev, 1);
 	} else {
-		if (mddev->size == 0 ||
-		    mddev->size > size)
-			mddev->size = size;
+		if (mddev->dev_sectors == 0 ||
+		    mddev->dev_sectors > sectors)
+			mddev->dev_sectors = sectors;
 		else
 			err = -ENOSPC;
 	}
@@ -3251,6 +3466,8 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
 	unsigned long resync, dt, db;
+	if (mddev->curr_resync == 0)
+		return sprintf(page, "none\n");
 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
 	dt = (jiffies - mddev->resync_mark) / HZ;
 	if (!dt) dt++;
@@ -3263,15 +3480,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
 {
-	unsigned long max_blocks, resync;
+	unsigned long max_sectors, resync;
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-		max_blocks = mddev->resync_max_sectors;
+		max_sectors = mddev->resync_max_sectors;
 	else
-		max_blocks = mddev->size << 1;
+		max_sectors = mddev->dev_sectors;
 
 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
-	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+	return sprintf(page, "%lu / %lu\n", resync, max_sectors);
 }
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -3431,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position =
 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
        reshape_position_store);
 
+static ssize_t
+array_size_show(mddev_t *mddev, char *page)
+{
+	if (mddev->external_size)
+		return sprintf(page, "%llu\n",
+			       (unsigned long long)mddev->array_sectors/2);
+	else
+		return sprintf(page, "default\n");
+}
+
+static ssize_t
+array_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	sector_t sectors;
+
+	if (strncmp(buf, "default", 7) == 0) {
+		if (mddev->pers)
+			sectors = mddev->pers->size(mddev, 0, 0);
+		else
+			sectors = mddev->array_sectors;
+
+		mddev->external_size = 0;
+	} else {
+		if (strict_blocks_to_sectors(buf, &sectors) < 0)
+			return -EINVAL;
+		if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+			return -EINVAL;
+
+		mddev->external_size = 1;
+	}
+
+	mddev->array_sectors = sectors;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
+	if (mddev->pers) {
+		struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
+
+		if (bdev) {
+			mutex_lock(&bdev->bd_inode->i_mutex);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)mddev->array_sectors << 9);
+			mutex_unlock(&bdev->bd_inode->i_mutex);
+			bdput(bdev);
+		}
+	}
+
+	return len;
+}
+
+static struct md_sysfs_entry md_array_size =
+__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
+       array_size_store);
 
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
@@ -3444,6 +3712,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_safe_delay.attr,
 	&md_array_state.attr,
 	&md_reshape_position.attr,
+	&md_array_size.attr,
 	NULL,
 };
 
@@ -3602,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name)
 		mddev_put(mddev);
 		return -ENOMEM;
 	}
+	mddev->queue->queuedata = mddev;
+
 	/* Can be unlocked because the queue is new: no concurrency */
 	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
 
-	blk_queue_make_request(mddev->queue, md_fail_request);
+
blk_queue_make_request(mddev->queue, md_make_request); disk = alloc_disk(1 << shift); if (!disk) { @@ -3731,13 +4002,13 @@ static int do_md_run(mddev_t * mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { if (test_bit(Faulty, &rdev->flags)) continue; - if (rdev->size < chunk_size / 1024) { + if (rdev->sectors < chunk_size / 512) { printk(KERN_WARNING "md: Dev %s smaller than chunk_size:" - " %lluk < %dk\n", + " %llu < %d\n", bdevname(rdev->bdev,b), - (unsigned long long)rdev->size, - chunk_size / 1024); + (unsigned long long)rdev->sectors, + chunk_size / 512); return -EINVAL; } } @@ -3761,11 +4032,11 @@ static int do_md_run(mddev_t * mddev) /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, - * Internal Bitmap issues has handled elsewhere. + * Internal Bitmap issues have been handled elsewhere. */ if (rdev->data_offset < rdev->sb_start) { - if (mddev->size && - rdev->data_offset + mddev->size*2 + if (mddev->dev_sectors && + rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { printk("md: %s: data overlaps metadata\n", mdname(mddev)); @@ -3801,9 +4072,16 @@ static int do_md_run(mddev_t * mddev) } mddev->pers = pers; spin_unlock(&pers_lock); - mddev->level = pers->level; + if (mddev->level != pers->level) { + mddev->level = pers->level; + mddev->new_level = pers->level; + } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + if (pers->level >= 4 && pers->level <= 6) + /* Cannot support integrity (yet) */ + blk_integrity_unregister(mddev->gendisk); + if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ @@ -3843,7 +4121,9 @@ static int do_md_run(mddev_t * mddev) } mddev->recovery = 0; - mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ + /* may be over-ridden by personality */ + mddev->resync_max_sectors = mddev->dev_sectors; + mddev->barriers_work = 1; mddev->ok_start_degraded = start_dirty_degraded; @@ -3853,7 +4133,17 @@ static int do_md_run(mddev_t * mddev) err = mddev->pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->sync_request) { + else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + WARN_ONCE(!mddev->external_size, "%s: default size too small," + " but 'external_size' not in effect?\n", __func__); + printk(KERN_ERR + "md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + err = -EINVAL; + mddev->pers->stop(mddev); + } + if (err == 0 && mddev->pers->sync_request) { err = bitmap_create(mddev); if (err) { printk(KERN_ERR "%s: failed to create bitmap (%d)\n", @@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev) set_capacity(disk, mddev->array_sectors); - /* If we call blk_queue_make_request here, it will - * re-initialise max_sectors etc which may have been - * refined inside -> run. So just set the bits we need to set. - * Most initialisation happended when we called - * blk_queue_make_request(..., md_fail_request) - * earlier. - */ - mddev->queue->queuedata = mddev; - mddev->queue->make_request_fn = mddev->pers->make_request; - /* If there is a partially-recovered drive we need to * start recovery here. 
If we leave it to md_check_recovery,
 	 * it will remove the drives and not do the right thing
@@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			md_super_wait(mddev);
 			if (mddev->ro)
 				set_disk_ro(disk, 0);
-			blk_queue_make_request(mddev->queue, md_fail_request);
+
 			mddev->pers->stop(mddev);
 			mddev->queue->merge_bvec_fn = NULL;
 			mddev->queue->unplug_fn = NULL;
@@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		export_array(mddev);
 
 		mddev->array_sectors = 0;
-		mddev->size = 0;
+		mddev->external_size = 0;
+		mddev->dev_sectors = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
 		mddev->resync_min = 0;
@@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	printk(KERN_INFO "md: %s switched to read-only mode.\n",
 		mdname(mddev));
 	err = 0;
+	blk_integrity_unregister(disk);
 	md_new_event(mddev);
 	sysfs_notify_dirent(mddev->sysfs_state);
 out:
@@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 	info.patch_version = MD_PATCHLEVEL_VERSION;
 	info.ctime         = mddev->ctime;
 	info.level         = mddev->level;
-	info.size          = mddev->size;
-	if (info.size != mddev->size) /* overflow */
+	info.size          = mddev->dev_sectors / 2;
+	if (info.size != mddev->dev_sectors / 2) /* overflow */
 		info.size = -1;
 	info.nr_disks      = nr;
 	info.raid_disks    = mddev->raid_disks;
@@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
+		else
+			clear_bit(WriteMostly, &rdev->flags);
 
 		rdev->raid_disk = -1;
 		err = bind_rdev_to_array(rdev, mddev);
@@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 		} else
 			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
-		rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+		rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
 
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
@@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	else
 		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
-	rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+	rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
 
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING
@@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 
 	mddev->level         = info->level;
 	mddev->clevel[0]     = 0;
-	mddev->size          = info->size;
+	mddev->dev_sectors   = 2 * (sector_t)info->size;
 	mddev->raid_disks    = info->raid_disks;
 	/* don't set md_minor, it is determined by which /dev/md* was
 	 * openned
@@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	return 0;
 }
 
+void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
+{
+	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
+
+	if (mddev->external_size)
+		return;
+
+	mddev->array_sectors = array_sectors;
+}
+EXPORT_SYMBOL(md_set_array_sectors);
+
 static int update_size(mddev_t *mddev, sector_t num_sectors)
 {
 	mdk_rdev_t *rdev;
@@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
 		 */
 		return -EBUSY;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		sector_t avail;
-		avail = rdev->size * 2;
+		sector_t avail = rdev->sectors;
 
 		if (fit && (num_sectors == 0 || num_sectors > avail))
 			num_sectors = avail;
@@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		)
 		return -EINVAL;
 	/* Check there is only one change */
-	if (info->size >= 0 && mddev->size != info->size) cnt++;
-	if (mddev->raid_disks != info->raid_disks) cnt++;
-	if (mddev->layout != info->layout) cnt++;
-	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
-	if (cnt == 0) return 0;
-	if (cnt > 1) return -EINVAL;
+	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
+		cnt++;
+	if (mddev->raid_disks != info->raid_disks)
+		cnt++;
+	if (mddev->layout != info->layout)
+		cnt++;
+	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
+		cnt++;
+	if (cnt == 0)
+		return 0;
+	if (cnt > 1)
+		return -EINVAL;
 
 	if (mddev->layout != info->layout) {
 		/* Change layout
@@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		else
 			return mddev->pers->reconfig(mddev, info->layout, -1);
 	}
-	if (info->size >= 0 && mddev->size != info->size)
+	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
 		rv = update_size(mddev, (sector_t)info->size * 2);
 
 	if (mddev->raid_disks != info->raid_disks)
@@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 
 void md_unregister_thread(mdk_thread_t *thread)
 {
+	if (!thread)
+		return;
 	dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 
 	kthread_stop(thread->tsk);
@@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_blocks = mddev->resync_max_sectors >> 1;
 	else
-		max_blocks = mddev->size;
+		max_blocks = mddev->dev_sectors / 2;
 
 	/*
 	 * Should not happen.
@@ -5537,7 +5839,7 @@ struct mdstat_info {
 static int md_seq_show(struct seq_file *seq, void *v)
 {
 	mddev_t *mddev = v;
-	sector_t size;
+	sector_t sectors;
 	mdk_rdev_t *rdev;
 	struct mdstat_info *mi = seq->private;
 	struct bitmap *bitmap;
@@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			seq_printf(seq, " %s", mddev->pers->name);
 		}
 
-		size = 0;
+		sectors = 0;
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			char b[BDEVNAME_SIZE];
 			seq_printf(seq, " %s[%d]",
@@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 				continue;
 			} else if (rdev->raid_disk < 0)
 				seq_printf(seq, "(S)"); /* spare */
-			size += rdev->size;
+			sectors += rdev->sectors;
 		}
 
 		if (!list_empty(&mddev->disks)) {
@@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 					mddev->array_sectors / 2);
 			else
 				seq_printf(seq, "\n      %llu blocks",
-					(unsigned long long)size);
+					(unsigned long long)sectors / 2);
 		}
 
 		if (mddev->persistent) {
 			if (mddev->major_version != 0 ||
@@ -5722,19 +6024,19 @@ int unregister_md_personality(struct mdk_personality *p)
 	return 0;
 }
 
-static int is_mddev_idle(mddev_t *mddev)
+static int is_mddev_idle(mddev_t *mddev, int init)
 {
 	mdk_rdev_t * rdev;
 	int idle;
-	long curr_events;
+	int curr_events;
 
 	idle = 1;
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
-		curr_events = part_stat_read(&disk->part0, sectors[0]) +
-				part_stat_read(&disk->part0, sectors[1]) -
-				atomic_read(&disk->sync_io);
+		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+			      (int)part_stat_read(&disk->part0, sectors[1]) -
+			      atomic_read(&disk->sync_io);
 		/* sync IO will cause sync_io to increase before the disk_stats
 		 * as sync_io is counted when a request starts, and
 		 * disk_stats is counted when it completes.
@@ -5757,7 +6059,7 @@ static int is_mddev_idle(mddev_t *mddev)
 		 * always make curr_events less than last_events.
* */ - if (curr_events - rdev->last_events > 4096) { + if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } @@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; else { /* recovery follows the physical size of devices */ - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; j = MaxSector; list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && @@ -6000,7 +6302,7 @@ void md_do_sync(mddev_t *mddev) "(but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); - is_mddev_idle(mddev); /* this also initializes IO event counters */ + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { @@ -6040,6 +6342,18 @@ void md_do_sync(mddev_t *mddev) } if (kthread_should_stop()) goto interrupted; + + if (mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) { + /* time to update curr_resync_completed */ + blk_unplug(mddev->queue); + wait_event(mddev->recovery_wait, + atomic_read(&mddev->recovery_active) == 0); + mddev->curr_resync_completed = + mddev->curr_resync; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { @@ -6102,7 +6416,7 @@ void md_do_sync(mddev_t *mddev) if (currspeed > speed_min(mddev)) { if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev)) { + !is_mddev_idle(mddev, 0)) { msleep(500); goto repeat; } @@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev) mdk_rdev_t *rdev; int spares = 0; + mddev->curr_resync_completed = 0; + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && @@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev) sysfs_notify(&mddev->kobj, NULL, "degraded"); } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); md_update_sb(mddev, 1); /* if array is no-longer degraded, then any saved_raid_disk @@ -6470,13 +6789,13 @@ static void md_geninit(void) static int __init md_init(void) { - if (register_blkdev(MAJOR_NR, "md")) + if (register_blkdev(MD_MAJOR, "md")) return -1; if ((mdp_major=register_blkdev(0, "mdp"))<=0) { - unregister_blkdev(MAJOR_NR, "md"); + unregister_blkdev(MD_MAJOR, "md"); return -1; } - blk_register_region(MKDEV(MAJOR_NR, 0), 1UL< 2drive raid5 + * ndrive raid5 -> degraded n+1drive raid6 with special layout + * If the takeover succeeds, a new 'private' structure is returned. + * This needs to be installed and then ->run used to activate the + * array. 
+ */
+	void *(*takeover) (mddev_t *mddev);
 };
 
@@ -400,3 +411,26 @@ static inline void safe_put_page(struct page *p)
 
 #endif /* CONFIG_BLOCK */
 #endif
+
+extern int register_md_personality(struct mdk_personality *p);
+extern int unregister_md_personality(struct mdk_personality *p);
+extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
+				mddev_t *mddev, const char *name);
+extern void md_unregister_thread(mdk_thread_t *thread);
+extern void md_wakeup_thread(mdk_thread_t *thread);
+extern void md_check_recovery(mddev_t *mddev);
+extern void md_write_start(mddev_t *mddev, struct bio *bi);
+extern void md_write_end(mddev_t *mddev);
+extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
+extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
+
+extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+			   sector_t sector, int size, struct page *page);
+extern void md_super_wait(mddev_t *mddev);
+extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+			struct page *page, int rw);
+extern void md_do_sync(mddev_t *mddev);
+extern void md_new_event(mddev_t *mddev);
+extern int md_allow_write(mddev_t *mddev);
+extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c
index b61d5767aae7c61c3960a09e16c26becb3a1712f..3b1500843bbac2dfa85daaeb85058480294080d4 100644
--- a/drivers/md/mktables.c
+++ b/drivers/md/mktables.c
@@ -59,7 +59,7 @@ int main(int argc, char *argv[])
 	uint8_t v;
 	uint8_t exptbl[256], invtbl[256];
 
-	printf("#include \"raid6.h\"\n");
+	printf("#include <linux/raid/pq.h>\n");
 
 	/* Compute multiplication table */
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -76,6 +76,9 @@ int main(int argc, char *argv[])
 		printf("\t},\n");
 	}
 	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfmul);\n");
+	printf("#endif\n");
 
 	/* Compute power-of-2 table (exponent) */
 	v = 1;
@@ -92,6 +95,9 @@ int main(int argc, char *argv[])
 		}
 	}
 	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfexp);\n");
+	printf("#endif\n");
 
 	/* Compute inverse table x^-1 == x^254 */
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -104,6 +110,9 @@ int main(int argc, char *argv[])
 		}
 	}
 	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfinv);\n");
+	printf("#endif\n");
 
 	/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -115,6 +124,9 @@ int main(int argc, char *argv[])
 			       (j == 7) ? '\n' : ' ');
 	}
 	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfexi);\n");
+	printf("#endif\n");
 
 	return 0;
 }
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 86d9adf90e791857efdf674117168398b40735a1..41ced0cbe823c7275cc2f79172cc9913385fcd2f 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -19,7 +19,11 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/ -#include +#include +#include +#include +#include "md.h" +#include "multipath.h" #define MAX_WORK_PER_DISK 128 @@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); } +static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return mddev->dev_sectors; +} + static int multipath_run (mddev_t *mddev) { multipath_conf_t *conf; @@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = mddev->size * 2; + md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); mddev->queue->unplug_fn = multipath_unplug; mddev->queue->backing_dev_info.congested_fn = multipath_congested; @@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality = .error_handler = multipath_error, .hot_add_disk = multipath_add_disk, .hot_remove_disk= multipath_remove_disk, + .size = multipath_size, }; static int __init multipath_init (void) diff --git a/include/linux/raid/multipath.h b/drivers/md/multipath.h similarity index 96% rename from include/linux/raid/multipath.h rename to drivers/md/multipath.h index 6f53fc177a47359b1490ba5b999e9d2b34e7bb2e..6fa70b400cdae6347fa07dedc2c80a5c16cabc90 100644 --- a/include/linux/raid/multipath.h +++ b/drivers/md/multipath.h @@ -1,8 +1,6 @@ #ifndef _MULTIPATH_H #define _MULTIPATH_H -#include - struct multipath_info { mdk_rdev_t *rdev; }; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c605ba8055863d2d0ede52e9fe7e9c4e374bd9d3..c08d7559be5531fb01bedb55e4663a0e40092607 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,7 +18,10 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 */
-#include
+#include
+#include
+#include "md.h"
+#include "raid0.h"
 
 static void raid0_unplug(struct request_queue *q)
 {
@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev)
 		list_for_each_entry(rdev2, &mddev->disks, same_set) {
 			printk(KERN_INFO "raid0:   comparing %s(%llu)",
 			       bdevname(rdev1->bdev,b),
-			       (unsigned long long)rdev1->size);
+			       (unsigned long long)rdev1->sectors);
 			printk(KERN_INFO " with %s(%llu)\n",
 			       bdevname(rdev2->bdev,b),
-			       (unsigned long long)rdev2->size);
+			       (unsigned long long)rdev2->sectors);
 			if (rdev2 == rdev1) {
 				printk(KERN_INFO "raid0:   END\n");
 				break;
 			}
-			if (rdev2->size == rdev1->size)
-			{
+			if (rdev2->sectors == rdev1->sectors) {
 				/*
 				 * Not unique, don't count it as a new
 				 * group
@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev)
 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-		if (!smallest || (rdev1->size < smallest->size))
+		if (!smallest || (rdev1->sectors < smallest->sectors))
 			smallest = rdev1;
 		cnt++;
 	}
@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev)
 		goto abort;
 	}
 	zone->nb_dev = cnt;
-	zone->sectors = smallest->size * cnt * 2;
+	zone->sectors = smallest->sectors * cnt;
 	zone->zone_start = 0;
 
-	current_start = smallest->size * 2;
+	current_start = smallest->sectors;
 	curr_zone_start = zone->sectors;
 
 	/* now do the other zones */
@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev)
 			rdev = conf->strip_zone[0].dev[j];
 			printk(KERN_INFO "raid0: checking %s ...",
 				bdevname(rdev->bdev, b));
-			if (rdev->size > current_start / 2) {
-				printk(KERN_INFO " contained as device %d\n",
-					c);
-				zone->dev[c] = rdev;
-				c++;
-				if (!smallest || (rdev->size < smallest->size)) {
-					smallest = rdev;
-					printk(KERN_INFO "  (%llu) is smallest!.\n",
-						(unsigned long long)rdev->size);
-				}
-			} else
+			if (rdev->sectors <= current_start) {
 				printk(KERN_INFO " nope.\n");
+				continue;
+			}
+			printk(KERN_INFO " contained as device %d\n", c);
+			zone->dev[c] = rdev;
+			c++;
+			if (!smallest || rdev->sectors < smallest->sectors) {
+				smallest = rdev;
+				printk(KERN_INFO "  (%llu) is smallest!.\n",
+					(unsigned long long)rdev->sectors);
+			}
 		}
 
 		zone->nb_dev = c;
-		zone->sectors = (smallest->size * 2 - current_start) * c;
+		zone->sectors = (smallest->sectors - current_start) * c;
 		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
 			zone->nb_dev, (unsigned long long)zone->sectors);
 
 		zone->zone_start = curr_zone_start;
 		curr_zone_start += zone->sectors;
 
-		current_start = smallest->size * 2;
+		current_start = smallest->sectors;
 		printk(KERN_INFO "raid0: current zone start: %llu\n",
 			(unsigned long long)current_start);
 	}
@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q,
 	return max;
 }
 
+static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	sector_t array_sectors = 0;
+	mdk_rdev_t *rdev;
+
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		array_sectors += rdev->sectors;
+
+	return array_sectors;
+}
+
 static int raid0_run (mddev_t *mddev)
 {
 	unsigned  cur=0, i=0, nb_zone;
 	s64 sectors;
 	raid0_conf_t *conf;
-	mdk_rdev_t *rdev;
 
 	if (mddev->chunk_size == 0) {
 		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev)
 		goto out_free_conf;
 
 	/* calculate array device size */
-	mddev->array_sectors = 0;
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		mddev->array_sectors += rdev->size * 2;
+	md_set_array_sectors(mddev,
			     raid0_size(mddev, 0, 0));
 	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
 		(unsigned long long)mddev->array_sectors);
 	printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
 		(unsigned long long)conf->spacing);
 	{
-		sector_t s = mddev->array_sectors;
+		sector_t s = raid0_size(mddev, 0, 0);
 		sector_t space = conf->spacing;
 		int round;
 		conf->sector_shift = 0;
@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality=
 	.run		= raid0_run,
 	.stop		= raid0_stop,
 	.status		= raid0_status,
+	.size		= raid0_size,
 };

 static int __init raid0_init (void)
diff --git a/include/linux/raid/raid0.h b/drivers/md/raid0.h
similarity index 96%
rename from include/linux/raid/raid0.h
rename to drivers/md/raid0.h
index fd42aa87c39186791d1a06481fecb6ddd7c03326..824b12eb1d4f50086e8a4dba48d711563362640c 100644
--- a/include/linux/raid/raid0.h
+++ b/drivers/md/raid0.h
@@ -1,8 +1,6 @@
 #ifndef _RAID0_H
 #define _RAID0_H

-#include <linux/raid/md.h>
-
 struct strip_zone
 {
 	sector_t zone_start;	/* Zone offset in md_dev (in sectors) */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e2466425d9cad798edf40858183404408b319e3d..b4f4badc0068991290515b1281cdaddd898893ce 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -31,10 +31,13 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */

-#include "dm-bio-list.h"
 #include <linux/delay.h>
-#include <linux/raid/raid1.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "dm-bio-list.h"
+#include "raid1.h"
+#include "bitmap.h"

 #define DEBUG 0
 #if DEBUG
@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return 0;
 	}

-	max_sector = mddev->size << 1;
+	max_sector = mddev->dev_sectors;
 	if (sector_nr >= max_sector) {
 		/* If we aborted, we need to abort the
 		 * sync on the 'current' bitmap chunk (there will
@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	return nr_sectors;
 }

+static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	if (sectors)
+		return sectors;
+
+	return mddev->dev_sectors;
+}
+
 static int run(mddev_t *mddev)
 {
 	conf_t *conf;
@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_sectors = mddev->size * 2;
+	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));

 	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev)
 		/* need to kick something here to make sure I/O goes? */
 	}

+	raise_barrier(conf);
+	lower_barrier(conf);
+
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
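raid1_resize (next hunk) also has to schedule a resync when an array grows, since the newly exposed tail of each device has never been written: recovery_cp is pulled back to the old device size and MD_RECOVERY_NEEDED is set. A rough user-space sketch of just that decision, with simplified stand-in fields:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;
#define MaxSector (~(sector_t)0)

/* stand-in for the raid1 resize decision: growing the per-device
 * size leaves the new tail unsynced, so the recovery checkpoint is
 * pulled back to the old end and a resync is requested.
 */
struct mddev {
	sector_t dev_sectors;
	sector_t recovery_cp;
	int recovery_needed;
};

static void raid1_resize_model(struct mddev *m, sector_t sectors)
{
	if (sectors > m->dev_sectors && m->recovery_cp == MaxSector) {
		m->recovery_cp = m->dev_sectors; /* resync the new tail only */
		m->recovery_needed = 1;
	}
	m->dev_sectors = sectors;
}

int main(void)
{
	struct mddev m = { .dev_sectors = 1000, .recovery_cp = MaxSector };

	raid1_resize_model(&m, 2000);
	printf("resync from %llu, needed=%d\n",
	       (unsigned long long)m.recovery_cp, m.recovery_needed);
	return 0;
}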
-	mddev->array_sectors = sectors;
+	md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
+	if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
+		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
-	if (mddev->array_sectors / 2 > mddev->size &&
+	if (sectors > mddev->dev_sectors &&
 	    mddev->recovery_cp == MaxSector) {
-		mddev->recovery_cp = mddev->size << 1;
+		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
-	mddev->size = mddev->array_sectors / 2;
+	mddev->dev_sectors = sectors;
 	mddev->resync_max_sectors = sectors;
 	return 0;
 }
@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality =
 	.spare_active	= raid1_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid1_resize,
+	.size		= raid1_size,
 	.check_reshape	= raid1_reshape,
 	.quiesce	= raid1_quiesce,
 };
diff --git a/include/linux/raid/raid1.h b/drivers/md/raid1.h
similarity index 99%
rename from include/linux/raid/raid1.h
rename to drivers/md/raid1.h
index 0a9ba7c3302e2393e881244983a3397807e29dfb..1620eea3d57c50231729c04c98e6996c086b00c5 100644
--- a/include/linux/raid/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,8 +1,6 @@
 #ifndef _RAID1_H
 #define _RAID1_H

-#include <linux/raid/md.h>
-
 typedef struct mirror_info mirror_info_t;

 struct mirror_info {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7301631abe0453a4791dec55ba0eff84912876c3..e293d92641acc2b61c5767d3b343702fd65a509b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,10 +18,13 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */

-#include "dm-bio-list.h"
 #include <linux/delay.h>
-#include <linux/raid/raid10.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "dm-bio-list.h"
+#include "raid10.h"
+#include "bitmap.h"

 /*
  * RAID10 provides a combination of RAID0 and RAID1 functionality.
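raid10_size(), added in the next hunk, derives usable capacity from the copy layout: chunks per device are first divided among the 'far' copies, multiplied across all devices, then divided by the 'near' copies. The same arithmetic as a standalone user-space sketch (assuming a power-of-two chunk size, as md requires):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* mirror of the raid10_size() arithmetic: per-device chunks are
 * shared among 'far' copies, then the array-wide total is divided
 * among 'near' copies that mirror within a stripe.
 */
static sector_t raid10_size_model(sector_t dev_sectors, int raid_disks,
				  int chunk_shift, int near, int far)
{
	sector_t size = dev_sectors >> chunk_shift;	/* chunks per device */

	size /= far;		/* each far copy consumes a full pass */
	size *= raid_disks;	/* chunks across all devices */
	size /= near;		/* near copies duplicate data */
	return size << chunk_shift;
}

int main(void)
{
	/* 4 devices of 1 GiB, 64 KiB chunks (shift 7), 'n2' layout */
	sector_t s = raid10_size_model(2097152, 4, 7, 2, 1);

	printf("usable sectors: %llu\n", (unsigned long long)s);
	return 0;
}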
@@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return 0;

 skipped:
-	max_sector = mddev->size << 1;
+	max_sector = mddev->dev_sectors;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
@@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	goto skipped;
 }

+static sector_t
+raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	sector_t size;
+	conf_t *conf = mddev_to_conf(mddev);
+
+	if (!raid_disks)
+		raid_disks = mddev->raid_disks;
+	if (!sectors)
+		sectors = mddev->dev_sectors;
+
+	size = sectors >> conf->chunk_shift;
+	sector_div(size, conf->far_copies);
+	size = size * raid_disks;
+	sector_div(size, conf->near_copies);
+
+	return size << conf->chunk_shift;
+}
+
 static int run(mddev_t *mddev)
 {
 	conf_t *conf;
@@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev)
 	conf->far_offset = fo;
 	conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
 	conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
-	size = mddev->size >> (conf->chunk_shift-1);
+	size = mddev->dev_sectors >> conf->chunk_shift;
 	sector_div(size, fc);
 	size = size * conf->raid_disks;
 	sector_div(size, nc);
@@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev)
 	 */
 	stride += conf->raid_disks - 1;
 	sector_div(stride, conf->raid_disks);
-	mddev->size = stride << (conf->chunk_shift-1);
+	mddev->dev_sectors = stride << conf->chunk_shift;

 	if (fo)
 		stride = 1;
@@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_sectors = size << conf->chunk_shift;
-	mddev->resync_max_sectors = size << conf->chunk_shift;
+	md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
+	mddev->resync_max_sectors = raid10_size(mddev, 0, 0);

 	mddev->queue->unplug_fn = raid10_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev)
 {
 	conf_t *conf = mddev_to_conf(mddev);

+	raise_barrier(conf, 0);
+	lower_barrier(conf);
+
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality =
 	.spare_active	= raid10_spare_active,
 	.sync_request	= sync_request,
 	.quiesce	= raid10_quiesce,
+	.size		= raid10_size,
 };

 static int __init raid_init(void)
diff --git a/include/linux/raid/raid10.h b/drivers/md/raid10.h
similarity index 99%
rename from include/linux/raid/raid10.h
rename to drivers/md/raid10.h
index e9091cfeb286c2a9b0d30dc57e6748933bb3a9ee..244dbe507a54fa893a6731f57c56de1dca2a3b70 100644
--- a/include/linux/raid/raid10.h
+++ b/drivers/md/raid10.h
@@ -1,8 +1,6 @@
 #ifndef _RAID10_H
 #define _RAID10_H

-#include <linux/raid/md.h>
-
 typedef struct mirror_info mirror_info_t;

 struct mirror_info {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a5ba080d303b93bb3a4d764ea621f4bf9bc191d0..3bbc6d647044c6b6d782427a1e3e35a146bc0751 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -43,11 +43,14 @@
  * miss any bits.
 */

+#include <linux/blkdev.h>
 #include <linux/kthread.h>
-#include "raid6.h"
-
-#include <linux/raid/bitmap.h>
+#include <linux/raid/pq.h>
 #include <linux/async_tx.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid5.h"
+#include "bitmap.h"

 /*
  * Stripe cache
@@ -91,11 +94,6 @@
 #define printk_rl(args...) \
	((void) (printk_ratelimit() && printk(args)))

-#if !RAID6_USE_EMPTY_ZERO_PAGE
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-#endif
-
 /*
  * We maintain a biased count of active stripes in the bottom 16 bits of
  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
 	bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
 }

+/* Find first data disk in a raid6 stripe */
+static inline int raid6_d0(struct stripe_head *sh)
+{
+	if (sh->ddf_layout)
+		/* ddf always start from first device */
+		return 0;
+	/* md starts just after Q block */
+	if (sh->qd_idx == sh->disks - 1)
+		return 0;
+	else
+		return sh->qd_idx + 1;
+}
 static inline int raid6_next_disk(int disk, int raid_disks)
 {
 	disk++;
 	return (disk < raid_disks) ? disk : 0;
 }

+/* When walking through the disks in a raid6, starting at raid6_d0,
+ * we need to map each disk to a 'slot', where the data disks are slot
+ * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
+ * is raid_disks-1.  This helper does that mapping.
+ */
+static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
+			     int *count, int syndrome_disks)
+{
+	int slot;
+
+	if (idx == sh->pd_idx)
+		return syndrome_disks;
+	if (idx == sh->qd_idx)
+		return syndrome_disks + 1;
+	slot = (*count)++;
+	return slot;
+}
+
 static void return_io(struct bio *return_bi)
 {
 	struct bio *bi = return_bi;
@@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 			}
 		}
 	}
+
 static void release_stripe(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
@@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num)
 	return 0;
 }

-static void raid5_build_block(struct stripe_head *sh, int i);
+static void raid5_build_block(struct stripe_head *sh, int i, int previous);
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+			    struct stripe_head *sh);

-static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
+static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i;
@@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int

 	remove_hash(sh);

+	sh->generation = conf->generation - previous;
+	sh->disks = previous ?
conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; - sh->pd_idx = pd_idx; + stripe_set_idx(sector, conf, previous, sh); sh->state = 0; - sh->disks = disks; for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG(); } dev->flags = 0; - raid5_build_block(sh, i); + raid5_build_block(sh, i, previous); } insert_hash(conf, sh); } -static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) +static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, + short generation) { struct stripe_head *sh; struct hlist_node *hn; @@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in CHECK_DEVLOCK(); pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) - if (sh->sector == sector && sh->disks == disks) + if (sh->sector == sector && sh->generation == generation) return sh; pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); return NULL; @@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in static void unplug_slaves(mddev_t *mddev); static void raid5_unplug_device(struct request_queue *q); -static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, - int pd_idx, int noblock) +static struct stripe_head * +get_active_stripe(raid5_conf_t *conf, sector_t sector, + int previous, int noblock) { struct stripe_head *sh; @@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, conf->device_lock, /* nothing */); - sh = __find_stripe(conf, sector, disks); + sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) sh = get_free_stripe(conf); @@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector ); conf->inactive_blocked = 0; } else - init_stripe(sh, sector, pd_idx, disks); + init_stripe(sh, sector, previous); } else { if (atomic_read(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); + BUG_ON(!list_empty(&sh->lru) + && !test_bit(STRIPE_EXPANDING, &sh->state)); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); @@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num) struct kmem_cache *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); - sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); + sprintf(conf->cache_name[0], + "raid%d-%s", conf->level, mdname(conf->mddev)); + sprintf(conf->cache_name[1], + "raid%d-%s-alt", conf->level, mdname(conf->mddev)); conf->active_name = 0; sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), @@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num) return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE static int resize_stripes(raid5_conf_t *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. 
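Because a reshape leaves stripes of the old and the new geometry in the cache at the same time, __find_stripe() above now keys lookups on (sector, generation) instead of (sector, disks); a bare disk count could not tell the two apart once other geometry parameters (such as chunk size) can change. A toy user-space model of the new lookup:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* toy model of the stripe-cache keying change: identity is now
 * (sector, generation), so pre- and post-reshape stripes for the
 * same sector never alias, even if the disk count is unchanged.
 */
struct stripe {
	sector_t sector;
	short generation;
	int in_use;
};

#define NSTRIPES 8
static struct stripe cache[NSTRIPES];

static struct stripe *find_stripe(sector_t sector, short generation)
{
	for (int i = 0; i < NSTRIPES; i++)
		if (cache[i].in_use &&
		    cache[i].sector == sector &&
		    cache[i].generation == generation)
			return &cache[i];
	return NULL;
}

int main(void)
{
	cache[0] = (struct stripe){ .sector = 128, .generation = 3, .in_use = 1 };
	cache[1] = (struct stripe){ .sector = 128, .generation = 4, .in_use = 1 };

	/* 'previous' geometry is generation - 1, as in get_active_stripe() */
	printf("old stripe: %p\n", (void *)find_stripe(128, 4 - 1));
	printf("new stripe: %p\n", (void *)find_stripe(128, 4));
	return 0;
}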
@@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) conf->pool_size = newsize; return err; } -#endif static int drop_one_stripe(raid5_conf_t *conf) { @@ -1066,7 +1101,7 @@ static void shrink_stripes(raid5_conf_t *conf) static void raid5_end_read_request(struct bio * bi, int error) { - struct stripe_head *sh = bi->bi_private; + struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1148,7 +1183,7 @@ static void raid5_end_read_request(struct bio * bi, int error) static void raid5_end_write_request(struct bio *bi, int error) { - struct stripe_head *sh = bi->bi_private; + struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error) } -static sector_t compute_blocknr(struct stripe_head *sh, int i); +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); -static void raid5_build_block(struct stripe_head *sh, int i) +static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; @@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i) dev->req.bi_private = sh; dev->flags = 0; - dev->sector = compute_blocknr(sh, i); + dev->sector = compute_blocknr(sh, i, previous); } static void error(mddev_t *mddev, mdk_rdev_t *rdev) @@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. */ -static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, - unsigned int data_disks, unsigned int * dd_idx, - unsigned int * pd_idx, raid5_conf_t *conf) +static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh) { long stripe; unsigned long chunk_number; unsigned int chunk_offset; + int pd_idx, qd_idx; + int ddf_layout = 0; sector_t new_sector; - int sectors_per_chunk = conf->chunk_size >> 9; + int algorithm = previous ? conf->prev_algo + : conf->algorithm; + int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); + int raid_disks = previous ? conf->previous_raid_disks + : conf->raid_disks; + int data_disks = raid_disks - conf->max_degraded; /* First compute the information on this sector */ @@ -1259,68 +1302,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, /* * Select the parity disk based on the user selected algorithm. 
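 */

Before the full switch below, it may help to see one branch in isolation: a minimal user-space model of the LEFT_SYMMETRIC raid5 mapping, splitting an array sector into (stripe, chunk offset), rotating the parity disk, and renumbering the data slot. It mirrors the kernel logic but uses plain integer division instead of sector_div():

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* minimal model of the LEFT_SYMMETRIC raid5 branch: split the array
 * sector into (stripe, chunk, offset), rotate parity across the
 * disks, and place data slots starting just after the parity disk.
 */
static sector_t map_left_symmetric(sector_t r_sector, int raid_disks,
				   int sectors_per_chunk,
				   int *dd_idx, int *pd_idx)
{
	int data_disks = raid_disks - 1;
	unsigned int chunk_offset = r_sector % sectors_per_chunk;
	sector_t chunk_number = r_sector / sectors_per_chunk;
	sector_t stripe = chunk_number / data_disks;

	*dd_idx = chunk_number % data_disks;
	*pd_idx = data_disks - stripe % raid_disks;
	*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;

	return stripe * sectors_per_chunk + chunk_offset;
}

int main(void)
{
	int dd, pd;
	sector_t s = map_left_symmetric(1000000, 5, 128, &dd, &pd);

	printf("device sector %llu, data disk %d, parity disk %d\n",
	       (unsigned long long)s, dd, pd);
	return 0;
}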
+	pd_idx = qd_idx = ~0;
 	switch(conf->level) {
 	case 4:
-		*pd_idx = data_disks;
+		pd_idx = data_disks;
 		break;
 	case 5:
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
-			*pd_idx = data_disks - stripe % raid_disks;
-			if (*dd_idx >= *pd_idx)
+			pd_idx = data_disks - stripe % raid_disks;
+			if (*dd_idx >= pd_idx)
 				(*dd_idx)++;
 			break;
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			if (*dd_idx >= *pd_idx)
+			pd_idx = stripe % raid_disks;
+			if (*dd_idx >= pd_idx)
 				(*dd_idx)++;
 			break;
 		case ALGORITHM_LEFT_SYMMETRIC:
-			*pd_idx = data_disks - stripe % raid_disks;
-			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			pd_idx = data_disks - stripe % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
 			break;
 		case ALGORITHM_RIGHT_SYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			pd_idx = stripe % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+			break;
+		case ALGORITHM_PARITY_0:
+			pd_idx = 0;
+			(*dd_idx)++;
+			break;
+		case ALGORITHM_PARITY_N:
+			pd_idx = data_disks;
 			break;
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-				conf->algorithm);
+				algorithm);
+			BUG();
 		}
 		break;
 	case 6:
-		/**** FIX THIS ****/
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
-			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-			if (*pd_idx == raid_disks-1)
-				(*dd_idx)++; 	/* Q D D D P */
-			else if (*dd_idx >= *pd_idx)
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			if (*pd_idx == raid_disks-1)
-				(*dd_idx)++; 	/* Q D D D P */
-			else if (*dd_idx >= *pd_idx)
+			pd_idx = stripe % raid_disks;
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_LEFT_SYMMETRIC:
-			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = (pd_idx + 1) % raid_disks;
+			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
 			break;
 		case ALGORITHM_RIGHT_SYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			pd_idx = stripe % raid_disks;
+			qd_idx = (pd_idx + 1) % raid_disks;
+			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
+			break;
+
+		case ALGORITHM_PARITY_0:
+			pd_idx = 0;
+			qd_idx = 1;
+			(*dd_idx) += 2;
+			break;
+		case ALGORITHM_PARITY_N:
+			pd_idx = data_disks;
+			qd_idx = data_disks + 1;
 			break;
+
+		case ALGORITHM_ROTATING_ZERO_RESTART:
+			/* Exactly the same as RIGHT_ASYMMETRIC, but the order
+			 * of blocks for computing Q is different.
+			 */
+			pd_idx = stripe % raid_disks;
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			ddf_layout = 1;
+			break;
+
+		case ALGORITHM_ROTATING_N_RESTART:
+			/* Same as left_asymmetric, but the first stripe is
+			 * D D D P Q  rather than
+			 * Q D D D P
+			 */
+			pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			ddf_layout = 1;
+			break;
+
+		case ALGORITHM_ROTATING_N_CONTINUE:
+			/* Same as left_symmetric but Q is before P */
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+			ddf_layout = 1;
+			break;
+
+		case ALGORITHM_LEFT_ASYMMETRIC_6:
+			/* RAID5 left_asymmetric, with Q on last device */
+			pd_idx = data_disks - stripe % (raid_disks-1);
+			if (*dd_idx >= pd_idx)
+				(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_RIGHT_ASYMMETRIC_6:
+			pd_idx = stripe % (raid_disks-1);
+			if (*dd_idx >= pd_idx)
+				(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_LEFT_SYMMETRIC_6:
+			pd_idx = data_disks - stripe % (raid_disks-1);
+			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_RIGHT_SYMMETRIC_6:
+			pd_idx = stripe % (raid_disks-1);
+			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_PARITY_0_6:
+			pd_idx = 0;
+			(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+
+
 		default:
 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-				conf->algorithm);
+				algorithm);
+			BUG();
 		}
 		break;
 	}

+	if (sh) {
+		sh->pd_idx = pd_idx;
+		sh->qd_idx = qd_idx;
+		sh->ddf_layout = ddf_layout;
+	}
 	/*
 	 * Finally, compute the new sector number
 	 */
@@ -1329,17 +1474,21 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 }

-static sector_t compute_blocknr(struct stripe_head *sh, int i)
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int raid_disks = sh->disks;
 	int data_disks = raid_disks - conf->max_degraded;
 	sector_t new_sector = sh->sector, check;
-	int sectors_per_chunk = conf->chunk_size >> 9;
+	int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+					 : (conf->chunk_size >> 9);
+	int algorithm = previous ?
conf->prev_algo + : conf->algorithm; sector_t stripe; int chunk_offset; - int chunk_number, dummy1, dummy2, dd_idx = i; + int chunk_number, dummy1, dd_idx = i; sector_t r_sector; + struct stripe_head sh2; chunk_offset = sector_div(new_sector, sectors_per_chunk); @@ -1351,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) switch(conf->level) { case 4: break; case 5: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -1363,19 +1512,27 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i += raid_disks; i -= (sh->pd_idx + 1); break; + case ALGORITHM_PARITY_0: + i -= 1; + break; + case ALGORITHM_PARITY_N: + break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", - conf->algorithm); + algorithm); + BUG(); } break; case 6: - if (i == raid6_next_disk(sh->pd_idx, raid_disks)) + if (i == sh->qd_idx) return 0; /* It is the Q disk */ - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: - if (sh->pd_idx == raid_disks-1) - i--; /* Q D D D P */ + case ALGORITHM_ROTATING_ZERO_RESTART: + case ALGORITHM_ROTATING_N_RESTART: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ else if (i > sh->pd_idx) i -= 2; /* D D P Q D */ break; @@ -1390,9 +1547,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i -= (sh->pd_idx + 2); } break; + case ALGORITHM_PARITY_0: + i -= 2; + break; + case ALGORITHM_PARITY_N: + break; + case ALGORITHM_ROTATING_N_CONTINUE: + if (sh->pd_idx == 0) + i--; /* P D D D Q */ + else if (i > sh->pd_idx) + i -= 2; /* D D Q P D */ + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (i < sh->pd_idx) + i += data_disks + 1; + i -= (sh->pd_idx + 1); + break; + case ALGORITHM_PARITY_0_6: + i -= 1; + break; default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + algorithm); + BUG(); } break; } @@ -1400,8 +1583,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) chunk_number = stripe * data_disks + i; r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; - check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); - if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { + check = raid5_compute_sector(conf, r_sector, + previous, &dummy1, &sh2); + if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx + || sh2.qd_idx != sh->qd_idx) { printk(KERN_ERR "compute_blocknr: map not correct\n"); return 0; } @@ -1468,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio, static void compute_parity6(struct stripe_head *sh, int method) { - raid6_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + int syndrome_disks = sh->ddf_layout ? 
disks : (disks - 2); struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; + void *ptrs[syndrome_disks+2]; - qd_idx = raid6_next_disk(pd_idx, disks); - d0_idx = raid6_next_disk(qd_idx, disks); + pd_idx = sh->pd_idx; + qd_idx = sh->qd_idx; + d0_idx = raid6_d0(sh); pr_debug("compute_parity, stripe %llu, method %d\n", (unsigned long long)sh->sector, method); @@ -1513,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method) set_bit(R5_UPTODATE, &sh->dev[i].flags); } -// switch(method) { -// case RECONSTRUCT_WRITE: -// case CHECK_PARITY: -// case UPDATE_PARITY: - /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ - /* FIX: Is this ordering of drives even remotely optimal? */ - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("block %d/%d not uptodate on parity calc\n", i,count); - i = raid6_next_disk(i, disks); - } while ( i != d0_idx ); -// break; -// } - - raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); + /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ + + for (i = 0; i < disks; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + ptrs[slot] = page_address(sh->dev[i].page); + if (slot < syndrome_disks && + !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { + printk(KERN_ERR "block %d/%d not uptodate " + "on parity calc\n", i, count); + BUG(); + } + + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); + + raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); switch(method) { case RECONSTRUCT_WRITE: @@ -1552,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { int i, count, disks = sh->disks; void *ptr[MAX_XOR_BLOCKS], *dest, *p; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); + int qd_idx = sh->qd_idx; pr_debug("compute_block_1, stripe %llu, idx %d\n", (unsigned long long)sh->sector, dd_idx); @@ -1589,63 +1780,65 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) { int i, count, disks = sh->disks; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - int d0_idx = raid6_next_disk(qd_idx, disks); - int faila, failb; + int syndrome_disks = sh->ddf_layout ? disks : disks-2; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[syndrome_disks+2]; - /* faila and failb are disk numbers relative to d0_idx */ - /* pd_idx become disks-2 and qd_idx become disks-1 */ - faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; - failb = (dd_idx2 < d0_idx) ? 
dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; + for (i = 0; i < disks ; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + ptrs[slot] = page_address(sh->dev[i].page); + + if (i == dd_idx1) + faila = slot; + if (i == dd_idx2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); BUG_ON(faila == failb); if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", - (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); + (unsigned long long)sh->sector, dd_idx1, dd_idx2, + faila, failb); - if ( failb == disks-1 ) { + if (failb == syndrome_disks+1) { /* Q disk is one of the missing disks */ - if ( faila == disks-2 ) { + if (faila == syndrome_disks) { /* Missing P+Q, just recompute */ compute_parity6(sh, UPDATE_PARITY); return; } else { /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); + compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? + dd_idx2 : dd_idx1), + 0); compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ return; } } - /* We're missing D+P or D+D; build pointer table */ - { - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; - - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - i = raid6_next_disk(i, disks); - if (i != dd_idx1 && i != dd_idx2 && - !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("compute_2 with missing block %d/%d\n", count, i); - } while ( i != d0_idx ); - - if ( failb == disks-2 ) { - /* We're missing D+P. */ - raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); - } else { - /* We're missing D+D. */ - raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); - } - - /* Both the above update both missing blocks */ - set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); - set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); + /* We're missing D+P or D+D; */ + if (failb == syndrome_disks) { + /* We're missing D+P. */ + raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); + } else { + /* We're missing D+D. */ + raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, + ptrs); } + + /* Both the above update both missing blocks */ + set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); + set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); } static void @@ -1800,17 +1993,21 @@ static int page_is_zero(struct page *p) memcmp(a, a+4, STRIPE_SIZE-4)==0); } -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) +static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, + struct stripe_head *sh) { - int sectors_per_chunk = conf->chunk_size >> 9; - int pd_idx, dd_idx; + int sectors_per_chunk = + previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); + int dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); + int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; - raid5_compute_sector(stripe * (disks - conf->max_degraded) + raid5_compute_sector(conf, + stripe * (disks - conf->max_degraded) *sectors_per_chunk + chunk_offset, - disks, disks - conf->max_degraded, - &dd_idx, &pd_idx, conf); - return pd_idx; + previous, + &dd_idx, sh); } static void @@ -2181,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf, struct r6_state *r6s, int disks) { int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; - int qd_idx = r6s->qd_idx; + int qd_idx = sh->qd_idx; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Would I have to read this buffer for reconstruct_write */ @@ -2371,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, int update_p = 0, update_q = 0; struct r5dev *dev; int pd_idx = sh->pd_idx; - int qd_idx = r6s->qd_idx; + int qd_idx = sh->qd_idx; set_bit(STRIPE_HANDLE, &sh->state); @@ -2467,17 +2664,14 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, struct dma_async_tx_descriptor *tx = NULL; clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) - if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { - int dd_idx, pd_idx, j; + if (i != sh->pd_idx && i != sh->qd_idx) { + int dd_idx, j; struct stripe_head *sh2; - sector_t bn = compute_blocknr(sh, i); - sector_t s = raid5_compute_sector(bn, conf->raid_disks, - conf->raid_disks - - conf->max_degraded, &dd_idx, - &pd_idx, conf); - sh2 = get_active_stripe(conf, s, conf->raid_disks, - pd_idx, 1); + sector_t bn = compute_blocknr(sh, i, 1); + sector_t s = raid5_compute_sector(conf, bn, 0, + &dd_idx, NULL); + sh2 = get_active_stripe(conf, s, 0, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -2500,8 +2694,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j = 0; j < conf->raid_disks; j++) if (j != sh2->pd_idx && - (!r6s || j != raid6_next_disk(sh2->pd_idx, - sh2->disks)) && + (!r6s || j != sh2->qd_idx) && !test_bit(R5_Expanded, &sh2->dev[j].flags)) break; if (j == conf->raid_disks) { @@ -2750,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. 
+ * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); for (i = conf->raid_disks; i--; ) { @@ -2763,8 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh) !sh->reconstruct_state) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, - conf->raid_disks); + stripe_set_idx(sh->sector, conf, 0, sh); schedule_reconstruction5(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); @@ -2796,20 +3005,19 @@ static bool handle_stripe5(struct stripe_head *sh) static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; struct bio *return_bi = NULL; - int i, pd_idx = sh->pd_idx; + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; struct stripe_head_state s; struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; mdk_rdev_t *blocked_rdev = NULL; - r6s.qd_idx = raid6_next_disk(pd_idx, disks); pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), pd_idx, r6s.qd_idx); + atomic_read(&sh->count), pd_idx, qd_idx); memset(&s, 0, sizeof(s)); spin_lock(&sh->lock); @@ -2920,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) pdev = &sh->dev[pd_idx]; r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); - qdev = &sh->dev[r6s.qd_idx]; - r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) - || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); + qdev = &sh->dev[qd_idx]; + r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) + || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); if ( s.written && ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) @@ -2980,10 +3188,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. 
+ * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, - conf->raid_disks); + stripe_set_idx(sh->sector, conf, 0, sh); compute_parity6(sh, RECONSTRUCT_WRITE); for (i = conf->raid_disks ; i-- ; ) { set_bit(R5_LOCKED, &sh->dev[i].flags); @@ -3134,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q, if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ + if (mddev->new_chunk < mddev->chunk_size) + chunk_sectors = mddev->new_chunk >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; if (max <= biovec->bv_len && bio_sectors == 0) @@ -3149,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) unsigned int chunk_sectors = mddev->chunk_size >> 9; unsigned int bio_sectors = bio->bi_size >> 9; + if (mddev->new_chunk < mddev->chunk_size) + chunk_sectors = mddev->new_chunk >> 9; return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -3255,9 +3483,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - const unsigned int raid_disks = conf->raid_disks; - const unsigned int data_disks = raid_disks - conf->max_degraded; - unsigned int dd_idx, pd_idx; + unsigned int dd_idx; struct bio* align_bi; mdk_rdev_t *rdev; @@ -3266,7 +3492,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) return 0; } /* - * use bio_clone to make a copy of the bio + * use bio_clone to make a copy of the bio */ align_bi = bio_clone(raid_bio, GFP_NOIO); if (!align_bi) @@ -3280,12 +3506,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) /* * compute position */ - align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, - raid_disks, - data_disks, - &dd_idx, - &pd_idx, - conf); + align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, + 0, + &dd_idx, NULL); rcu_read_lock(); rdev = rcu_dereference(conf->disks[dd_idx].rdev); @@ -3377,7 +3600,7 @@ static int make_request(struct request_queue *q, struct bio * bi) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned int dd_idx, pd_idx; + int dd_idx; sector_t new_sector; sector_t logical_sector, last_sector; struct stripe_head *sh; @@ -3400,7 +3623,7 @@ static int make_request(struct request_queue *q, struct bio * bi) if (rw == READ && mddev->reshape_position == MaxSector && chunk_aligned_read(q,bi)) - return 0; + return 0; logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); @@ -3410,26 +3633,31 @@ static int make_request(struct request_queue *q, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int disks, data_disks; + int previous; retry: + previous = 0; + disks = conf->raid_disks; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - if (likely(conf->expand_progress == MaxSector)) - disks = conf->raid_disks; - else { - /* spinlock is needed as expand_progress may be + if (unlikely(conf->reshape_progress != 
			     MaxSector)) {
+			/* spinlock is needed as reshape_progress may be
 			 * 64bit on a 32bit platform, and so it might be
 			 * possible to see a half-updated value
-			 * Ofcourse expand_progress could change after
+			 * Of course reshape_progress could change after
 			 * the lock is dropped, so once we get a reference
 			 * to the stripe that we think it is, we will have
 			 * to check again.
 			 */
 			spin_lock_irq(&conf->device_lock);
-			disks = conf->raid_disks;
-			if (logical_sector >= conf->expand_progress)
+			if (mddev->delta_disks < 0
+			    ? logical_sector < conf->reshape_progress
+			    : logical_sector >= conf->reshape_progress) {
 				disks = conf->previous_raid_disks;
-			else {
-				if (logical_sector >= conf->expand_lo) {
+				previous = 1;
+			} else {
+				if (mddev->delta_disks < 0
+				    ? logical_sector < conf->reshape_safe
+				    : logical_sector >= conf->reshape_safe) {
 					spin_unlock_irq(&conf->device_lock);
 					schedule();
 					goto retry;
@@ -3439,15 +3667,17 @@ static int make_request(struct request_queue *q, struct bio * bi)
 		}
 		data_disks = disks - conf->max_degraded;

-		new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
-						  &dd_idx, &pd_idx, conf);
+		new_sector = raid5_compute_sector(conf, logical_sector,
+						  previous,
+						  &dd_idx, NULL);
 		pr_debug("raid5: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);

-		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
+		sh = get_active_stripe(conf, new_sector, previous,
+				       (bi->bi_rw&RWA_MASK));
 		if (sh) {
-			if (unlikely(conf->expand_progress != MaxSector)) {
+			if (unlikely(previous)) {
 				/* expansion might have moved on while waiting for a
 				 * stripe, so we must do the range check again.
 				 * Expansion could still move past after this
 				 * test, but as we are holding a reference to
 				 * 'sh', we know that if that happens,
 				 *  STRIPE_EXPANDING will get set and the expansion
 				 * won't proceed until we finish with the stripe.
 				 */
 				int must_retry = 0;
 				spin_lock_irq(&conf->device_lock);
-				if (logical_sector < conf->expand_progress &&
-				    disks == conf->previous_raid_disks)
+				if (mddev->delta_disks < 0
+				    ?
logical_sector >= conf->reshape_progress + : logical_sector < conf->reshape_progress) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); @@ -3514,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi) return 0; } +static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks); + static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) { /* reshaping is quite different to recovery/resync so it is @@ -3527,61 +3760,118 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped */ raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int pd_idx; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; int new_data_disks = conf->raid_disks - conf->max_degraded; int i; int dd_idx; - sector_t writepos, safepos, gap; - - if (sector_nr == 0 && - conf->expand_progress != 0) { - /* restarting in the middle, skip the initial sectors */ - sector_nr = conf->expand_progress; + sector_t writepos, readpos, safepos; + sector_t stripe_addr; + int reshape_sectors; + struct list_head stripes; + + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->delta_disks < 0 && + conf->reshape_progress < raid5_size(mddev, 0, 0)) { + sector_nr = raid5_size(mddev, 0, 0) + - conf->reshape_progress; + } else if (mddev->delta_disks > 0 && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); - *skipped = 1; - return sector_nr; + if (sector_nr) { + *skipped = 1; + return sector_nr; + } } + /* We need to process a full chunk at a time. + * If old and new chunk sizes differ, we need to process the + * largest of these + */ + if (mddev->new_chunk > mddev->chunk_size) + reshape_sectors = mddev->new_chunk / 512; + else + reshape_sectors = mddev->chunk_size / 512; + /* we update the metadata when there is more than 3Meg * in the block range (that is rather arbitrary, should * probably be time based) or when the data about to be * copied would over-write the source of the data at * the front of the range. - * i.e. one new_stripe forward from expand_progress new_maps - * to after where expand_lo old_maps to + * i.e. one new_stripe along from reshape_progress new_maps + * to after where reshape_safe old_maps to */ - writepos = conf->expand_progress + - conf->chunk_size/512*(new_data_disks); + writepos = conf->reshape_progress; sector_div(writepos, new_data_disks); - safepos = conf->expand_lo; + readpos = conf->reshape_progress; + sector_div(readpos, data_disks); + safepos = conf->reshape_safe; sector_div(safepos, data_disks); - gap = conf->expand_progress - conf->expand_lo; + if (mddev->delta_disks < 0) { + writepos -= reshape_sectors; + readpos += reshape_sectors; + safepos += reshape_sectors; + } else { + writepos += reshape_sectors; + readpos -= reshape_sectors; + safepos -= reshape_sectors; + } - if (writepos >= safepos || - gap > (new_data_disks)*3000*2 /*3Meg*/) { + /* 'writepos' is the most advanced device address we might write. + * 'readpos' is the least advanced device address we might read. + * 'safepos' is the least address recorded in the metadata as having + * been reshaped. + * If 'readpos' is behind 'writepos', then there is no way that we can + * ensure safety in the face of a crash - that must be done by userspace + * making a backup of the data. 
So in that case there is no particular + * rush to update metadata. + * Otherwise if 'safepos' is behind 'writepos', then we really need to + * update the metadata to advance 'safepos' to match 'readpos' so that + * we can be safe in the event of a crash. + * So we insist on updating metadata if safepos is behind writepos and + * readpos is beyond writepos. + * In any case, update the metadata every 10 seconds. + * Maybe that number should be configurable, but I'm not sure it is + * worth it.... maybe it could be a multiple of safemode_delay??? + */ + if ((mddev->delta_disks < 0 + ? (safepos > writepos && readpos < writepos) + : (safepos < writepos && readpos > writepos)) || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); - mddev->reshape_position = conf->expand_progress; + mddev->reshape_position = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || kthread_should_stop()); spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; + conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } - for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { + if (mddev->delta_disks < 0) { + BUG_ON(conf->reshape_progress == 0); + stripe_addr = writepos; + BUG_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) + - reshape_sectors - stripe_addr + != sector_nr); + } else { + BUG_ON(writepos != sector_nr + reshape_sectors); + stripe_addr = sector_nr; + } + INIT_LIST_HEAD(&stripes); + for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; - pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); - sh = get_active_stripe(conf, sector_nr+i, - conf->raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, stripe_addr+i, 0, 0); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3592,10 +3882,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (j == sh->pd_idx) continue; if (conf->level == 6 && - j == raid6_next_disk(sh->pd_idx, sh->disks)) + j == sh->qd_idx) continue; - s = compute_blocknr(sh, j); - if (s < mddev->array_sectors) { + s = compute_blocknr(sh, j, 0); + if (s < raid5_size(mddev, 0, 0)) { skipped = 1; continue; } @@ -3607,10 +3897,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped set_bit(STRIPE_EXPAND_READY, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } - release_stripe(sh); + list_add(&sh->lru, &stripes); } spin_lock_irq(&conf->device_lock); - conf->expand_progress = (sector_nr + i) * new_data_disks; + if (mddev->delta_disks < 0) + conf->reshape_progress -= reshape_sectors * new_data_disks; + else + conf->reshape_progress += reshape_sectors * new_data_disks; spin_unlock_irq(&conf->device_lock); /* Ok, those stripe are ready. We can start scheduling * reads on the source stripes. @@ -3618,46 +3911,50 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * block on the destination stripes. 
*/ first_sector = - raid5_compute_sector(sector_nr*(new_data_disks), - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); + raid5_compute_sector(conf, stripe_addr*(new_data_disks), + 1, &dd_idx, NULL); last_sector = - raid5_compute_sector((sector_nr+conf->chunk_size/512) - *(new_data_disks) -1, - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); - if (last_sector >= (mddev->size<<1)) - last_sector = (mddev->size<<1)-1; + raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) + *(new_data_disks) - 1), + 1, &dd_idx, NULL); + if (last_sector >= mddev->dev_sectors) + last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - pd_idx = stripe_to_pdidx(first_sector, conf, - conf->previous_raid_disks); - sh = get_active_stripe(conf, first_sector, - conf->previous_raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, first_sector, 1, 0); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); first_sector += STRIPE_SECTORS; } + /* Now that the sources are clearly marked, we can release + * the destination stripes + */ + while (!list_empty(&stripes)) { + sh = list_entry(stripes.next, struct stripe_head, lru); + list_del_init(&sh->lru); + release_stripe(sh); + } /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. */ - sector_nr += conf->chunk_size>>9; + sector_nr += reshape_sectors; if (sector_nr >= mddev->resync_max) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); - mddev->reshape_position = conf->expand_progress; + mddev->reshape_position = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) || kthread_should_stop()); spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; + conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } - return conf->chunk_size>>9; + return reshape_sectors; } /* FIXME go_faster isn't used */ @@ -3665,9 +3962,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int pd_idx; - int raid_disks = conf->raid_disks; - sector_t max_sector = mddev->size << 1; + sector_t max_sector = mddev->dev_sectors; int sync_blocks; int still_degraded = 0; int i; @@ -3675,6 +3970,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ unplug_slaves(mddev); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { end_reshape(conf); return 0; @@ -3705,7 +4001,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski */ if (mddev->degraded >= conf->max_degraded && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - sector_t rv = (mddev->size << 1) - sector_nr; + sector_t rv = mddev->dev_sectors - sector_nr; *skipped = 1; return rv; } @@ -3721,10 +4017,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_cond_end_sync(mddev->bitmap, sector_nr); - pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); - sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); + sh = get_active_stripe(conf, sector_nr, 0, 1); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, sector_nr, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -3766,19 +4061,15 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. */ struct stripe_head *sh; - int dd_idx, pd_idx; + int dd_idx; sector_t sector, logical_sector, last_sector; int scnt = 0; int remaining; int handled = 0; logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - sector = raid5_compute_sector( logical_sector, - conf->raid_disks, - conf->raid_disks - conf->max_degraded, - &dd_idx, - &pd_idx, - conf); + sector = raid5_compute_sector(conf, logical_sector, + 0, &dd_idx, NULL); last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); for (; logical_sector < last_sector; @@ -3790,7 +4081,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); + sh = get_active_stripe(conf, sector, 0, 1); if (!sh) { /* failed to get a stripe - must wait */ @@ -3992,89 +4283,69 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int run(mddev_t *mddev) +static sector_t +raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (!sectors) + sectors = mddev->dev_sectors; + if (!raid_disks) { + /* size is defined by the smallest of previous and new size */ + if (conf->raid_disks < conf->previous_raid_disks) + raid_disks = conf->raid_disks; + else + raid_disks = conf->previous_raid_disks; + } + + sectors &= ~((sector_t)mddev->chunk_size/512 - 1); + sectors &= ~((sector_t)mddev->new_chunk/512 - 1); + return sectors * (raid_disks - conf->max_degraded); +} + +static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; int raid_disk, memory; mdk_rdev_t *rdev; struct disk_info *disk; - int working_disks = 0; - if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { + if (mddev->new_level != 5 + && mddev->new_level != 4 + && mddev->new_level != 6) { printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", - mdname(mddev), mddev->level); - return -EIO; + mdname(mddev), mddev->new_level); + return ERR_PTR(-EIO); } - - if (mddev->chunk_size < PAGE_SIZE) { - printk(KERN_ERR "md/raid5: chunk_size must be at least " - "PAGE_SIZE but %d < %ld\n", - mddev->chunk_size, PAGE_SIZE); - return -EINVAL; + if ((mddev->new_level == 5 + && !algorithm_valid_raid5(mddev->new_layout)) || + (mddev->new_level == 6 + && !algorithm_valid_raid6(mddev->new_layout))) { + 
printk(KERN_ERR "raid5: %s: layout %d not supported\n", + mdname(mddev), mddev->new_layout); + return ERR_PTR(-EIO); } - - if (mddev->reshape_position != MaxSector) { - /* Check that we can continue the reshape. - * Currently only disks can change, it must - * increase, and we must be past the point where - * a stripe over-writes itself - */ - sector_t here_new, here_old; - int old_disks; - int max_degraded = (mddev->level == 5 ? 1 : 2); - - if (mddev->new_level != mddev->level || - mddev->new_layout != mddev->layout || - mddev->new_chunk != mddev->chunk_size) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - if (mddev->delta_disks <= 0) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "(reduce disks) required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - old_disks = mddev->raid_disks - mddev->delta_disks; - /* reshape_position must be on a new-stripe boundary, and one - * further up in new geometry must map after here in old - * geometry. - */ - here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->chunk_size>>9)* - (mddev->raid_disks - max_degraded))) { - printk(KERN_ERR "raid5: reshape_position not " - "on a stripe boundary\n"); - return -EINVAL; - } - /* here_new is the stripe we will write to */ - here_old = mddev->reshape_position; - sector_div(here_old, (mddev->chunk_size>>9)* - (old_disks-max_degraded)); - /* here_old is the first stripe that we might need to read - * from */ - if (here_new >= here_old) { - /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for " - "auto-recovery - aborting.\n"); - return -EINVAL; - } - printk(KERN_INFO "raid5: reshape will continue\n"); - /* OK, we should be able to continue; */ + if (mddev->new_level == 6 && mddev->raid_disks < 4) { + printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); + return ERR_PTR(-EINVAL); } + if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { + printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", + mddev->new_chunk, mdname(mddev)); + return ERR_PTR(-EINVAL); + } - mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); - if ((conf = mddev->private) == NULL) + conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); + if (conf == NULL) goto abort; - if (mddev->reshape_position == MaxSector) { - conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; - } else { - conf->raid_disks = mddev->raid_disks; + + conf->raid_disks = mddev->raid_disks; + if (mddev->reshape_position == MaxSector) + conf->previous_raid_disks = mddev->raid_disks; + else conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; - } conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), GFP_KERNEL); @@ -4086,13 +4357,12 @@ static int run(mddev_t *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; - if (mddev->level == 6) { + if (mddev->new_level == 6) { conf->spare_page = alloc_page(GFP_KERNEL); if (!conf->spare_page) goto abort; } spin_lock_init(&conf->device_lock); - mddev->queue->queue_lock = &conf->device_lock; init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); @@ -4121,47 +4391,134 @@ static int run(mddev_t *mddev) printk(KERN_INFO "raid5: device %s operational as raid" " disk %d\n", bdevname(rdev->bdev,b), raid_disk); - working_disks++; } else /* Cannot rely on bitmap to 
complete recovery */ conf->fullsync = 1; } - /* - * 0 for a fully functional array, 1 or 2 for a degraded array. - */ - mddev->degraded = conf->raid_disks - working_disks; - conf->mddev = mddev; - conf->chunk_size = mddev->chunk_size; - conf->level = mddev->level; + conf->chunk_size = mddev->new_chunk; + conf->level = mddev->new_level; if (conf->level == 6) conf->max_degraded = 2; else conf->max_degraded = 1; - conf->algorithm = mddev->layout; + conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = mddev->reshape_position; - - /* device size must be a multiple of chunk size */ - mddev->size &= ~(mddev->chunk_size/1024 -1); - mddev->resync_max_sectors = mddev->size << 1; + conf->reshape_progress = mddev->reshape_position; + if (conf->reshape_progress != MaxSector) { + conf->prev_chunk = mddev->chunk_size; + conf->prev_algo = mddev->layout; + } - if (conf->level == 6 && conf->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", - mdname(mddev), conf->raid_disks); + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; + if (grow_stripes(conf, conf->max_nr_stripes)) { + printk(KERN_ERR + "raid5: couldn't allocate %dkB for buffers\n", memory); goto abort; - } - if (!conf->chunk_size || conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - conf->chunk_size, mdname(mddev)); + } else + printk(KERN_INFO "raid5: allocated %dkB for %s\n", + memory, mdname(mddev)); + + conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); + if (!conf->thread) { + printk(KERN_ERR + "raid5: couldn't allocate thread for %s\n", + mdname(mddev)); goto abort; } - if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR - "raid5: unsupported parity algorithm %d for %s\n", - conf->algorithm, mdname(mddev)); - goto abort; + + return conf; + + abort: + if (conf) { + shrink_stripes(conf); + safe_put_page(conf->spare_page); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); + return ERR_PTR(-EIO); + } else + return ERR_PTR(-ENOMEM); +} + +static int run(mddev_t *mddev) +{ + raid5_conf_t *conf; + int working_disks = 0; + mdk_rdev_t *rdev; + + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. + * Currently only disks can change, it must + * increase, and we must be past the point where + * a stripe over-writes itself + */ + sector_t here_new, here_old; + int old_disks; + int max_degraded = (mddev->level == 6 ? 2 : 1); + + if (mddev->new_level != mddev->level) { + printk(KERN_ERR "raid5: %s: unsupported reshape " + "required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old + * geometry. 
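+		 * A worked example (illustrative numbers, not from this
+		 * patch): growing a raid5 from 4 to 5 disks with a 64KiB
+		 * chunk gives new stripes of 128*4 = 512 sectors and old
+		 * stripes of 128*3 = 384 sectors.  reshape_position = 1536
+		 * passes both tests: it is a multiple of 512, and
+		 * 1536/512 = 3 is strictly less than 1536/384 = 4, so the
+		 * stripe being written sits below every stripe that may
+		 * still need to be read.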
+ */ + here_new = mddev->reshape_position; + if (sector_div(here_new, (mddev->new_chunk>>9)* + (mddev->raid_disks - max_degraded))) { + printk(KERN_ERR "raid5: reshape_position not " + "on a stripe boundary\n"); + return -EINVAL; + } + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, (mddev->chunk_size>>9)* + (old_disks-max_degraded)); + /* here_old is the first stripe that we might need to read + * from */ + if (here_new >= here_old) { + /* Reading from the same stripe as writing to - bad */ + printk(KERN_ERR "raid5: reshape_position too early for " + "auto-recovery - aborting.\n"); + return -EINVAL; + } + printk(KERN_INFO "raid5: reshape will continue\n"); + /* OK, we should be able to continue; */ + } else { + BUG_ON(mddev->level != mddev->new_level); + BUG_ON(mddev->layout != mddev->new_layout); + BUG_ON(mddev->chunk_size != mddev->new_chunk); + BUG_ON(mddev->delta_disks != 0); } + + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + + if (IS_ERR(conf)) + return PTR_ERR(conf); + + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; + + /* + * 0 for a fully functional array, 1 or 2 for a degraded array. + */ + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0 && + test_bit(In_sync, &rdev->flags)) + working_disks++; + + mddev->degraded = conf->raid_disks - working_disks; + if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", @@ -4169,6 +4526,10 @@ static int run(mddev_t *mddev) goto abort; } + /* device size must be a multiple of chunk size */ + mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); + mddev->resync_max_sectors = mddev->dev_sectors; + if (mddev->degraded > 0 && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) @@ -4184,43 +4545,22 @@ static int run(mddev_t *mddev) } } - { - mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); - if (!mddev->thread) { - printk(KERN_ERR - "raid5: couldn't allocate thread for %s\n", - mdname(mddev)); - goto abort; - } - } - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { - printk(KERN_ERR - "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(conf); - md_unregister_thread(mddev->thread); - goto abort; - } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", - memory, mdname(mddev)); - if (mddev->degraded == 0) printk("raid5: raid level %d set %s active with %d out of %d" - " devices, algorithm %d\n", conf->level, mdname(mddev), - mddev->raid_disks-mddev->degraded, mddev->raid_disks, - conf->algorithm); + " devices, algorithm %d\n", conf->level, mdname(mddev), + mddev->raid_disks-mddev->degraded, mddev->raid_disks, + mddev->new_layout); else printk(KERN_ALERT "raid5: raid level %d set %s active with %d" " out of %d devices, algorithm %d\n", conf->level, mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks, conf->algorithm); + mddev->raid_disks, mddev->new_layout); print_raid5_conf(conf); - if (conf->expand_progress != MaxSector) { + if (conf->reshape_progress != MaxSector) { printk("...ok start reshape thread\n"); - conf->expand_lo = conf->expand_progress; + conf->reshape_safe = conf->reshape_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, 
&mddev->recovery); @@ -4247,18 +4587,22 @@ static int run(mddev_t *mddev) "raid5: failed to create sysfs attributes for %s\n", mdname(mddev)); + mddev->queue->queue_lock = &conf->device_lock; + mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - - conf->max_degraded); + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); return 0; abort: + md_unregister_thread(mddev->thread); + mddev->thread = NULL; if (conf) { + shrink_stripes(conf); print_raid5_conf(conf); safe_put_page(conf->spare_page); kfree(conf->disks); @@ -4396,6 +4740,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number) print_raid5_conf(conf); rdev = p->rdev; if (rdev) { + if (number >= conf->raid_disks && + conf->reshape_progress == MaxSector) + clear_bit(In_sync, &rdev->flags); + if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; @@ -4405,7 +4753,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. */ if (!test_bit(Faulty, &rdev->flags) && - mddev->degraded <= conf->max_degraded) { + mddev->degraded <= conf->max_degraded && + number < conf->raid_disks) { err = -EBUSY; goto abort; } @@ -4472,36 +4821,48 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - raid5_conf_t *conf = mddev_to_conf(mddev); - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_sectors = sectors * (mddev->raid_disks - - conf->max_degraded); + md_set_array_sectors(mddev, raid5_size(mddev, sectors, + mddev->raid_disks)); + if (mddev->array_sectors > + raid5_size(mddev, sectors, mddev->raid_disks)) + return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { - mddev->recovery_cp = mddev->size << 1; + if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { + mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = sectors /2; + mddev->dev_sectors = sectors; mddev->resync_max_sectors = sectors; return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); - int err; - if (mddev->delta_disks < 0 || - mddev->new_level != mddev->level) - return -EINVAL; /* Cannot shrink array or change level yet */ - if (mddev->delta_disks == 0) - return 0; /* nothing to do */ + if (mddev->delta_disks == 0 && + mddev->new_layout == mddev->layout && + mddev->new_chunk == mddev->chunk_size) + return -EINVAL; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; + if (mddev->degraded > conf->max_degraded) + return -EINVAL; + if (mddev->delta_disks < 0) { + /* We might be able to shrink, but the devices must + * be made bigger first. + * For raid6, 4 is the minimum size. + * Otherwise 2 is the minimum + */ + int min = 2; + if (mddev->level == 6) + min = 4; + if (mddev->raid_disks + mddev->delta_disks < min) + return -EINVAL; + } /* Can only proceed if there are plenty of stripe_heads. 
 * We need a minimum of one full stripe, and for sensible progress
@@ -4514,18 +4875,12 @@ static int raid5_check_reshape(mddev_t *mddev)
 	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
 	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
 		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
-		       (mddev->chunk_size / STRIPE_SIZE)*4);
+		       (max(mddev->chunk_size, mddev->new_chunk)
+			/ STRIPE_SIZE)*4);
 		return -ENOSPC;
 	}
 
-	err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
-	if (err)
-		return err;
-
-	if (mddev->degraded > conf->max_degraded)
-		return -EINVAL;
-
-	/* looks like we might be able to manage this */
-	return 0;
+	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
 }
 
 static int raid5_start_reshape(mddev_t *mddev)
@@ -4550,12 +4905,31 @@ static int raid5_start_reshape(mddev_t *mddev)
 		 */
 		return -EINVAL;
 
+	/* Refuse to reduce size of the array. Any reductions in
+	 * array size must be through explicit setting of array_size
+	 * attribute.
+	 */
+	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
+	    < mddev->array_sectors) {
+		printk(KERN_ERR "md: %s: array size must be reduced "
+		       "before number of disks\n", mdname(mddev));
+		return -EINVAL;
+	}
+
 	atomic_set(&conf->reshape_stripes, 0);
 	spin_lock_irq(&conf->device_lock);
 	conf->previous_raid_disks = conf->raid_disks;
 	conf->raid_disks += mddev->delta_disks;
-	conf->expand_progress = 0;
-	conf->expand_lo = 0;
+	conf->prev_chunk = conf->chunk_size;
+	conf->chunk_size = mddev->new_chunk;
+	conf->prev_algo = conf->algorithm;
+	conf->algorithm = mddev->new_layout;
+	if (mddev->delta_disks < 0)
+		conf->reshape_progress = raid5_size(mddev, 0, 0);
+	else
+		conf->reshape_progress = 0;
+	conf->reshape_safe = conf->reshape_progress;
+	conf->generation++;
 	spin_unlock_irq(&conf->device_lock);
 
 	/* Add some new drives, as many as will fit.
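The stripe-cache requirement enforced by raid5_check_reshape() above can be
reproduced in userspace.  A minimal C sketch, with assumed values for
STRIPE_SIZE and the default cache size NR_STRIPES (neither constant is
quoted from this patch):

	#include <stdio.h>

	#define STRIPE_SIZE	4096	/* one page per cached stripe (assumed) */
	#define NR_STRIPES	256	/* default stripe cache size (assumed) */

	/* Mirrors the heuristic: a reshape wants 4 times the number of
	 * stripe_heads needed to cover the larger of old and new chunk. */
	static int enough_stripes(int chunk_size, int new_chunk)
	{
		int chunk = chunk_size > new_chunk ? chunk_size : new_chunk;

		return (chunk / STRIPE_SIZE) * 4 <= NR_STRIPES;
	}

	int main(void)
	{
		printf("64k->128k: %d\n", enough_stripes(65536, 131072)); /* 1 */
		printf("64k->512k: %d\n", enough_stripes(65536, 524288)); /* 0 */
		return 0;
	}

The factor of four is a heuristic: presumably one stripe of buffers for the
old geometry, one for the new, and headroom so normal I/O can still make
progress while the reshape runs.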
@@ -4580,9 +4954,12 @@ static int raid5_start_reshape(mddev_t *mddev) break; } - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; - spin_unlock_irqrestore(&conf->device_lock, flags); + if (mddev->delta_disks > 0) { + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) + - added_devices; + spin_unlock_irqrestore(&conf->device_lock, flags); + } mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -4597,52 +4974,86 @@ static int raid5_start_reshape(mddev_t *mddev) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; - conf->expand_progress = MaxSector; + conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; } + conf->reshape_checkpoint = jiffies; md_wakeup_thread(mddev->sync_thread); md_new_event(mddev); return 0; } -#endif +/* This is called from the reshape thread and should make any + * changes needed in 'conf' + */ static void end_reshape(raid5_conf_t *conf) { - struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_sectors = 2 * conf->mddev->size * - (conf->raid_disks - conf->max_degraded); - set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); - conf->mddev->changed = 1; - - bdev = bdget_disk(conf->mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)conf->mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } + spin_lock_irq(&conf->device_lock); - conf->expand_progress = MaxSector; + conf->previous_raid_disks = conf->raid_disks; + conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); - conf->mddev->reshape_position = MaxSector; + wake_up(&conf->wait_for_overlap); /* read-ahead size must cover two whole stripes, which is * 2 * (datadisks) * chunksize where 'n' is the number of raid devices */ { - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - (conf->mddev->chunk_size / PAGE_SIZE); + int data_disks = conf->raid_disks - conf->max_degraded; + int stripe = data_disks * (conf->chunk_size + / PAGE_SIZE); if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } } } +/* This is called from the raid5d thread with mddev_lock held. + * It makes config changes to the device. 
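+ * (A sketch of what that means here, nothing beyond the code below: on a
+ * grow the extra capacity is published once the reshape has finished,
+ * while on a shrink the capacity was already clipped through the
+ * array_size attribute before the reshape started, so only the degraded
+ * count and the now-unused devices still need updating.)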
+ */
+static void raid5_finish_reshape(mddev_t *mddev)
+{
+	struct block_device *bdev;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+
+	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+
+		if (mddev->delta_disks > 0) {
+			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
+			set_capacity(mddev->gendisk, mddev->array_sectors);
+			mddev->changed = 1;
+
+			bdev = bdget_disk(mddev->gendisk, 0);
+			if (bdev) {
+				mutex_lock(&bdev->bd_inode->i_mutex);
+				i_size_write(bdev->bd_inode,
+					     (loff_t)mddev->array_sectors << 9);
+				mutex_unlock(&bdev->bd_inode->i_mutex);
+				bdput(bdev);
+			}
+		} else {
+			int d;
+			mddev->degraded = conf->raid_disks;
+			for (d = 0; d < conf->raid_disks ; d++)
+				if (conf->disks[d].rdev &&
+				    test_bit(In_sync,
+					     &conf->disks[d].rdev->flags))
+					mddev->degraded--;
+			for (d = conf->raid_disks ;
+			     d < conf->raid_disks - mddev->delta_disks;
+			     d++)
+				raid5_remove_disk(mddev, d);
+		}
+		mddev->layout = conf->algorithm;
+		mddev->chunk_size = conf->chunk_size;
+		mddev->reshape_position = MaxSector;
+		mddev->delta_disks = 0;
+	}
+}
+
 static void raid5_quiesce(mddev_t *mddev, int state)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4672,6 +5083,212 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 	}
 }
+
+static void *raid5_takeover_raid1(mddev_t *mddev)
+{
+	int chunksect;
+
+	if (mddev->raid_disks != 2 ||
+	    mddev->degraded > 1)
+		return ERR_PTR(-EINVAL);
+
+	/* Should check if there are write-behind devices? */
+
+	chunksect = 64*2; /* 64K by default */
+
+	/* The array must be an exact multiple of chunksize */
+	while (chunksect && (mddev->array_sectors & (chunksect-1)))
+		chunksect >>= 1;
+
+	if ((chunksect<<9) < STRIPE_SIZE)
+		/* array size does not allow a suitable chunk size */
+		return ERR_PTR(-EINVAL);
+
+	mddev->new_level = 5;
+	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
+	mddev->new_chunk = chunksect << 9;
+
+	return setup_conf(mddev);
+}
+
+static void *raid5_takeover_raid6(mddev_t *mddev)
+{
+	int new_layout;
+
+	switch (mddev->layout) {
+	case ALGORITHM_LEFT_ASYMMETRIC_6:
+		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
+		break;
+	case ALGORITHM_RIGHT_ASYMMETRIC_6:
+		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
+		break;
+	case ALGORITHM_LEFT_SYMMETRIC_6:
+		new_layout = ALGORITHM_LEFT_SYMMETRIC;
+		break;
+	case ALGORITHM_RIGHT_SYMMETRIC_6:
+		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
+		break;
+	case ALGORITHM_PARITY_0_6:
+		new_layout = ALGORITHM_PARITY_0;
+		break;
+	case ALGORITHM_PARITY_N:
+		new_layout = ALGORITHM_PARITY_N;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+	mddev->new_level = 5;
+	mddev->new_layout = new_layout;
+	mddev->delta_disks = -1;
+	mddev->raid_disks -= 1;
+	return setup_conf(mddev);
+}
+
+
+static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+{
+	/* For a 2-drive array, the layout and chunk size can be changed
+	 * immediately as no restriping is needed.
+	 * For larger arrays we record the new value - after validation
+	 * to be used by a reshape pass.
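+	 * (Why 2 drives are special: each stripe then holds one data
+	 * block plus a parity block that is its XOR, i.e. an identical
+	 * copy, so relabelling the layout or the chunk boundaries moves
+	 * no data on disk.)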
+ */ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (new_chunk & (new_chunk-1)) + /* not a power of 2 */ + return -EINVAL; + if (new_chunk < PAGE_SIZE) + return -EINVAL; + if (mddev->array_sectors & ((new_chunk>>9)-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + + if (mddev->raid_disks == 2) { + + if (new_layout >= 0) { + conf->algorithm = new_layout; + mddev->layout = mddev->new_layout = new_layout; + } + if (new_chunk > 0) { + conf->chunk_size = new_chunk; + mddev->chunk_size = mddev->new_chunk = new_chunk; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + } else { + if (new_layout >= 0) + mddev->new_layout = new_layout; + if (new_chunk > 0) + mddev->new_chunk = new_chunk; + } + return 0; +} + +static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +{ + if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (new_chunk & (new_chunk-1)) + /* not a power of 2 */ + return -EINVAL; + if (new_chunk < PAGE_SIZE) + return -EINVAL; + if (mddev->array_sectors & ((new_chunk>>9)-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + + if (new_layout >= 0) + mddev->new_layout = new_layout; + if (new_chunk > 0) + mddev->new_chunk = new_chunk; + + return 0; +} + +static void *raid5_takeover(mddev_t *mddev) +{ + /* raid5 can take over: + * raid0 - if all devices are the same - make it a raid4 layout + * raid1 - if there are two drives. We need to know the chunk size + * raid4 - trivial - just use a raid4 layout. + * raid6 - Providing it is a *_6 layout + * + * For now, just do raid1 + */ + + if (mddev->level == 1) + return raid5_takeover_raid1(mddev); + if (mddev->level == 4) { + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_level = 5; + return setup_conf(mddev); + } + if (mddev->level == 6) + return raid5_takeover_raid6(mddev); + + return ERR_PTR(-EINVAL); +} + + +static struct mdk_personality raid5_personality; + +static void *raid6_takeover(mddev_t *mddev) +{ + /* Currently can only take over a raid5. We map the + * personality to an equivalent raid6 personality + * with the Q block at the end. 
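+	 * For example, ALGORITHM_LEFT_SYMMETRIC maps to
+	 * ALGORITHM_LEFT_SYMMETRIC_6: data and P stay exactly where
+	 * raid5 placed them and Q occupies the added last device, so no
+	 * existing block moves; the array simply starts out degraded
+	 * until Q has been computed.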
+ */ + int new_layout; + + if (mddev->pers != &raid5_personality) + return ERR_PTR(-EINVAL); + if (mddev->degraded > 1) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks > 253) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks < 3) + return ERR_PTR(-EINVAL); + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + new_layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + new_layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 6; + mddev->new_layout = new_layout; + mddev->delta_disks = 1; + mddev->raid_disks += 1; + return setup_conf(mddev); +} + + static struct mdk_personality raid6_personality = { .name = "raid6", @@ -4687,11 +5304,13 @@ static struct mdk_personality raid6_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, -#ifdef CONFIG_MD_RAID5_RESHAPE + .size = raid5_size, .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -#endif + .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid6_takeover, + .reconfig = raid6_reconfig, }; static struct mdk_personality raid5_personality = { @@ -4708,11 +5327,13 @@ static struct mdk_personality raid5_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, -#ifdef CONFIG_MD_RAID5_RESHAPE + .size = raid5_size, .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -#endif + .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid5_takeover, + .reconfig = raid5_reconfig, }; static struct mdk_personality raid4_personality = @@ -4730,20 +5351,15 @@ static struct mdk_personality raid4_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, -#ifdef CONFIG_MD_RAID5_RESHAPE + .size = raid5_size, .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -#endif + .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, }; static int __init raid5_init(void) { - int e; - - e = raid6_select_algo(); - if ( e ) - return e; register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); diff --git a/include/linux/raid/raid5.h b/drivers/md/raid5.h similarity index 81% rename from include/linux/raid/raid5.h rename to drivers/md/raid5.h index 3b2672792457ff08b991acb78e619f3daa3dab6e..52ba99954decf0ceee11bc30da2213a7c6619983 100644 --- a/include/linux/raid/raid5.h +++ b/drivers/md/raid5.h @@ -1,7 +1,6 @@ #ifndef _RAID5_H #define _RAID5_H -#include #include /* @@ -197,15 +196,19 @@ enum reconstruct_states { struct stripe_head { struct hlist_node hash; - struct list_head lru; /* inactive_list or handle_list */ - struct raid5_private_data *raid_conf; - sector_t sector; /* sector of this row */ - int pd_idx; /* parity disk index */ - unsigned long state; /* state flags */ - atomic_t count; /* nr of active thread/requests */ + struct list_head lru; /* inactive_list or handle_list */ + struct raid5_private_data *raid_conf; + short generation; /* increments with every + * reshape */ + sector_t sector; /* sector of this 
row */ + short pd_idx; /* parity disk index */ + short qd_idx; /* 'Q' disk index for raid6 */ + short ddf_layout;/* use DDF ordering to calculate Q */ + unsigned long state; /* state flags */ + atomic_t count; /* nr of active thread/requests */ spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ - int disks; /* disks in stripe */ + int disks; /* disks in stripe */ enum check_states check_state; enum reconstruct_states reconstruct_state; /* stripe_operations @@ -238,7 +241,7 @@ struct stripe_head_state { /* r6_state - extra state data only relevant to r6 */ struct r6_state { - int p_failed, q_failed, qd_idx, failed_num[2]; + int p_failed, q_failed, failed_num[2]; }; /* Flags */ @@ -268,6 +271,8 @@ struct r6_state { #define READ_MODIFY_WRITE 2 /* not a write method, but a compute_parity mode */ #define CHECK_PARITY 3 +/* Additional compute_parity mode -- updates the parity w/o LOCKING */ +#define UPDATE_PARITY 4 /* * Stripe state @@ -319,7 +324,7 @@ struct r6_state { * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. * HANDLE gets cleared if stripe_handle leave nothing locked. */ - + struct disk_info { mdk_rdev_t *rdev; @@ -334,12 +339,21 @@ struct raid5_private_data { int raid_disks; int max_nr_stripes; - /* used during an expand */ - sector_t expand_progress; /* MaxSector when no expand happening */ - sector_t expand_lo; /* from here up to expand_progress it out-of-bounds - * as we haven't flushed the metadata yet - */ + /* reshape_progress is the leading edge of a 'reshape' + * It has value MaxSector when no reshape is happening + * If delta_disks < 0, it is the last sector we started work on, + * else is it the next sector to work on. + */ + sector_t reshape_progress; + /* reshape_safe is the trailing edge of a reshape. We know that + * before (or after) this address, all reshape has completed. + */ + sector_t reshape_safe; int previous_raid_disks; + int prev_chunk, prev_algo; + short generation; /* increments with every reshape */ + unsigned long reshape_checkpoint; /* Time we last updated + * metadata */ struct list_head handle_list; /* stripes needing handling */ struct list_head hold_list; /* preread ready stripes */ @@ -385,6 +399,11 @@ struct raid5_private_data { int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct mdk_thread_s *thread; }; typedef struct raid5_private_data raid5_conf_t; @@ -394,9 +413,62 @@ typedef struct raid5_private_data raid5_conf_t; /* * Our supported algorithms */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 +#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */ +#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */ +#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */ +#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */ + +/* Define non-rotating (raid4) algorithms. These allow + * conversion of raid4 to raid5. + */ +#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ +#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */ + +/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. 
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the "_N_*" modes
+ * of DDF.
+ * Secondly, the order of datablocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART	8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART	9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE	10 /* DDF PRL=6 RLQ=3 */
+
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6	16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6	17
+#define ALGORITHM_LEFT_SYMMETRIC_6	18
+#define ALGORITHM_RIGHT_SYMMETRIC_6	19
+#define ALGORITHM_PARITY_0_6		20
+#define ALGORITHM_PARITY_N_6		ALGORITHM_PARITY_N
+
+static inline int algorithm_valid_raid5(int layout)
+{
+	return (layout >= 0) &&
+		(layout <= 5);
+}
+static inline int algorithm_valid_raid6(int layout)
+{
+	return (layout >= 0 && layout <= 5)
+		||
+		(layout >= 8 && layout <= 10)
+		||
+		(layout >= 16 && layout <= 20);
+}
+static inline int algorithm_is_DDF(int layout)
+{
+	return layout >= 8 && layout <= 10;
+}
 #endif
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 21987e3dbe6c0ce4199fcc25f3e81506c337713f..866215ac7f2554f86733cc900f4b902b3f60bc76 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -5,7 +5,7 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
  * (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -16,13 +16,20 @@
  * Algorithm list and algorithm selection for RAID-6
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #ifndef __KERNEL__
 #include <sys/mman.h>
 #include <stdio.h>
+#else
+#if !RAID6_USE_EMPTY_ZERO_PAGE
+/* In .bss so it's zeroed */
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+EXPORT_SYMBOL(raid6_empty_zero_page);
+#endif
 #endif
 
 struct raid6_calls raid6_call;
+EXPORT_SYMBOL_GPL(raid6_call);
 
 /* Various routine sets */
 extern const struct raid6_calls raid6_intx1;
@@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = {
 #else
 /* Need more time to be stable in userspace */
 #define RAID6_TIME_JIFFIES_LG2	9
+#define time_before(x, y) ((x) < (y))
 #endif
 
 /* Try to pick the best algorithm */
@@ -152,3 +160,12 @@ int __init raid6_select_algo(void)
 
 	return best ? 0 : -EINVAL;
 }
+
+static void raid6_exit(void)
+{
+	do { } while (0);
+}
+
+subsys_initcall(raid6_select_algo);
+module_exit(raid6_exit);
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc
index b9afd35b8812cf46c1994a37b99f4e8935680614..699dfeee494459afdd1ba2ff3b7a7e90c063b12f 100644
--- a/drivers/md/raid6altivec.uc
+++ b/drivers/md/raid6altivec.uc
@@ -5,7 +5,7 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
  * (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -22,7 +22,7 @@
  * bracked this with preempt_disable/enable or in a lock)
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 #ifdef CONFIG_ALTIVEC
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc
index ad004cee0e261c414e989fec6e9ae4e7a7884011..f9bf9cba357fd1202b6bef088a7e26b49fed6ab2 100644
--- a/drivers/md/raid6int.uc
+++ b/drivers/md/raid6int.uc
@@ -5,7 +5,7 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
  * (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
  * This file is postprocessed using unroll.pl
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 /*
  * This is the C data type to use
diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c
index d4e4a1bd70ad2f0841a8dd4036a167dfd1e9a3c0..e7f6c13132bfd12e2f145a337ed954659198d971 100644
--- a/drivers/md/raid6mmx.c
+++ b/drivers/md/raid6mmx.c
@@ -5,7 +5,7 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
  * (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
 
 #if defined(__i386__) && !defined(__arch_um__)
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #include "raid6x86.h"
 
 /* Shared with raid6sse1.c */
diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c
index a8c4d9451bd901da3172447f7bd00536ceec794a..2609f00e0d61ed8c5b80347704c4bf6d9fbae29f 100644
--- a/drivers/md/raid6recov.c
+++ b/drivers/md/raid6recov.c
@@ -5,7 +5,7 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
  * (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
  * the syndrome.)
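 * (Sketch of the arithmetic, following the usual GF(2^8) treatment:
 * with data blocks a and b lost, recompute the syndromes Pxy and Qxy
 * over the surviving blocks, then
 *     Db = (P ^ Pxy) * inv(g^(b-a) ^ 1)  ^  (Q ^ Qxy) * inv(g^a ^ g^b)
 *     Da = (P ^ Pxy) ^ Db
 * where g is the generator of the field, '*' is GF(2^8) multiplication
 * and the inverses come from the precomputed tables used below.)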
 */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 /* Recover two failed data blocks. */
 void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
@@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 		p++; q++;
 	}
 }
-
-
-
+EXPORT_SYMBOL_GPL(raid6_2data_recov);
 
 /* Recover failure of one data block plus the P block */
 void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
@@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
 		q++; dq++;
 	}
 }
+EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
-
-#ifndef __KERNEL__ /* Testing only */
+#ifndef __KERNEL__
+/* Testing only */
 /* Recover two failed blocks. */
 void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
 		      void **ptrs)
diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c
index 0666237276ff37283e06aacc8980340eada56840..b274dd5eab8f1a39aa3706e0d012bfeb8cef92f2 100644
--- a/drivers/md/raid6sse1.c
+++ b/drivers/md/raid6sse1.c
@@ -5,7 +5,7 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
  * (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -23,7 +23,7 @@
 
 #if defined(__i386__) && !defined(__arch_um__)
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #include "raid6x86.h"
 
 /* Defined in raid6mmx.c */
diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c
index b034ad8680397e0ab62eff5d55ddf6f5820979eb..6ed6c6c0389f47aef453c8708defd7f01e4d1c8f 100644
--- a/drivers/md/raid6sse2.c
+++ b/drivers/md/raid6sse2.c
@@ -5,7 +5,7 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
  * (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -19,7 +19,7 @@
 
 #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #include "raid6x86.h"
 
 static const struct raid6_sse_constants {
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile
index 78e0396adf2ac28ab8cc23f0427303b10d2207b2..58ffdf4f51619caa05253d5d42abb520ae5c8e20 100644
--- a/drivers/md/raid6test/Makefile
+++ b/drivers/md/raid6test/Makefile
@@ -5,7 +5,7 @@
 
 CC	 = gcc
 OPTFLAGS = -O2			# Adjust as desired
-CFLAGS	 = -I.. -g $(OPTFLAGS)
+CFLAGS	 = -I..
-I ../../../include -g $(OPTFLAGS) LD = ld PERL = perl AR = ar diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c index 559cc41b258566d4b0437241559156fbbc574721..7a930318b17d60c406bb1d421ba406c0f3bcb139 100644 --- a/drivers/md/raid6test/test.c +++ b/drivers/md/raid6test/test.c @@ -17,7 +17,7 @@ #include #include #include -#include "raid6.h" +#include #define NDISKS 16 /* Including P and Q */ diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h index 99fea7a70ca70ace27cca159e039b0ab318368ae..4c22c1568558b7c92dac1b456559d9981cc2ae3c 100644 --- a/drivers/md/raid6x86.h +++ b/drivers/md/raid6x86.h @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index ff786687e93b7289e74f0587c170ca5a2f1006c0..3e87ce443ea2c0fd39f32743d56d9cdcd89175d2 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h deleted file mode 100644 index 82bea14cae1a7a948aafe1a1a1329aa1a9e6037d..0000000000000000000000000000000000000000 --- a/include/linux/raid/md.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - md.h : Multiple Devices driver for Linux - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - Copyright (C) 1994-96 Marc ZYNGIER - or - - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_H -#define _MD_H - -#include -#include - -/* - * 'md_p.h' holds the 'physical' layout of RAID devices - * 'md_u.h' holds the user <=> kernel API - * - * 'md_k.h' holds kernel internal definitions - */ - -#include -#include -#include - -#ifdef CONFIG_MD - -/* - * Different major versions are not compatible. - * Different minor versions are only downward compatible. - * Different patchlevel versions are downward and upward compatible. - */ -#define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 90 -/* - * MD_PATCHLEVEL_VERSION indicates kernel functionality. 
- * >=1 means different superblock formats are selectable using SET_ARRAY_INFO
- *     and major_version/minor_version accordingly
- * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT
- *     in the super status byte
- * >=3 means that bitmap superblock version 4 is supported, which uses
- *     little-ending representation rather than host-endian
-*/
-#define MD_PATCHLEVEL_VERSION	3
-
-extern int mdp_major;
-
-extern int register_md_personality(struct mdk_personality *p);
-extern int unregister_md_personality(struct mdk_personality *p);
-extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
-				mddev_t *mddev, const char *name);
-extern void md_unregister_thread(mdk_thread_t *thread);
-extern void md_wakeup_thread(mdk_thread_t *thread);
-extern void md_check_recovery(mddev_t *mddev);
-extern void md_write_start(mddev_t *mddev, struct bio *bi);
-extern void md_write_end(mddev_t *mddev);
-extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
-extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
-
-extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
-			   sector_t sector, int size, struct page *page);
-extern void md_super_wait(mddev_t *mddev);
-extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
-			struct page *page, int rw);
-extern void md_do_sync(mddev_t *mddev);
-extern void md_new_event(mddev_t *mddev);
-extern int md_allow_write(mddev_t *mddev);
-extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
-
-#endif /* CONFIG_MD */
-#endif
-
diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h
index 7192035fc4b0680094f3c4f0c2780a79c20dfd1b..fb1abb3367e9520a7e68cf738709e7510abe80f0 100644
--- a/include/linux/raid/md_u.h
+++ b/include/linux/raid/md_u.h
@@ -15,6 +15,24 @@
 #ifndef _MD_U_H
 #define _MD_U_H
 
+/*
+ * Different major versions are not compatible.
+ * Different minor versions are only downward compatible.
+ * Different patchlevel versions are downward and upward compatible.
+ */
+#define MD_MAJOR_VERSION	0
+#define MD_MINOR_VERSION	90
+/*
+ * MD_PATCHLEVEL_VERSION indicates kernel functionality.
+ * >=1 means different superblock formats are selectable using SET_ARRAY_INFO
+ *     and major_version/minor_version accordingly
+ * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT
+ *     in the super status byte
+ * >=3 means that bitmap superblock version 4 is supported, which uses
+ *     little-endian representation rather than host-endian
+ */
+#define MD_PATCHLEVEL_VERSION	3
+
 /* ioctls */
 
 /* status */
@@ -46,6 +64,12 @@
 #define STOP_ARRAY_RO		_IO (MD_MAJOR, 0x33)
 #define RESTART_ARRAY_RW	_IO (MD_MAJOR, 0x34)
 
+/* 63 partitions with the alternate major number (mdp) */
+#define MdpMinorShift 6
+#ifdef __KERNEL__
+extern int mdp_major;
+#endif
+
 typedef struct mdu_version_s {
 	int major;
 	int minor;
@@ -85,6 +109,17 @@ typedef struct mdu_array_info_s {
 
 } mdu_array_info_t;
 
+/* non-obvious values for 'level' */
+#define	LEVEL_MULTIPATH		(-4)
+#define	LEVEL_LINEAR		(-1)
+#define	LEVEL_FAULTY		(-5)
+
+/* we need a value for 'no level specified' and 0
+ * means 'raid0', so we need something else.
This is + * for internal use only + */ +#define LEVEL_NONE (-1000000) + typedef struct mdu_disk_info_s { /* * configuration/status of one particular disk diff --git a/drivers/md/raid6.h b/include/linux/raid/pq.h similarity index 86% rename from drivers/md/raid6.h rename to include/linux/raid/pq.h index 98dcde88470e4c23b8fc204e46587be0243bfa32..d92480f8285c76bec5b67fb5950fdc12f2d06c3c 100644 --- a/drivers/md/raid6.h +++ b/include/linux/raid/pq.h @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -17,14 +17,7 @@ /* Set to 1 to use kernel-wide empty_zero_page */ #define RAID6_USE_EMPTY_ZERO_PAGE 0 - -#include -#include - -typedef raid5_conf_t raid6_conf_t; /* Same configuration */ - -/* Additional compute_parity mode -- updates the parity w/o LOCKING */ -#define UPDATE_PARITY 4 +#include /* We need a pre-zeroed page... if we don't want to use the kernel-provided one define it here */ @@ -68,6 +61,10 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; #define enable_kernel_altivec() #define disable_kernel_altivec() +#define EXPORT_SYMBOL(sym) +#define MODULE_LICENSE(licence) +#define subsys_initcall(x) +#define module_exit(x) #endif /* __KERNEL__ */ /* Routine choices */ @@ -98,9 +95,11 @@ extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); /* Recovery routines */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); -void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); /* Some definitions to allow code to be compiled for testing in userspace */ #ifndef __KERNEL__ @@ -108,8 +107,11 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs # define jiffies raid6_jiffies() # define printk printf # define GFP_KERNEL 0 -# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) -# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE) +# define __get_free_pages(x, y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \ + PROT_READ|PROT_WRITE, \ + MAP_PRIVATE|MAP_ANONYMOUS,\ + 0, 0)) +# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE) static inline void cpu_relax(void) { diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 3e120587eadac53669053e040db23b747ad4a91f..5a210959e3f8a71f5251ccf325bc52f629df124a 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -1,8 +1,6 @@ #ifndef _XOR_H #define _XOR_H -#include - #define MAX_XOR_BLOCKS 4 extern void xor_blocks(unsigned int count, unsigned int bytes, diff --git a/init/do_mounts.h b/init/do_mounts.h index 9aa968d5432937a3aeb799d87ff42e87dfb5c871..f5b978a9bb92892a5e876ae3ce1338ad8a896e04 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -1,4 +1,5 @@ #include +#include #include 
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
 
diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c
index 9bdddbcb3d6a62614b0d43783abc9a265948bdde..69aebbf8fd2dbf4ebeb301cf1fa590d3c864b9c9 100644
--- a/init/do_mounts_md.c
+++ b/init/do_mounts_md.c
@@ -1,5 +1,6 @@
 #include <linux/delay.h>
-#include <linux/raid/md.h>
+#include <linux/raid/md_u.h>
+#include <linux/raid/md_p.h>
 
 #include "do_mounts.h"
 
@@ -112,8 +113,6 @@ static int __init md_setup(char *str)
 	return 1;
 }
 
-#define MdpMinorShift 6
-
 static void __init md_setup_drive(void)
 {
 	int minor, i, ent, partitioned;