提交 2a4c32ed 编写于 作者: L Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:

 - a raid5 writeback cache feature.

   The goal is to aggregate writes to make full stripe write and reduce
   read-modify-write. It's helpful for workload which does sequential
   write and follows fsync for example. This feature is experimental and
   off by default right now.

 - FAILFAST support.

   This fails IOs to broken raid disks quickly, so can improve latency.
   It's mainly for DASD storage, but some patches help normal raid array
   too.

 - support bad block for raid array with external metadata

 - AVX2 instruction support for raid6 parity calculation

 - normalize MD info output

 - add missing blktrace

 - other bug fixes

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (66 commits)
  md: separate flags for superblock changes
  md: MD_RECOVERY_NEEDED is set for mddev->recovery
  md: takeover should clear unrelated bits
  md/r5cache: after recovery, increase journal seq by 10000
  md/raid5-cache: fix crc in rewrite_data_only_stripes()
  md/raid5-cache: no recovery is required when create super-block
  md: fix refcount problem on mddev when stopping array.
  md/r5cache: do r5c_update_log_state after log recovery
  md/raid5-cache: adjust the write position of the empty block if no data blocks
  md/r5cache: run_no_space_stripes() when R5C_LOG_CRITICAL == 0
  md/raid5: limit request size according to implementation limits
  md/raid5-cache: do not need to set STRIPE_PREREAD_ACTIVE repeatedly
  md/raid5-cache: remove the unnecessary next_cp_seq field from the r5l_log
  md/raid5-cache: release the stripe_head at the appropriate location
  md/raid5-cache: use ring add to prevent overflow
  md/raid5-cache: remove unnecessary function parameters
  raid5-cache: don't set STRIPE_R5C_PARTIAL_STRIPE flag while load stripe into cache
  raid5-cache: add another check conditon before replaying one stripe
  md/r5cache: enable IRQs on error path
  md/r5cache: handle alloc_page failure
  ...
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <linux/mount.h> #include <linux/mount.h>
#include <linux/buffer_head.h> #include <linux/buffer_head.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <trace/events/block.h>
#include "md.h" #include "md.h"
#include "bitmap.h" #include "bitmap.h"
...@@ -208,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde ...@@ -208,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{ {
struct md_rdev *rdev = NULL; struct md_rdev *rdev;
struct block_device *bdev; struct block_device *bdev;
struct mddev *mddev = bitmap->mddev; struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage; struct bitmap_storage *store = &bitmap->storage;
restart:
rdev = NULL;
while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
int size = PAGE_SIZE; int size = PAGE_SIZE;
loff_t offset = mddev->bitmap_info.offset; loff_t offset = mddev->bitmap_info.offset;
...@@ -268,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) ...@@ -268,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
page); page);
} }
if (wait) if (wait && md_super_wait(mddev) < 0)
md_super_wait(mddev); goto restart;
return 0; return 0;
bad_alignment: bad_alignment:
...@@ -405,10 +408,10 @@ static int read_page(struct file *file, unsigned long index, ...@@ -405,10 +408,10 @@ static int read_page(struct file *file, unsigned long index,
ret = -EIO; ret = -EIO;
out: out:
if (ret) if (ret)
printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", pr_err("md: bitmap read error: (%dB @ %llu): %d\n",
(int)PAGE_SIZE, (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT, (unsigned long long)index << PAGE_SHIFT,
ret); ret);
return ret; return ret;
} }
...@@ -416,6 +419,28 @@ static int read_page(struct file *file, unsigned long index, ...@@ -416,6 +419,28 @@ static int read_page(struct file *file, unsigned long index,
* bitmap file superblock operations * bitmap file superblock operations
*/ */
/*
* bitmap_wait_writes() should be called before writing any bitmap
* blocks, to ensure previous writes, particularly from
* bitmap_daemon_work(), have completed.
*/
static void bitmap_wait_writes(struct bitmap *bitmap)
{
if (bitmap->storage.file)
wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0);
else
/* Note that we ignore the return value. The writes
* might have failed, but that would just mean that
* some bits which should be cleared haven't been,
* which is safe. The relevant bitmap blocks will
* probably get written again, but there is no great
* loss if they aren't.
*/
md_super_wait(bitmap->mddev);
}
/* update the event counter and sync the superblock to disk */ /* update the event counter and sync the superblock to disk */
void bitmap_update_sb(struct bitmap *bitmap) void bitmap_update_sb(struct bitmap *bitmap)
{ {
...@@ -455,24 +480,24 @@ void bitmap_print_sb(struct bitmap *bitmap) ...@@ -455,24 +480,24 @@ void bitmap_print_sb(struct bitmap *bitmap)
if (!bitmap || !bitmap->storage.sb_page) if (!bitmap || !bitmap->storage.sb_page)
return; return;
sb = kmap_atomic(bitmap->storage.sb_page); sb = kmap_atomic(bitmap->storage.sb_page);
printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); pr_debug(" version: %d\n", le32_to_cpu(sb->version));
printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n", pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
*(__u32 *)(sb->uuid+0), *(__u32 *)(sb->uuid+0),
*(__u32 *)(sb->uuid+4), *(__u32 *)(sb->uuid+4),
*(__u32 *)(sb->uuid+8), *(__u32 *)(sb->uuid+8),
*(__u32 *)(sb->uuid+12)); *(__u32 *)(sb->uuid+12));
printk(KERN_DEBUG " events: %llu\n", pr_debug(" events: %llu\n",
(unsigned long long) le64_to_cpu(sb->events)); (unsigned long long) le64_to_cpu(sb->events));
printk(KERN_DEBUG "events cleared: %llu\n", pr_debug("events cleared: %llu\n",
(unsigned long long) le64_to_cpu(sb->events_cleared)); (unsigned long long) le64_to_cpu(sb->events_cleared));
printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state)); pr_debug(" state: %08x\n", le32_to_cpu(sb->state));
printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize)); pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize));
printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
printk(KERN_DEBUG " sync size: %llu KB\n", pr_debug(" sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2); (unsigned long long)le64_to_cpu(sb->sync_size)/2);
printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
kunmap_atomic(sb); kunmap_atomic(sb);
} }
...@@ -506,14 +531,14 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) ...@@ -506,14 +531,14 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
BUG_ON(!chunksize); BUG_ON(!chunksize);
if (!is_power_of_2(chunksize)) { if (!is_power_of_2(chunksize)) {
kunmap_atomic(sb); kunmap_atomic(sb);
printk(KERN_ERR "bitmap chunksize not a power of 2\n"); pr_warn("bitmap chunksize not a power of 2\n");
return -EINVAL; return -EINVAL;
} }
sb->chunksize = cpu_to_le32(chunksize); sb->chunksize = cpu_to_le32(chunksize);
daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); pr_debug("Choosing daemon_sleep default (5 sec)\n");
daemon_sleep = 5 * HZ; daemon_sleep = 5 * HZ;
} }
sb->daemon_sleep = cpu_to_le32(daemon_sleep); sb->daemon_sleep = cpu_to_le32(daemon_sleep);
...@@ -584,7 +609,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) ...@@ -584,7 +609,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
/* to 4k blocks */ /* to 4k blocks */
bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
bitmap->cluster_slot, offset); bitmap->cluster_slot, offset);
} }
...@@ -634,7 +659,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) ...@@ -634,7 +659,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
else if (write_behind > COUNTER_MAX) else if (write_behind > COUNTER_MAX)
reason = "write-behind limit out of range (0 - 16383)"; reason = "write-behind limit out of range (0 - 16383)";
if (reason) { if (reason) {
printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", pr_warn("%s: invalid bitmap file superblock: %s\n",
bmname(bitmap), reason); bmname(bitmap), reason);
goto out; goto out;
} }
...@@ -648,18 +673,15 @@ static int bitmap_read_sb(struct bitmap *bitmap) ...@@ -648,18 +673,15 @@ static int bitmap_read_sb(struct bitmap *bitmap)
* bitmap's UUID and event counter to the mddev's * bitmap's UUID and event counter to the mddev's
*/ */
if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
printk(KERN_INFO pr_warn("%s: bitmap superblock UUID mismatch\n",
"%s: bitmap superblock UUID mismatch\n", bmname(bitmap));
bmname(bitmap));
goto out; goto out;
} }
events = le64_to_cpu(sb->events); events = le64_to_cpu(sb->events);
if (!nodes && (events < bitmap->mddev->events)) { if (!nodes && (events < bitmap->mddev->events)) {
printk(KERN_INFO pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
"%s: bitmap file is out of date (%llu < %llu) " bmname(bitmap), events,
"-- forcing full recovery\n", (unsigned long long) bitmap->mddev->events);
bmname(bitmap), events,
(unsigned long long) bitmap->mddev->events);
set_bit(BITMAP_STALE, &bitmap->flags); set_bit(BITMAP_STALE, &bitmap->flags);
} }
} }
...@@ -679,8 +701,8 @@ static int bitmap_read_sb(struct bitmap *bitmap) ...@@ -679,8 +701,8 @@ static int bitmap_read_sb(struct bitmap *bitmap)
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
err = md_setup_cluster(bitmap->mddev, nodes); err = md_setup_cluster(bitmap->mddev, nodes);
if (err) { if (err) {
pr_err("%s: Could not setup cluster service (%d)\n", pr_warn("%s: Could not setup cluster service (%d)\n",
bmname(bitmap), err); bmname(bitmap), err);
goto out_no_sb; goto out_no_sb;
} }
bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
...@@ -847,15 +869,13 @@ static void bitmap_file_kick(struct bitmap *bitmap) ...@@ -847,15 +869,13 @@ static void bitmap_file_kick(struct bitmap *bitmap)
ptr = file_path(bitmap->storage.file, ptr = file_path(bitmap->storage.file,
path, PAGE_SIZE); path, PAGE_SIZE);
printk(KERN_ALERT pr_warn("%s: kicking failed bitmap file %s from array!\n",
"%s: kicking failed bitmap file %s from array!\n", bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
kfree(path); kfree(path);
} else } else
printk(KERN_ALERT pr_warn("%s: disabling internal bitmap due to errors\n",
"%s: disabling internal bitmap due to errors\n", bmname(bitmap));
bmname(bitmap));
} }
} }
...@@ -983,6 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap) ...@@ -983,6 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap)
{ {
unsigned long i; unsigned long i;
int dirty, need_write; int dirty, need_write;
int writing = 0;
if (!bitmap || !bitmap->storage.filemap || if (!bitmap || !bitmap->storage.filemap ||
test_bit(BITMAP_STALE, &bitmap->flags)) test_bit(BITMAP_STALE, &bitmap->flags))
...@@ -997,15 +1018,19 @@ void bitmap_unplug(struct bitmap *bitmap) ...@@ -997,15 +1018,19 @@ void bitmap_unplug(struct bitmap *bitmap)
need_write = test_and_clear_page_attr(bitmap, i, need_write = test_and_clear_page_attr(bitmap, i,
BITMAP_PAGE_NEEDWRITE); BITMAP_PAGE_NEEDWRITE);
if (dirty || need_write) { if (dirty || need_write) {
if (!writing) {
bitmap_wait_writes(bitmap);
if (bitmap->mddev->queue)
blk_add_trace_msg(bitmap->mddev->queue,
"md bitmap_unplug");
}
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
write_page(bitmap, bitmap->storage.filemap[i], 0); write_page(bitmap, bitmap->storage.filemap[i], 0);
writing = 1;
} }
} }
if (bitmap->storage.file) if (writing)
wait_event(bitmap->write_wait, bitmap_wait_writes(bitmap);
atomic_read(&bitmap->pending_writes)==0);
else
md_super_wait(bitmap->mddev);
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
bitmap_file_kick(bitmap); bitmap_file_kick(bitmap);
...@@ -1056,14 +1081,13 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) ...@@ -1056,14 +1081,13 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
outofdate = test_bit(BITMAP_STALE, &bitmap->flags); outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
if (outofdate) if (outofdate)
printk(KERN_INFO "%s: bitmap file is out of date, doing full " pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap));
"recovery\n", bmname(bitmap));
if (file && i_size_read(file->f_mapping->host) < store->bytes) { if (file && i_size_read(file->f_mapping->host) < store->bytes) {
printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", pr_warn("%s: bitmap file too short %lu < %lu\n",
bmname(bitmap), bmname(bitmap),
(unsigned long) i_size_read(file->f_mapping->host), (unsigned long) i_size_read(file->f_mapping->host),
store->bytes); store->bytes);
goto err; goto err;
} }
...@@ -1137,16 +1161,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) ...@@ -1137,16 +1161,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
offset = 0; offset = 0;
} }
printk(KERN_INFO "%s: bitmap initialized from disk: " pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
"read %lu pages, set %lu of %lu bits\n", bmname(bitmap), store->file_pages,
bmname(bitmap), store->file_pages, bit_cnt, chunks);
bit_cnt, chunks);
return 0; return 0;
err: err:
printk(KERN_INFO "%s: bitmap initialisation failed: %d\n", pr_warn("%s: bitmap initialisation failed: %d\n",
bmname(bitmap), ret); bmname(bitmap), ret);
return ret; return ret;
} }
...@@ -1225,6 +1248,10 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1225,6 +1248,10 @@ void bitmap_daemon_work(struct mddev *mddev)
} }
bitmap->allclean = 1; bitmap->allclean = 1;
if (bitmap->mddev->queue)
blk_add_trace_msg(bitmap->mddev->queue,
"md bitmap_daemon_work");
/* Any file-page which is PENDING now needs to be written. /* Any file-page which is PENDING now needs to be written.
* So set NEEDWRITE now, then after we make any last-minute changes * So set NEEDWRITE now, then after we make any last-minute changes
* we will write it. * we will write it.
...@@ -1289,6 +1316,7 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1289,6 +1316,7 @@ void bitmap_daemon_work(struct mddev *mddev)
} }
spin_unlock_irq(&counts->lock); spin_unlock_irq(&counts->lock);
bitmap_wait_writes(bitmap);
/* Now start writeout on any page in NEEDWRITE that isn't DIRTY. /* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
* DIRTY pages need to be written by bitmap_unplug so it can wait * DIRTY pages need to be written by bitmap_unplug so it can wait
* for them. * for them.
...@@ -1595,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) ...@@ -1595,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
atomic_read(&bitmap->mddev->recovery_active) == 0); atomic_read(&bitmap->mddev->recovery_active) == 0);
bitmap->mddev->curr_resync_completed = sector; bitmap->mddev->curr_resync_completed = sector;
set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags);
sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
s = 0; s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) { while (s < sector && s < bitmap->mddev->resync_max_sectors) {
...@@ -1825,8 +1853,8 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) ...@@ -1825,8 +1853,8 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
if (err) if (err)
goto error; goto error;
printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", pr_debug("created bitmap (%lu pages) for device %s\n",
bitmap->counts.pages, bmname(bitmap)); bitmap->counts.pages, bmname(bitmap));
err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
if (err) if (err)
...@@ -2029,8 +2057,10 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, ...@@ -2029,8 +2057,10 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
!bitmap->mddev->bitmap_info.external, !bitmap->mddev->bitmap_info.external,
mddev_is_clustered(bitmap->mddev) mddev_is_clustered(bitmap->mddev)
? bitmap->cluster_slot : 0); ? bitmap->cluster_slot : 0);
if (ret) if (ret) {
bitmap_file_unmap(&store);
goto err; goto err;
}
pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
...@@ -2089,7 +2119,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, ...@@ -2089,7 +2119,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
BITMAP_BLOCK_SHIFT); BITMAP_BLOCK_SHIFT);
blocks = old_counts.chunks << old_counts.chunkshift; blocks = old_counts.chunks << old_counts.chunkshift;
pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n"); pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
break; break;
} else } else
bitmap->counts.bp[page].count += 1; bitmap->counts.bp[page].count += 1;
...@@ -2266,7 +2296,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -2266,7 +2296,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
/* Ensure new bitmap info is stored in /* Ensure new bitmap info is stored in
* metadata promptly. * metadata promptly.
*/ */
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
} }
rv = 0; rv = 0;
......
...@@ -2011,7 +2011,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) ...@@ -2011,7 +2011,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
/* Force writing of superblocks to disk */ /* Force writing of superblocks to disk */
set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags);
/* Any superblock is better than none, choose that if given */ /* Any superblock is better than none, choose that if given */
return refdev ? 0 : 1; return refdev ? 0 : 1;
...@@ -3497,7 +3497,7 @@ static void rs_update_sbs(struct raid_set *rs) ...@@ -3497,7 +3497,7 @@ static void rs_update_sbs(struct raid_set *rs)
struct mddev *mddev = &rs->md; struct mddev *mddev = &rs->md;
int ro = mddev->ro; int ro = mddev->ro;
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
mddev->ro = 0; mddev->ro = 0;
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
mddev->ro = ro; mddev->ro = ro;
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <trace/events/block.h>
#include "md.h" #include "md.h"
#include "linear.h" #include "linear.h"
...@@ -101,8 +102,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) ...@@ -101,8 +102,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
sector_t sectors; sector_t sectors;
if (j < 0 || j >= raid_disks || disk->rdev) { if (j < 0 || j >= raid_disks || disk->rdev) {
printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
mdname(mddev)); mdname(mddev));
goto out; goto out;
} }
...@@ -123,8 +124,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) ...@@ -123,8 +124,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
discard_supported = true; discard_supported = true;
} }
if (cnt != raid_disks) { if (cnt != raid_disks) {
printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
mdname(mddev)); mdname(mddev));
goto out; goto out;
} }
...@@ -227,22 +228,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) ...@@ -227,22 +228,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
} }
do { do {
tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); sector_t bio_sector = bio->bi_iter.bi_sector;
tmp_dev = which_dev(mddev, bio_sector);
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
end_sector = tmp_dev->end_sector; end_sector = tmp_dev->end_sector;
data_offset = tmp_dev->rdev->data_offset; data_offset = tmp_dev->rdev->data_offset;
bio->bi_bdev = tmp_dev->rdev->bdev; bio->bi_bdev = tmp_dev->rdev->bdev;
if (unlikely(bio->bi_iter.bi_sector >= end_sector || if (unlikely(bio_sector >= end_sector ||
bio->bi_iter.bi_sector < start_sector)) bio_sector < start_sector))
goto out_of_bounds; goto out_of_bounds;
if (unlikely(bio_end_sector(bio) > end_sector)) { if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to /* This bio crosses a device boundary, so we have to
* split it. * split it.
*/ */
split = bio_split(bio, end_sector - split = bio_split(bio, end_sector - bio_sector,
bio->bi_iter.bi_sector,
GFP_NOIO, fs_bio_set); GFP_NOIO, fs_bio_set);
bio_chain(split, bio); bio_chain(split, bio);
} else { } else {
...@@ -256,15 +257,18 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) ...@@ -256,15 +257,18 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
!blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
/* Just ignore it */ /* Just ignore it */
bio_endio(split); bio_endio(split);
} else } else {
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
split, disk_devt(mddev->gendisk),
bio_sector);
generic_make_request(split); generic_make_request(split);
}
} while (split != bio); } while (split != bio);
return; return;
out_of_bounds: out_of_bounds:
printk(KERN_ERR pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
"md/linear:%s: make_request: Sector %llu out of bounds on "
"dev %s: %llu sectors, offset %llu\n",
mdname(mddev), mdname(mddev),
(unsigned long long)bio->bi_iter.bi_sector, (unsigned long long)bio->bi_iter.bi_sector,
bdevname(tmp_dev->rdev->bdev, b), bdevname(tmp_dev->rdev->bdev, b),
...@@ -275,7 +279,6 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) ...@@ -275,7 +279,6 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
static void linear_status (struct seq_file *seq, struct mddev *mddev) static void linear_status (struct seq_file *seq, struct mddev *mddev)
{ {
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
} }
......
此差异已折叠。
...@@ -29,6 +29,16 @@ ...@@ -29,6 +29,16 @@
#define MaxSector (~(sector_t)0) #define MaxSector (~(sector_t)0)
/*
* These flags should really be called "NO_RETRY" rather than
* "FAILFAST" because they don't make any promise about time lapse,
* only about the number of retries, which will be zero.
* REQ_FAILFAST_DRIVER is not included because
* Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
* seems to suggest that the errors it avoids retrying should usually
* be retried.
*/
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
/* /*
* MD's 'extended' device * MD's 'extended' device
*/ */
...@@ -168,6 +178,19 @@ enum flag_bits { ...@@ -168,6 +178,19 @@ enum flag_bits {
* so it is safe to remove without * so it is safe to remove without
* another synchronize_rcu() call. * another synchronize_rcu() call.
*/ */
ExternalBbl, /* External metadata provides bad
* block management for a disk
*/
FailFast, /* Minimal retries should be attempted on
* this device, so use REQ_FAILFAST_DEV.
* Also don't try to repair failed reads.
* It is expects that no bad block log
* is present.
*/
LastDev, /* Seems to be the last working dev as
* it didn't fail, so don't use FailFast
* any more for metadata
*/
}; };
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
...@@ -189,6 +212,31 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, ...@@ -189,6 +212,31 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new); int is_new);
struct md_cluster_info; struct md_cluster_info;
enum mddev_flags {
MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */
MD_CLOSING, /* If set, we are closing the array, do not open
* it then */
MD_JOURNAL_CLEAN, /* A raid with journal is already clean */
MD_HAS_JOURNAL, /* The raid array has journal feature set */
MD_RELOAD_SB, /* Reload the superblock because another node
* updated it.
*/
MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
* already took resync lock, need to
* release the lock */
MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is
* supported as calls to md_error() will
* never cause the array to become failed.
*/
};
enum mddev_sb_flags {
MD_SB_CHANGE_DEVS, /* Some device status has changed */
MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */
MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};
struct mddev { struct mddev {
void *private; void *private;
struct md_personality *pers; struct md_personality *pers;
...@@ -196,21 +244,7 @@ struct mddev { ...@@ -196,21 +244,7 @@ struct mddev {
int md_minor; int md_minor;
struct list_head disks; struct list_head disks;
unsigned long flags; unsigned long flags;
#define MD_CHANGE_DEVS 0 /* Some device status has changed */ unsigned long sb_flags;
#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */
#define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */
#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */
#define MD_CLOSING 4 /* If set, we are closing the array, do not open
* it then */
#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */
#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */
#define MD_RELOAD_SB 7 /* Reload the superblock because another node
* updated it.
*/
#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node
* already took resync lock, need to
* release the lock */
int suspended; int suspended;
atomic_t active_io; atomic_t active_io;
...@@ -304,31 +338,6 @@ struct mddev { ...@@ -304,31 +338,6 @@ struct mddev {
int parallel_resync; int parallel_resync;
int ok_start_degraded; int ok_start_degraded;
/* recovery/resync flags
* NEEDED: we might need to start a resync/recover
* RUNNING: a thread is running, or about to be started
* SYNC: actually doing a resync, not a recovery
* RECOVER: doing recovery, or need to try it.
* INTR: resync needs to be aborted for some reason
* DONE: thread is done and is waiting to be reaped
* REQUEST: user-space has requested a sync (used with SYNC)
* CHECK: user-space request for check-only, no repair
* RESHAPE: A reshape is happening
* ERROR: sync-action interrupted because io-error
*
* If neither SYNC or RESHAPE are set, then it is a recovery.
*/
#define MD_RECOVERY_RUNNING 0
#define MD_RECOVERY_SYNC 1
#define MD_RECOVERY_RECOVER 2
#define MD_RECOVERY_INTR 3
#define MD_RECOVERY_DONE 4
#define MD_RECOVERY_NEEDED 5
#define MD_RECOVERY_REQUESTED 6
#define MD_RECOVERY_CHECK 7
#define MD_RECOVERY_RESHAPE 8
#define MD_RECOVERY_FROZEN 9
#define MD_RECOVERY_ERROR 10
unsigned long recovery; unsigned long recovery;
/* If a RAID personality determines that recovery (of a particular /* If a RAID personality determines that recovery (of a particular
...@@ -442,6 +451,23 @@ struct mddev { ...@@ -442,6 +451,23 @@ struct mddev {
unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int good_device_nr; /* good device num within cluster raid */
}; };
enum recovery_flags {
/*
* If neither SYNC or RESHAPE are set, then it is a recovery.
*/
MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */
MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */
MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */
MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */
MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */
MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */
MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */
MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */
MD_RECOVERY_RESHAPE, /* A reshape is happening */
MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */
MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */
};
static inline int __must_check mddev_lock(struct mddev *mddev) static inline int __must_check mddev_lock(struct mddev *mddev)
{ {
return mutex_lock_interruptible(&mddev->reconfig_mutex); return mutex_lock_interruptible(&mddev->reconfig_mutex);
...@@ -623,7 +649,7 @@ extern int mddev_congested(struct mddev *mddev, int bits); ...@@ -623,7 +649,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
extern void md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
sector_t sector, int size, struct page *page); sector_t sector, int size, struct page *page);
extern void md_super_wait(struct mddev *mddev); extern int md_super_wait(struct mddev *mddev);
extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct page *page, int op, int op_flags, struct page *page, int op, int op_flags,
bool metadata_op); bool metadata_op);
......
...@@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf) ...@@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf)
} }
rcu_read_unlock(); rcu_read_unlock();
printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
return (-1); return (-1);
} }
...@@ -97,9 +97,9 @@ static void multipath_end_request(struct bio *bio) ...@@ -97,9 +97,9 @@ static void multipath_end_request(struct bio *bio)
*/ */
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
md_error (mp_bh->mddev, rdev); md_error (mp_bh->mddev, rdev);
printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", pr_info("multipath: %s: rescheduling sector %llu\n",
bdevname(rdev->bdev,b), bdevname(rdev->bdev,b),
(unsigned long long)bio->bi_iter.bi_sector); (unsigned long long)bio->bi_iter.bi_sector);
multipath_reschedule_retry(mp_bh); multipath_reschedule_retry(mp_bh);
} else } else
multipath_end_bh_io(mp_bh, bio->bi_error); multipath_end_bh_io(mp_bh, bio->bi_error);
...@@ -194,8 +194,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) ...@@ -194,8 +194,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
* first check if this is a queued request for a device * first check if this is a queued request for a device
* which has just failed. * which has just failed.
*/ */
printk(KERN_ALERT pr_warn("multipath: only one IO path left and IO error.\n");
"multipath: only one IO path left and IO error.\n");
/* leave it active... it's all we have */ /* leave it active... it's all we have */
return; return;
} }
...@@ -209,11 +208,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) ...@@ -209,11 +208,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
} }
set_bit(Faulty, &rdev->flags); set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
printk(KERN_ALERT "multipath: IO failure on %s," pr_err("multipath: IO failure on %s, disabling IO path.\n"
" disabling IO path.\n" "multipath: Operation continuing on %d IO paths.\n",
"multipath: Operation continuing"
" on %d IO paths.\n",
bdevname(rdev->bdev, b), bdevname(rdev->bdev, b),
conf->raid_disks - mddev->degraded); conf->raid_disks - mddev->degraded);
} }
...@@ -223,21 +220,21 @@ static void print_multipath_conf (struct mpconf *conf) ...@@ -223,21 +220,21 @@ static void print_multipath_conf (struct mpconf *conf)
int i; int i;
struct multipath_info *tmp; struct multipath_info *tmp;
printk("MULTIPATH conf printout:\n"); pr_debug("MULTIPATH conf printout:\n");
if (!conf) { if (!conf) {
printk("(conf==NULL)\n"); pr_debug("(conf==NULL)\n");
return; return;
} }
printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks); conf->raid_disks);
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
tmp = conf->multipaths + i; tmp = conf->multipaths + i;
if (tmp->rdev) if (tmp->rdev)
printk(" disk%d, o:%d, dev:%s\n", pr_debug(" disk%d, o:%d, dev:%s\n",
i,!test_bit(Faulty, &tmp->rdev->flags), i,!test_bit(Faulty, &tmp->rdev->flags),
bdevname(tmp->rdev->bdev,b)); bdevname(tmp->rdev->bdev,b));
} }
} }
...@@ -292,8 +289,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -292,8 +289,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev == p->rdev) { if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) || if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) { atomic_read(&rdev->nr_pending)) {
printk(KERN_ERR "hot-remove-disk, slot %d is identified" pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
" but is still operational!\n", number);
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
...@@ -346,16 +342,14 @@ static void multipathd(struct md_thread *thread) ...@@ -346,16 +342,14 @@ static void multipathd(struct md_thread *thread)
bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
if ((mp_bh->path = multipath_map (conf))<0) { if ((mp_bh->path = multipath_map (conf))<0) {
printk(KERN_ALERT "multipath: %s: unrecoverable IO read" pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
" error for block %llu\n", bdevname(bio->bi_bdev,b),
bdevname(bio->bi_bdev,b), (unsigned long long)bio->bi_iter.bi_sector);
(unsigned long long)bio->bi_iter.bi_sector);
multipath_end_bh_io(mp_bh, -EIO); multipath_end_bh_io(mp_bh, -EIO);
} else { } else {
printk(KERN_ERR "multipath: %s: redirecting sector %llu" pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
" to another IO path\n", bdevname(bio->bi_bdev,b),
bdevname(bio->bi_bdev,b), (unsigned long long)bio->bi_iter.bi_sector);
(unsigned long long)bio->bi_iter.bi_sector);
*bio = *(mp_bh->master_bio); *bio = *(mp_bh->master_bio);
bio->bi_iter.bi_sector += bio->bi_iter.bi_sector +=
conf->multipaths[mp_bh->path].rdev->data_offset; conf->multipaths[mp_bh->path].rdev->data_offset;
...@@ -389,8 +383,8 @@ static int multipath_run (struct mddev *mddev) ...@@ -389,8 +383,8 @@ static int multipath_run (struct mddev *mddev)
return -EINVAL; return -EINVAL;
if (mddev->level != LEVEL_MULTIPATH) { if (mddev->level != LEVEL_MULTIPATH) {
printk("multipath: %s: raid level not set to multipath IO (%d)\n", pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
mdname(mddev), mddev->level); mdname(mddev), mddev->level);
goto out; goto out;
} }
/* /*
...@@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev) ...@@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev)
conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
mddev->private = conf; mddev->private = conf;
if (!conf) { if (!conf)
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
mdname(mddev));
goto out; goto out;
}
conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
GFP_KERNEL); GFP_KERNEL);
if (!conf->multipaths) { if (!conf->multipaths)
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
mdname(mddev));
goto out_free_conf; goto out_free_conf;
}
working_disks = 0; working_disks = 0;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
...@@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev) ...@@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev)
INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->retry_list);
if (!working_disks) { if (!working_disks) {
printk(KERN_ERR "multipath: no operational IO paths for %s\n", pr_warn("multipath: no operational IO paths for %s\n",
mdname(mddev)); mdname(mddev));
goto out_free_conf; goto out_free_conf;
} }
...@@ -447,27 +433,17 @@ static int multipath_run (struct mddev *mddev) ...@@ -447,27 +433,17 @@ static int multipath_run (struct mddev *mddev)
conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
sizeof(struct multipath_bh)); sizeof(struct multipath_bh));
if (conf->pool == NULL) { if (conf->pool == NULL)
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
mdname(mddev));
goto out_free_conf; goto out_free_conf;
}
{ mddev->thread = md_register_thread(multipathd, mddev,
mddev->thread = md_register_thread(multipathd, mddev, "multipath");
"multipath"); if (!mddev->thread)
if (!mddev->thread) { goto out_free_conf;
printk(KERN_ERR "multipath: couldn't allocate thread"
" for %s\n", mdname(mddev));
goto out_free_conf;
}
}
printk(KERN_INFO pr_info("multipath: array %s active with %d out of %d IO paths\n",
"multipath: array %s active with %d out of %d IO paths\n",
mdname(mddev), conf->raid_disks - mddev->degraded, mdname(mddev), conf->raid_disks - mddev->degraded,
mddev->raid_disks); mddev->raid_disks);
/* /*
* Ok, everything is just fine now * Ok, everything is just fine now
*/ */
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <trace/events/block.h>
#include "md.h" #include "md.h"
#include "raid0.h" #include "raid0.h"
#include "raid5.h" #include "raid5.h"
...@@ -51,20 +52,21 @@ static void dump_zones(struct mddev *mddev) ...@@ -51,20 +52,21 @@ static void dump_zones(struct mddev *mddev)
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
struct r0conf *conf = mddev->private; struct r0conf *conf = mddev->private;
int raid_disks = conf->strip_zone[0].nb_dev; int raid_disks = conf->strip_zone[0].nb_dev;
printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", pr_debug("md: RAID0 configuration for %s - %d zone%s\n",
mdname(mddev), mdname(mddev),
conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
for (j = 0; j < conf->nr_strip_zones; j++) { for (j = 0; j < conf->nr_strip_zones; j++) {
printk(KERN_INFO "md: zone%d=[", j); char line[200];
int len = 0;
for (k = 0; k < conf->strip_zone[j].nb_dev; k++) for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
printk(KERN_CONT "%s%s", k?"/":"", len += snprintf(line+len, 200-len, "%s%s", k?"/":"",
bdevname(conf->devlist[j*raid_disks bdevname(conf->devlist[j*raid_disks
+ k]->bdev, b)); + k]->bdev, b));
printk(KERN_CONT "]\n"); pr_debug("md: zone%d=[%s]\n", j, line);
zone_size = conf->strip_zone[j].zone_end - zone_start; zone_size = conf->strip_zone[j].zone_end - zone_start;
printk(KERN_INFO " zone-offset=%10lluKB, " pr_debug(" zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n",
"device-offset=%10lluKB, size=%10lluKB\n",
(unsigned long long)zone_start>>1, (unsigned long long)zone_start>>1,
(unsigned long long)conf->strip_zone[j].dev_start>>1, (unsigned long long)conf->strip_zone[j].dev_start>>1,
(unsigned long long)zone_size>>1); (unsigned long long)zone_size>>1);
...@@ -142,9 +144,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) ...@@ -142,9 +144,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
* chunk size is a multiple of that sector size * chunk size is a multiple of that sector size
*/ */
if ((mddev->chunk_sectors << 9) % blksize) { if ((mddev->chunk_sectors << 9) % blksize) {
printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n", pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
mdname(mddev), mdname(mddev),
mddev->chunk_sectors << 9, blksize); mddev->chunk_sectors << 9, blksize);
err = -EINVAL; err = -EINVAL;
goto abort; goto abort;
} }
...@@ -186,19 +188,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) ...@@ -186,19 +188,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
} }
if (j < 0) { if (j < 0) {
printk(KERN_ERR pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n",
"md/raid0:%s: remove inactive devices before converting to RAID0\n", mdname(mddev));
mdname(mddev));
goto abort; goto abort;
} }
if (j >= mddev->raid_disks) { if (j >= mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: bad disk number %d - " pr_warn("md/raid0:%s: bad disk number %d - aborting!\n",
"aborting!\n", mdname(mddev), j); mdname(mddev), j);
goto abort; goto abort;
} }
if (dev[j]) { if (dev[j]) {
printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n",
"aborting!\n", mdname(mddev), j); mdname(mddev), j);
goto abort; goto abort;
} }
dev[j] = rdev1; dev[j] = rdev1;
...@@ -208,8 +209,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) ...@@ -208,8 +209,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
cnt++; cnt++;
} }
if (cnt != mddev->raid_disks) { if (cnt != mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n",
"aborting!\n", mdname(mddev), cnt, mddev->raid_disks); mdname(mddev), cnt, mddev->raid_disks);
goto abort; goto abort;
} }
zone->nb_dev = cnt; zone->nb_dev = cnt;
...@@ -357,8 +358,7 @@ static int raid0_run(struct mddev *mddev) ...@@ -357,8 +358,7 @@ static int raid0_run(struct mddev *mddev)
int ret; int ret;
if (mddev->chunk_sectors == 0) { if (mddev->chunk_sectors == 0) {
printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev));
mdname(mddev));
return -EINVAL; return -EINVAL;
} }
if (md_check_no_bitmap(mddev)) if (md_check_no_bitmap(mddev))
...@@ -399,9 +399,9 @@ static int raid0_run(struct mddev *mddev) ...@@ -399,9 +399,9 @@ static int raid0_run(struct mddev *mddev)
/* calculate array device size */ /* calculate array device size */
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", pr_debug("md/raid0:%s: md_size is %llu sectors.\n",
mdname(mddev), mdname(mddev),
(unsigned long long)mddev->array_sectors); (unsigned long long)mddev->array_sectors);
if (mddev->queue) { if (mddev->queue) {
/* calculate the max read-ahead size. /* calculate the max read-ahead size.
...@@ -464,7 +464,8 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) ...@@ -464,7 +464,8 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
} }
do { do {
sector_t sector = bio->bi_iter.bi_sector; sector_t bio_sector = bio->bi_iter.bi_sector;
sector_t sector = bio_sector;
unsigned chunk_sects = mddev->chunk_sectors; unsigned chunk_sects = mddev->chunk_sectors;
unsigned sectors = chunk_sects - unsigned sectors = chunk_sects -
...@@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) ...@@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
: sector_div(sector, chunk_sects)); : sector_div(sector, chunk_sects));
/* Restore due to sector_div */ /* Restore due to sector_div */
sector = bio->bi_iter.bi_sector; sector = bio_sector;
if (sectors < bio_sectors(bio)) { if (sectors < bio_sectors(bio)) {
split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
...@@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) ...@@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
!blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
/* Just ignore it */ /* Just ignore it */
bio_endio(split); bio_endio(split);
} else } else {
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
split, disk_devt(mddev->gendisk),
bio_sector);
generic_make_request(split); generic_make_request(split);
}
} while (split != bio); } while (split != bio);
} }
...@@ -509,17 +515,17 @@ static void *raid0_takeover_raid45(struct mddev *mddev) ...@@ -509,17 +515,17 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
struct r0conf *priv_conf; struct r0conf *priv_conf;
if (mddev->degraded != 1) { if (mddev->degraded != 1) {
printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
mdname(mddev), mdname(mddev),
mddev->degraded); mddev->degraded);
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
/* check slot number for a disk */ /* check slot number for a disk */
if (rdev->raid_disk == mddev->raid_disks-1) { if (rdev->raid_disk == mddev->raid_disks-1) {
printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n",
mdname(mddev)); mdname(mddev));
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
rdev->sectors = mddev->dev_sectors; rdev->sectors = mddev->dev_sectors;
...@@ -533,8 +539,11 @@ static void *raid0_takeover_raid45(struct mddev *mddev) ...@@ -533,8 +539,11 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
mddev->delta_disks = -1; mddev->delta_disks = -1;
/* make sure it will be not marked as dirty */ /* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector; mddev->recovery_cp = MaxSector;
clear_bit(MD_HAS_JOURNAL, &mddev->flags);
clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);
create_strip_zones(mddev, &priv_conf); create_strip_zones(mddev, &priv_conf);
return priv_conf; return priv_conf;
} }
...@@ -549,19 +558,19 @@ static void *raid0_takeover_raid10(struct mddev *mddev) ...@@ -549,19 +558,19 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
* - all mirrors must be already degraded * - all mirrors must be already degraded
*/ */
if (mddev->layout != ((1 << 8) + 2)) { if (mddev->layout != ((1 << 8) + 2)) {
printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
mdname(mddev), mdname(mddev),
mddev->layout); mddev->layout);
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
if (mddev->raid_disks & 1) { if (mddev->raid_disks & 1) {
printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
mdname(mddev)); mdname(mddev));
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
if (mddev->degraded != (mddev->raid_disks>>1)) { if (mddev->degraded != (mddev->raid_disks>>1)) {
printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", pr_warn("md/raid0:%s: All mirrors must be already degraded!\n",
mdname(mddev)); mdname(mddev));
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
...@@ -574,6 +583,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) ...@@ -574,6 +583,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
mddev->degraded = 0; mddev->degraded = 0;
/* make sure it will be not marked as dirty */ /* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector; mddev->recovery_cp = MaxSector;
clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
create_strip_zones(mddev, &priv_conf); create_strip_zones(mddev, &priv_conf);
return priv_conf; return priv_conf;
...@@ -588,7 +598,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) ...@@ -588,7 +598,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
* - (N - 1) mirror drives must be already faulty * - (N - 1) mirror drives must be already faulty
*/ */
if ((mddev->raid_disks - 1) != mddev->degraded) { if ((mddev->raid_disks - 1) != mddev->degraded) {
printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
mdname(mddev)); mdname(mddev));
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
...@@ -616,6 +626,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) ...@@ -616,6 +626,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
mddev->raid_disks = 1; mddev->raid_disks = 1;
/* make sure it will be not marked as dirty */ /* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector; mddev->recovery_cp = MaxSector;
clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
create_strip_zones(mddev, &priv_conf); create_strip_zones(mddev, &priv_conf);
return priv_conf; return priv_conf;
...@@ -631,8 +642,8 @@ static void *raid0_takeover(struct mddev *mddev) ...@@ -631,8 +642,8 @@ static void *raid0_takeover(struct mddev *mddev)
*/ */
if (mddev->bitmap) { if (mddev->bitmap) {
printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n", pr_warn("md/raid0: %s: cannot takeover array with bitmap\n",
mdname(mddev)); mdname(mddev));
return ERR_PTR(-EBUSY); return ERR_PTR(-EBUSY);
} }
if (mddev->level == 4) if (mddev->level == 4)
...@@ -642,8 +653,8 @@ static void *raid0_takeover(struct mddev *mddev) ...@@ -642,8 +653,8 @@ static void *raid0_takeover(struct mddev *mddev)
if (mddev->layout == ALGORITHM_PARITY_N) if (mddev->layout == ALGORITHM_PARITY_N)
return raid0_takeover_raid45(mddev); return raid0_takeover_raid45(mddev);
printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
mdname(mddev), ALGORITHM_PARITY_N); mdname(mddev), ALGORITHM_PARITY_N);
} }
if (mddev->level == 10) if (mddev->level == 10)
...@@ -652,7 +663,7 @@ static void *raid0_takeover(struct mddev *mddev) ...@@ -652,7 +663,7 @@ static void *raid0_takeover(struct mddev *mddev)
if (mddev->level == 1) if (mddev->level == 1)
return raid0_takeover_raid1(mddev); return raid0_takeover_raid1(mddev);
printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", pr_warn("Takeover from raid%i to raid0 not supported\n",
mddev->level); mddev->level);
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
......
此差异已折叠。
...@@ -161,14 +161,15 @@ struct r1bio { ...@@ -161,14 +161,15 @@ struct r1bio {
}; };
/* bits for r1bio.state */ /* bits for r1bio.state */
#define R1BIO_Uptodate 0 enum r1bio_state {
#define R1BIO_IsSync 1 R1BIO_Uptodate,
#define R1BIO_Degraded 2 R1BIO_IsSync,
#define R1BIO_BehindIO 3 R1BIO_Degraded,
R1BIO_BehindIO,
/* Set ReadError on bios that experience a readerror so that /* Set ReadError on bios that experience a readerror so that
* raid1d knows what to do with them. * raid1d knows what to do with them.
*/ */
#define R1BIO_ReadError 4 R1BIO_ReadError,
/* For write-behind requests, we call bi_end_io when /* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing * the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when * any write was successful. Otherwise we call when
...@@ -176,10 +177,12 @@ struct r1bio { ...@@ -176,10 +177,12 @@ struct r1bio {
* with failure when last write completes (and all failed). * with failure when last write completes (and all failed).
* Record that bi_end_io was called with this flag... * Record that bi_end_io was called with this flag...
*/ */
#define R1BIO_Returned 6 R1BIO_Returned,
/* If a write for this request means we can clear some /* If a write for this request means we can clear some
* known-bad-block records, we set this flag * known-bad-block records, we set this flag
*/ */
#define R1BIO_MadeGood 7 R1BIO_MadeGood,
#define R1BIO_WriteError 8 R1BIO_WriteError,
R1BIO_FailFast,
};
#endif #endif
此差异已折叠。
...@@ -156,5 +156,7 @@ enum r10bio_state { ...@@ -156,5 +156,7 @@ enum r10bio_state {
* flag is set * flag is set
*/ */
R10BIO_Previous, R10BIO_Previous,
/* failfast devices did receive failfast requests. */
R10BIO_FailFast,
}; };
#endif #endif
此差异已折叠。
此差异已折叠。
...@@ -226,6 +226,8 @@ struct stripe_head { ...@@ -226,6 +226,8 @@ struct stripe_head {
struct r5l_io_unit *log_io; struct r5l_io_unit *log_io;
struct list_head log_list; struct list_head log_list;
sector_t log_start; /* first meta block on the journal */
struct list_head r5c; /* for r5c_cache->stripe_in_journal */
/** /**
* struct stripe_operations * struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target * @target - STRIPE_OP_COMPUTE_BLK target
...@@ -264,6 +266,7 @@ struct stripe_head_state { ...@@ -264,6 +266,7 @@ struct stripe_head_state {
int syncing, expanding, expanded, replacing; int syncing, expanding, expanded, replacing;
int locked, uptodate, to_read, to_write, failed, written; int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite; int to_fill, compute, req_compute, non_overwrite;
int injournal, just_cached;
int failed_num[2]; int failed_num[2];
int p_failed, q_failed; int p_failed, q_failed;
int dec_preread_active; int dec_preread_active;
...@@ -273,6 +276,7 @@ struct stripe_head_state { ...@@ -273,6 +276,7 @@ struct stripe_head_state {
struct md_rdev *blocked_rdev; struct md_rdev *blocked_rdev;
int handle_bad_blocks; int handle_bad_blocks;
int log_failed; int log_failed;
int waiting_extra_page;
}; };
/* Flags for struct r5dev.flags */ /* Flags for struct r5dev.flags */
...@@ -313,6 +317,11 @@ enum r5dev_flags { ...@@ -313,6 +317,11 @@ enum r5dev_flags {
*/ */
R5_Discard, /* Discard the stripe */ R5_Discard, /* Discard the stripe */
R5_SkipCopy, /* Don't copy data from bio to stripe cache */ R5_SkipCopy, /* Don't copy data from bio to stripe cache */
R5_InJournal, /* data being written is in the journal device.
* if R5_InJournal is set for parity pd_idx, all the
* data and parity being written are in the journal
* device
*/
}; };
/* /*
...@@ -345,7 +354,30 @@ enum { ...@@ -345,7 +354,30 @@ enum {
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
* to batch yet. * to batch yet.
*/ */
STRIPE_LOG_TRAPPED, /* trapped into log */ STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
* this bit is used in two scenarios:
*
* 1. write-out phase
* set in first entry of r5l_write_stripe
* clear in second entry of r5l_write_stripe
* used to bypass logic in handle_stripe
*
* 2. caching phase
* set in r5c_try_caching_write()
* clear when journal write is done
* used to initiate r5c_cache_data()
* also used to bypass logic in handle_stripe
*/
STRIPE_R5C_CACHING, /* the stripe is in caching phase
* see more detail in the raid5-cache.c
*/
STRIPE_R5C_PARTIAL_STRIPE, /* in r5c cache (to-be/being handled or
* in conf->r5c_partial_stripe_list)
*/
STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or
* in conf->r5c_full_stripe_list)
*/
STRIPE_R5C_PREFLUSH, /* need to flush journal device */
}; };
#define STRIPE_EXPAND_SYNC_FLAGS \ #define STRIPE_EXPAND_SYNC_FLAGS \
...@@ -408,8 +440,86 @@ enum { ...@@ -408,8 +440,86 @@ enum {
struct disk_info { struct disk_info {
struct md_rdev *rdev, *replacement; struct md_rdev *rdev, *replacement;
struct page *extra_page; /* extra page to use in prexor */
}; };
/*
* Stripe cache
*/
#define NR_STRIPES 256
#define STRIPE_SIZE PAGE_SIZE
#define STRIPE_SHIFT (PAGE_SHIFT - 9)
#define STRIPE_SECTORS (STRIPE_SIZE>>9)
#define IO_THRESHOLD 1
#define BYPASS_THRESHOLD 1
#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK (NR_HASH - 1)
#define MAX_STRIPE_BATCH 8
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
* order without overlap. There may be several bio's per stripe+device, and
* a bio could span several devices.
* When walking this list for a particular stripe+device, we must never proceed
* beyond a bio that extends past this device, as the next bio might no longer
* be valid.
* This function is used to determine the 'next' bio in the list, given the
* sector of the current stripe+device
*/
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
int sectors = bio_sectors(bio);
if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
return bio->bi_next;
else
return NULL;
}
/*
* We maintain a biased count of active stripes in the bottom 16 bits of
* bi_phys_segments, and a count of processed stripes in the upper 16 bits
*/
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
return (atomic_read(segments) >> 16) & 0xffff;
}
static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
return atomic_sub_return(1, segments) & 0xffff;
}
static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
atomic_inc(segments);
}
static inline void raid5_set_bi_processed_stripes(struct bio *bio,
unsigned int cnt)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
int old, new;
do {
old = atomic_read(segments);
new = (old & 0xffff) | (cnt << 16);
} while (atomic_cmpxchg(segments, old, new) != old);
}
static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
atomic_set(segments, cnt);
}
/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
* This is because we sometimes take all the spinlocks * This is because we sometimes take all the spinlocks
* and creating that much locking depth can cause * and creating that much locking depth can cause
...@@ -432,6 +542,30 @@ struct r5worker_group { ...@@ -432,6 +542,30 @@ struct r5worker_group {
int stripes_cnt; int stripes_cnt;
}; };
enum r5_cache_state {
R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked,
* waiting for 25% to be free
*/
R5_ALLOC_MORE, /* It might help to allocate another
* stripe.
*/
R5_DID_ALLOC, /* A stripe was allocated, don't allocate
* more until at least one has been
* released. This avoids flooding
* the cache.
*/
R5C_LOG_TIGHT, /* log device space tight, need to
* prioritize stripes at last_checkpoint
*/
R5C_LOG_CRITICAL, /* log device is running out of space,
* only process stripes that are already
* occupying the log
*/
R5C_EXTRA_PAGE_IN_USE, /* a stripe is using disk_info.extra_page
* for prexor
*/
};
struct r5conf { struct r5conf {
struct hlist_head *stripe_hashtbl; struct hlist_head *stripe_hashtbl;
/* only protect corresponding hash list and inactive_list */ /* only protect corresponding hash list and inactive_list */
...@@ -519,23 +653,18 @@ struct r5conf { ...@@ -519,23 +653,18 @@ struct r5conf {
*/ */
atomic_t active_stripes; atomic_t active_stripes;
struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
atomic_t r5c_cached_full_stripes;
struct list_head r5c_full_stripe_list;
atomic_t r5c_cached_partial_stripes;
struct list_head r5c_partial_stripe_list;
atomic_t empty_inactive_list_nr; atomic_t empty_inactive_list_nr;
struct llist_head released_stripes; struct llist_head released_stripes;
wait_queue_head_t wait_for_quiescent; wait_queue_head_t wait_for_quiescent;
wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_stripe;
wait_queue_head_t wait_for_overlap; wait_queue_head_t wait_for_overlap;
unsigned long cache_state; unsigned long cache_state;
#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked,
* waiting for 25% to be free
*/
#define R5_ALLOC_MORE 2 /* It might help to allocate another
* stripe.
*/
#define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate
* more until at least one has been
* released. This avoids flooding
* the cache.
*/
struct shrinker shrinker; struct shrinker shrinker;
int pool_size; /* number of disks in stripeheads in pool */ int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock; spinlock_t device_lock;
...@@ -633,4 +762,23 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh); ...@@ -633,4 +762,23 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
extern void r5l_quiesce(struct r5l_log *log, int state); extern void r5l_quiesce(struct r5l_log *log, int state);
extern bool r5l_log_disk_error(struct r5conf *conf); extern bool r5l_log_disk_error(struct r5conf *conf);
extern bool r5c_is_writeback(struct r5l_log *log);
extern int
r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks);
extern void
r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s);
extern void r5c_release_extra_page(struct stripe_head *sh);
extern void r5c_use_extra_page(struct stripe_head *sh);
extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
extern void r5c_handle_cached_data_endio(struct r5conf *conf,
struct stripe_head *sh, int disks, struct bio_list *return_bi);
extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
struct stripe_head_state *s);
extern void r5c_make_stripe_write_out(struct stripe_head *sh);
extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
extern void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
#endif #endif
...@@ -84,6 +84,10 @@ ...@@ -84,6 +84,10 @@
#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed #define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed
* For clustered enviroments only. * For clustered enviroments only.
*/ */
#define MD_DISK_FAILFAST 10 /* Send REQ_FAILFAST if there are multiple
* devices available - and don't try to
* correct read errors.
*/
#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config.
* read requests will only be sent here in * read requests will only be sent here in
...@@ -265,8 +269,9 @@ struct mdp_superblock_1 { ...@@ -265,8 +269,9 @@ struct mdp_superblock_1 {
__le32 dev_number; /* permanent identifier of this device - not role in raid */ __le32 dev_number; /* permanent identifier of this device - not role in raid */
__le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */
__u8 devflags; /* per-device flags. Only one defined...*/ __u8 devflags; /* per-device flags. Only two defined...*/
#define WriteMostly1 1 /* mask for writemostly flag in above */ #define WriteMostly1 1 /* mask for writemostly flag in above */
#define FailFast1 2 /* Should avoid retries and fixups and just fail */
/* Bad block log. If there are any bad blocks the feature flag is set. /* Bad block log. If there are any bad blocks the feature flag is set.
* If offset and size are non-zero, that space is reserved and available * If offset and size are non-zero, that space is reserved and available
*/ */
......
...@@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) ...@@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
kernel_fpu_end(); kernel_fpu_end();
} }
static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = stop; /* P/Q right side optimization */
p = dptr[disks-2]; /* XOR parity */
q = dptr[disks-1]; /* RS syndrome */
kernel_fpu_begin();
asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
for (d = 0 ; d < bytes ; d += 32) {
asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
asm volatile("vpxor %ymm4,%ymm2,%ymm2");
/* P/Q data pages */
for (z = z0-1 ; z >= start ; z--) {
asm volatile("vpxor %ymm5,%ymm5,%ymm5");
asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
asm volatile("vpand %ymm0,%ymm5,%ymm5");
asm volatile("vpxor %ymm5,%ymm4,%ymm4");
asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
asm volatile("vpxor %ymm5,%ymm2,%ymm2");
asm volatile("vpxor %ymm5,%ymm4,%ymm4");
}
/* P/Q left side optimization */
for (z = start-1 ; z >= 0 ; z--) {
asm volatile("vpxor %ymm5,%ymm5,%ymm5");
asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
asm volatile("vpand %ymm0,%ymm5,%ymm5");
asm volatile("vpxor %ymm5,%ymm4,%ymm4");
}
asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
/* Don't use movntdq for r/w memory area < cache line */
asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
}
asm volatile("sfence" : : : "memory");
kernel_fpu_end();
}
const struct raid6_calls raid6_avx2x1 = { const struct raid6_calls raid6_avx2x1 = {
raid6_avx21_gen_syndrome, raid6_avx21_gen_syndrome,
NULL, /* XOR not yet implemented */ raid6_avx21_xor_syndrome,
raid6_have_avx2, raid6_have_avx2,
"avx2x1", "avx2x1",
1 /* Has cache hints */ 1 /* Has cache hints */
...@@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) ...@@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
kernel_fpu_end(); kernel_fpu_end();
} }
static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = stop; /* P/Q right side optimization */
p = dptr[disks-2]; /* XOR parity */
q = dptr[disks-1]; /* RS syndrome */
kernel_fpu_begin();
asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
for (d = 0 ; d < bytes ; d += 64) {
asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
asm volatile("vpxor %ymm4,%ymm2,%ymm2");
asm volatile("vpxor %ymm6,%ymm3,%ymm3");
/* P/Q data pages */
for (z = z0-1 ; z >= start ; z--) {
asm volatile("vpxor %ymm5,%ymm5,%ymm5");
asm volatile("vpxor %ymm7,%ymm7,%ymm7");
asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
asm volatile("vpand %ymm0,%ymm5,%ymm5");
asm volatile("vpand %ymm0,%ymm7,%ymm7");
asm volatile("vpxor %ymm5,%ymm4,%ymm4");
asm volatile("vpxor %ymm7,%ymm6,%ymm6");
asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
asm volatile("vmovdqa %0,%%ymm7"
:: "m" (dptr[z][d+32]));
asm volatile("vpxor %ymm5,%ymm2,%ymm2");
asm volatile("vpxor %ymm7,%ymm3,%ymm3");
asm volatile("vpxor %ymm5,%ymm4,%ymm4");
asm volatile("vpxor %ymm7,%ymm6,%ymm6");
}
/* P/Q left side optimization */
for (z = start-1 ; z >= 0 ; z--) {
asm volatile("vpxor %ymm5,%ymm5,%ymm5");
asm volatile("vpxor %ymm7,%ymm7,%ymm7");
asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
asm volatile("vpand %ymm0,%ymm5,%ymm5");
asm volatile("vpand %ymm0,%ymm7,%ymm7");
asm volatile("vpxor %ymm5,%ymm4,%ymm4");
asm volatile("vpxor %ymm7,%ymm6,%ymm6");
}
asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
/* Don't use movntdq for r/w memory area < cache line */
asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
}
asm volatile("sfence" : : : "memory");
kernel_fpu_end();
}
const struct raid6_calls raid6_avx2x2 = { const struct raid6_calls raid6_avx2x2 = {
raid6_avx22_gen_syndrome, raid6_avx22_gen_syndrome,
NULL, /* XOR not yet implemented */ raid6_avx22_xor_syndrome,
raid6_have_avx2, raid6_have_avx2,
"avx2x2", "avx2x2",
1 /* Has cache hints */ 1 /* Has cache hints */
...@@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) ...@@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
kernel_fpu_end(); kernel_fpu_end();
} }
static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = stop; /* P/Q right side optimization */
p = dptr[disks-2]; /* XOR parity */
q = dptr[disks-1]; /* RS syndrome */
kernel_fpu_begin();
asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
for (d = 0 ; d < bytes ; d += 128) {
asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
asm volatile("vpxor %ymm4,%ymm2,%ymm2");
asm volatile("vpxor %ymm6,%ymm3,%ymm3");
asm volatile("vpxor %ymm12,%ymm10,%ymm10");
asm volatile("vpxor %ymm14,%ymm11,%ymm11");
/* P/Q data pages */
for (z = z0-1 ; z >= start ; z--) {
asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
asm volatile("vpxor %ymm5,%ymm5,%ymm5");
asm volatile("vpxor %ymm7,%ymm7,%ymm7");
asm volatile("vpxor %ymm13,%ymm13,%ymm13");
asm volatile("vpxor %ymm15,%ymm15,%ymm15");
asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
asm volatile("vpand %ymm0,%ymm5,%ymm5");
asm volatile("vpand %ymm0,%ymm7,%ymm7");
asm volatile("vpand %ymm0,%ymm13,%ymm13");
asm volatile("vpand %ymm0,%ymm15,%ymm15");
asm volatile("vpxor %ymm5,%ymm4,%ymm4");
asm volatile("vpxor %ymm7,%ymm6,%ymm6");
asm volatile("vpxor %ymm13,%ymm12,%ymm12");
asm volatile("vpxor %ymm15,%ymm14,%ymm14");
asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
asm volatile("vmovdqa %0,%%ymm7"
:: "m" (dptr[z][d+32]));
asm volatile("vmovdqa %0,%%ymm13"
:: "m" (dptr[z][d+64]));
asm volatile("vmovdqa %0,%%ymm15"
:: "m" (dptr[z][d+96]));
asm volatile("vpxor %ymm5,%ymm2,%ymm2");
asm volatile("vpxor %ymm7,%ymm3,%ymm3");
asm volatile("vpxor %ymm13,%ymm10,%ymm10");
asm volatile("vpxor %ymm15,%ymm11,%ymm11");
asm volatile("vpxor %ymm5,%ymm4,%ymm4");
asm volatile("vpxor %ymm7,%ymm6,%ymm6");
asm volatile("vpxor %ymm13,%ymm12,%ymm12");
asm volatile("vpxor %ymm15,%ymm14,%ymm14");
}
asm volatile("prefetchnta %0" :: "m" (q[d]));
asm volatile("prefetchnta %0" :: "m" (q[d+64]));
/* P/Q left side optimization */
for (z = start-1 ; z >= 0 ; z--) {
asm volatile("vpxor %ymm5,%ymm5,%ymm5");
asm volatile("vpxor %ymm7,%ymm7,%ymm7");
asm volatile("vpxor %ymm13,%ymm13,%ymm13");
asm volatile("vpxor %ymm15,%ymm15,%ymm15");
asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
asm volatile("vpand %ymm0,%ymm5,%ymm5");
asm volatile("vpand %ymm0,%ymm7,%ymm7");
asm volatile("vpand %ymm0,%ymm13,%ymm13");
asm volatile("vpand %ymm0,%ymm15,%ymm15");
asm volatile("vpxor %ymm5,%ymm4,%ymm4");
asm volatile("vpxor %ymm7,%ymm6,%ymm6");
asm volatile("vpxor %ymm13,%ymm12,%ymm12");
asm volatile("vpxor %ymm15,%ymm14,%ymm14");
}
asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
}
asm volatile("sfence" : : : "memory");
kernel_fpu_end();
}
const struct raid6_calls raid6_avx2x4 = { const struct raid6_calls raid6_avx2x4 = {
raid6_avx24_gen_syndrome, raid6_avx24_gen_syndrome,
NULL, /* XOR not yet implemented */ raid6_avx24_xor_syndrome,
raid6_have_avx2, raid6_have_avx2,
"avx2x4", "avx2x4",
1 /* Has cache hints */ 1 /* Has cache hints */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
新手
引导
客服 返回
顶部