Commit 2a4c32ed, authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:

 - a raid5 writeback cache feature.

   The goal is to aggregate writes into full-stripe writes and reduce
   read-modify-write cycles. It helps workloads that, for example, do
   sequential writes followed by fsync (a worked sketch follows this
   list). This feature is experimental and off by default right now.

 - FAILFAST support.

   This fails I/Os to broken raid disks quickly, which can improve
   latency (see the bio-flag sketch after this list). It's aimed mainly
   at DASD storage, but some of the patches help normal raid arrays
   too.

 - support bad blocks for raid arrays with external metadata

 - AVX2 instruction support for raid6 parity calculation (see the
   parity sketch after this list)

 - normalize MD info output

 - add missing blktrace

 - other bug fixes
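
A concrete illustration of the full-stripe idea (not code from this
series; the chunk size and disk count are assumed for the example):
with 64 KiB chunks on a 4+1 array, a stripe holds 256 KiB of data, and
only writes covering whole stripes let parity be computed from the new
data alone, with no read-modify-write:

#include <stdbool.h>
#include <stdint.h>

#define CHUNK_SECTORS  128                        /* 64 KiB chunks (assumed) */
#define DATA_DISKS     4                          /* 4+1 array (assumed) */
#define STRIPE_SECTORS (CHUNK_SECTORS * DATA_DISKS)

/* Hypothetical sketch: does a write span whole RAID5 stripes? */
static bool is_full_stripe_write(uint64_t sector, uint32_t nr_sectors)
{
    /* Stripe-aligned start and a whole number of stripes long. */
    return (sector % STRIPE_SECTORS) == 0 &&
           nr_sectors != 0 &&
           (nr_sectors % STRIPE_SECTORS) == 0;
}

The writeback cache holds sub-stripe writes in the journal until enough
neighbouring data has arrived to pass a test along these lines.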
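
The FAILFAST plumbing reuses the block layer's existing fail-fast
request flags; md.h in the diff below defines MD_FAILFAST as the OR of
two of them. A minimal kernel-context sketch of tagging a bio this way
(submit_md_bio and do_failfast are illustrative names, not this
series' API):

#include <linux/blkdev.h>

/* MD_FAILFAST as defined in md.h by this series. */
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)

static void submit_md_bio(struct bio *bio, bool do_failfast)
{
    if (do_failfast)
        bio->bi_opf |= MD_FAILFAST;  /* fail fast, zero retries */
    generic_make_request(bio);
}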
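
For the AVX2 raid6 work, the parity math itself is unchanged: P is the
XOR of all data blocks and Q is a Reed-Solomon syndrome over GF(2^8).
A byte-at-a-time scalar sketch of what the vectorized routines compute
32 bytes at a time (function and parameter names are illustrative, not
the kernel's raid6 API):

#include <stddef.h>
#include <stdint.h>

/* Multiply by x (i.e. by 2) in GF(2^8) mod x^8+x^4+x^3+x^2+1. */
static uint8_t gf_mul2(uint8_t v)
{
    return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* data[z][i] is byte i of data disk z; p and q receive the syndromes. */
static void gen_syndrome(int data_disks, size_t bytes,
                         uint8_t **data, uint8_t *p, uint8_t *q)
{
    for (size_t i = 0; i < bytes; i++) {
        uint8_t wp = data[data_disks - 1][i];
        uint8_t wq = wp;
        for (int z = data_disks - 2; z >= 0; z--) {
            wp ^= data[z][i];                /* P: plain XOR   */
            wq = gf_mul2(wq) ^ data[z][i];   /* Q: Horner step */
        }
        p[i] = wp;
        q[i] = wq;
    }
}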

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (66 commits)
  md: separate flags for superblock changes
  md: MD_RECOVERY_NEEDED is set for mddev->recovery
  md: takeover should clear unrelated bits
  md/r5cache: after recovery, increase journal seq by 10000
  md/raid5-cache: fix crc in rewrite_data_only_stripes()
  md/raid5-cache: no recovery is required when create super-block
  md: fix refcount problem on mddev when stopping array.
  md/r5cache: do r5c_update_log_state after log recovery
  md/raid5-cache: adjust the write position of the empty block if no data blocks
  md/r5cache: run_no_space_stripes() when R5C_LOG_CRITICAL == 0
  md/raid5: limit request size according to implementation limits
  md/raid5-cache: do not need to set STRIPE_PREREAD_ACTIVE repeatedly
  md/raid5-cache: remove the unnecessary next_cp_seq field from the r5l_log
  md/raid5-cache: release the stripe_head at the appropriate location
  md/raid5-cache: use ring add to prevent overflow
  md/raid5-cache: remove unnecessary function parameters
  raid5-cache: don't set STRIPE_R5C_PARTIAL_STRIPE flag while load stripe into cache
  raid5-cache: add another check condition before replaying one stripe
  md/r5cache: enable IRQs on error path
  md/r5cache: handle alloc_page failure
  ...
@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
 #include <linux/seq_file.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "bitmap.h"
@@ -208,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-    struct md_rdev *rdev = NULL;
+    struct md_rdev *rdev;
     struct block_device *bdev;
     struct mddev *mddev = bitmap->mddev;
     struct bitmap_storage *store = &bitmap->storage;

+restart:
+    rdev = NULL;
     while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
         int size = PAGE_SIZE;
         loff_t offset = mddev->bitmap_info.offset;
@@ -268,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
                 page);
     }

-    if (wait)
-        md_super_wait(mddev);
+    if (wait && md_super_wait(mddev) < 0)
+        goto restart;
     return 0;

 bad_alignment:
@@ -405,7 +408,7 @@ static int read_page(struct file *file, unsigned long index,
     ret = -EIO;
 out:
     if (ret)
-        printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
+        pr_err("md: bitmap read error: (%dB @ %llu): %d\n",
             (int)PAGE_SIZE,
             (unsigned long long)index << PAGE_SHIFT,
             ret);
@@ -416,6 +419,28 @@ static int read_page(struct file *file, unsigned long index,
  * bitmap file superblock operations
  */

+/*
+ * bitmap_wait_writes() should be called before writing any bitmap
+ * blocks, to ensure previous writes, particularly from
+ * bitmap_daemon_work(), have completed.
+ */
+static void bitmap_wait_writes(struct bitmap *bitmap)
+{
+    if (bitmap->storage.file)
+        wait_event(bitmap->write_wait,
+               atomic_read(&bitmap->pending_writes)==0);
+    else
+        /* Note that we ignore the return value.  The writes
+         * might have failed, but that would just mean that
+         * some bits which should be cleared haven't been,
+         * which is safe.  The relevant bitmap blocks will
+         * probably get written again, but there is no great
+         * loss if they aren't.
+         */
+        md_super_wait(bitmap->mddev);
+}
+
 /* update the event counter and sync the superblock to disk */
 void bitmap_update_sb(struct bitmap *bitmap)
 {
@@ -455,24 +480,24 @@ void bitmap_print_sb(struct bitmap *bitmap)
     if (!bitmap || !bitmap->storage.sb_page)
         return;
     sb = kmap_atomic(bitmap->storage.sb_page);
-    printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
-    printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
-    printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
-    printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n",
+    pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
+    pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
+    pr_debug(" version: %d\n", le32_to_cpu(sb->version));
+    pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
         *(__u32 *)(sb->uuid+0),
         *(__u32 *)(sb->uuid+4),
         *(__u32 *)(sb->uuid+8),
         *(__u32 *)(sb->uuid+12));
-    printk(KERN_DEBUG " events: %llu\n",
+    pr_debug(" events: %llu\n",
         (unsigned long long) le64_to_cpu(sb->events));
-    printk(KERN_DEBUG "events cleared: %llu\n",
+    pr_debug("events cleared: %llu\n",
         (unsigned long long) le64_to_cpu(sb->events_cleared));
-    printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state));
-    printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize));
-    printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
-    printk(KERN_DEBUG " sync size: %llu KB\n",
+    pr_debug(" state: %08x\n", le32_to_cpu(sb->state));
+    pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize));
+    pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
+    pr_debug(" sync size: %llu KB\n",
         (unsigned long long)le64_to_cpu(sb->sync_size)/2);
-    printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
+    pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
     kunmap_atomic(sb);
 }
@@ -506,14 +531,14 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
     BUG_ON(!chunksize);
     if (!is_power_of_2(chunksize)) {
         kunmap_atomic(sb);
-        printk(KERN_ERR "bitmap chunksize not a power of 2\n");
+        pr_warn("bitmap chunksize not a power of 2\n");
         return -EINVAL;
     }
     sb->chunksize = cpu_to_le32(chunksize);

     daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
     if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
-        printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n");
+        pr_debug("Choosing daemon_sleep default (5 sec)\n");
         daemon_sleep = 5 * HZ;
     }
     sb->daemon_sleep = cpu_to_le32(daemon_sleep);
@@ -584,7 +609,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
         /* to 4k blocks */
         bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
         offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
-        pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+        pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
             bitmap->cluster_slot, offset);
     }
@@ -634,7 +659,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
     else if (write_behind > COUNTER_MAX)
         reason = "write-behind limit out of range (0 - 16383)";
     if (reason) {
-        printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
+        pr_warn("%s: invalid bitmap file superblock: %s\n",
             bmname(bitmap), reason);
         goto out;
     }
@@ -648,16 +673,13 @@ static int bitmap_read_sb(struct bitmap *bitmap)
      * bitmap's UUID and event counter to the mddev's
      */
     if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
-        printk(KERN_INFO
-               "%s: bitmap superblock UUID mismatch\n",
-               bmname(bitmap));
+        pr_warn("%s: bitmap superblock UUID mismatch\n",
+            bmname(bitmap));
         goto out;
     }
     events = le64_to_cpu(sb->events);
     if (!nodes && (events < bitmap->mddev->events)) {
-        printk(KERN_INFO
-               "%s: bitmap file is out of date (%llu < %llu) "
-               "-- forcing full recovery\n",
+        pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
             bmname(bitmap), events,
             (unsigned long long) bitmap->mddev->events);
         set_bit(BITMAP_STALE, &bitmap->flags);
@@ -679,7 +701,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
     if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
         err = md_setup_cluster(bitmap->mddev, nodes);
         if (err) {
-            pr_err("%s: Could not setup cluster service (%d)\n",
+            pr_warn("%s: Could not setup cluster service (%d)\n",
                 bmname(bitmap), err);
             goto out_no_sb;
         }
@@ -847,14 +869,12 @@ static void bitmap_file_kick(struct bitmap *bitmap)
             ptr = file_path(bitmap->storage.file,
                     path, PAGE_SIZE);

-            printk(KERN_ALERT
-                   "%s: kicking failed bitmap file %s from array!\n",
+            pr_warn("%s: kicking failed bitmap file %s from array!\n",
                 bmname(bitmap), IS_ERR(ptr) ? "" : ptr);

             kfree(path);
         } else
-            printk(KERN_ALERT
-                   "%s: disabling internal bitmap due to errors\n",
+            pr_warn("%s: disabling internal bitmap due to errors\n",
                 bmname(bitmap));
     }
 }
@@ -983,6 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap)
 {
     unsigned long i;
     int dirty, need_write;
+    int writing = 0;

     if (!bitmap || !bitmap->storage.filemap ||
         test_bit(BITMAP_STALE, &bitmap->flags))
@@ -997,15 +1018,19 @@ void bitmap_unplug(struct bitmap *bitmap)
         need_write = test_and_clear_page_attr(bitmap, i,
                               BITMAP_PAGE_NEEDWRITE);
         if (dirty || need_write) {
+            if (!writing) {
+                bitmap_wait_writes(bitmap);
+                if (bitmap->mddev->queue)
+                    blk_add_trace_msg(bitmap->mddev->queue,
+                              "md bitmap_unplug");
+            }
             clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
             write_page(bitmap, bitmap->storage.filemap[i], 0);
+            writing = 1;
         }
     }
-    if (bitmap->storage.file)
-        wait_event(bitmap->write_wait,
-               atomic_read(&bitmap->pending_writes)==0);
-    else
-        md_super_wait(bitmap->mddev);
+    if (writing)
+        bitmap_wait_writes(bitmap);

     if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
         bitmap_file_kick(bitmap);
@@ -1056,11 +1081,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
     outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
     if (outofdate)
-        printk(KERN_INFO "%s: bitmap file is out of date, doing full "
-               "recovery\n", bmname(bitmap));
+        pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap));

     if (file && i_size_read(file->f_mapping->host) < store->bytes) {
-        printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
+        pr_warn("%s: bitmap file too short %lu < %lu\n",
             bmname(bitmap),
             (unsigned long) i_size_read(file->f_mapping->host),
             store->bytes);
@@ -1137,15 +1161,14 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
         offset = 0;
     }

-    printk(KERN_INFO "%s: bitmap initialized from disk: "
-           "read %lu pages, set %lu of %lu bits\n",
+    pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
         bmname(bitmap), store->file_pages,
         bit_cnt, chunks);

     return 0;

 err:
-    printk(KERN_INFO "%s: bitmap initialisation failed: %d\n",
+    pr_warn("%s: bitmap initialisation failed: %d\n",
         bmname(bitmap), ret);
     return ret;
 }
@@ -1225,6 +1248,10 @@ void bitmap_daemon_work(struct mddev *mddev)
     }
     bitmap->allclean = 1;

+    if (bitmap->mddev->queue)
+        blk_add_trace_msg(bitmap->mddev->queue,
+                  "md bitmap_daemon_work");
+
     /* Any file-page which is PENDING now needs to be written.
      * So set NEEDWRITE now, then after we make any last-minute changes
      * we will write it.
@@ -1289,6 +1316,7 @@ void bitmap_daemon_work(struct mddev *mddev)
     }
     spin_unlock_irq(&counts->lock);

+    bitmap_wait_writes(bitmap);
     /* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
      * DIRTY pages need to be written by bitmap_unplug so it can wait
      * for them.
@@ -1595,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
            atomic_read(&bitmap->mddev->recovery_active) == 0);

     bitmap->mddev->curr_resync_completed = sector;
-    set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
+    set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags);
     sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
     s = 0;
     while (s < sector && s < bitmap->mddev->resync_max_sectors) {
@@ -1825,7 +1853,7 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
     if (err)
         goto error;

-    printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
+    pr_debug("created bitmap (%lu pages) for device %s\n",
         bitmap->counts.pages, bmname(bitmap));

     err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
@@ -2029,8 +2057,10 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
             !bitmap->mddev->bitmap_info.external,
             mddev_is_clustered(bitmap->mddev)
             ? bitmap->cluster_slot : 0);
-    if (ret)
+    if (ret) {
+        bitmap_file_unmap(&store);
         goto err;
+    }

     pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
@@ -2089,7 +2119,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
                 bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
                                          BITMAP_BLOCK_SHIFT);
                 blocks = old_counts.chunks << old_counts.chunkshift;
-                pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n");
+                pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
                 break;
             } else
                 bitmap->counts.bp[page].count += 1;
@@ -2266,7 +2296,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
                 /* Ensure new bitmap info is stored in
                  * metadata promptly.
                  */
-                set_bit(MD_CHANGE_DEVS, &mddev->flags);
+                set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
                 md_wakeup_thread(mddev->thread);
             }
             rv = 0;
...
@@ -2011,7 +2011,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
         sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);

         /* Force writing of superblocks to disk */
-        set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+        set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags);

         /* Any superblock is better than none, choose that if given */
         return refdev ? 0 : 1;
@@ -3497,7 +3497,7 @@ static void rs_update_sbs(struct raid_set *rs)
     struct mddev *mddev = &rs->md;
     int ro = mddev->ro;

-    set_bit(MD_CHANGE_DEVS, &mddev->flags);
+    set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
     mddev->ro = 0;
     md_update_sb(mddev, 1);
     mddev->ro = ro;
...
@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "linear.h"
@@ -101,7 +102,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
         sector_t sectors;

         if (j < 0 || j >= raid_disks || disk->rdev) {
-            printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n",
+            pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
                 mdname(mddev));
             goto out;
         }
@@ -123,7 +124,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
             discard_supported = true;
     }
     if (cnt != raid_disks) {
-        printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
+        pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
             mdname(mddev));
         goto out;
     }
@@ -227,22 +228,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
     }

     do {
-        tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
+        sector_t bio_sector = bio->bi_iter.bi_sector;
+        tmp_dev = which_dev(mddev, bio_sector);
         start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
         end_sector = tmp_dev->end_sector;
         data_offset = tmp_dev->rdev->data_offset;
         bio->bi_bdev = tmp_dev->rdev->bdev;

-        if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
-                 bio->bi_iter.bi_sector < start_sector))
+        if (unlikely(bio_sector >= end_sector ||
+                 bio_sector < start_sector))
             goto out_of_bounds;

         if (unlikely(bio_end_sector(bio) > end_sector)) {
             /* This bio crosses a device boundary, so we have to
              * split it.
              */
-            split = bio_split(bio, end_sector -
-                      bio->bi_iter.bi_sector,
+            split = bio_split(bio, end_sector - bio_sector,
                       GFP_NOIO, fs_bio_set);
             bio_chain(split, bio);
         } else {
@@ -256,15 +257,18 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
                  !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
             /* Just ignore it */
             bio_endio(split);
-        } else
+        } else {
+            if (mddev->gendisk)
+                trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
+                              split, disk_devt(mddev->gendisk),
+                              bio_sector);
             generic_make_request(split);
+        }
     } while (split != bio);
     return;

 out_of_bounds:
-    printk(KERN_ERR
-           "md/linear:%s: make_request: Sector %llu out of bounds on "
-           "dev %s: %llu sectors, offset %llu\n",
+    pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
         mdname(mddev),
         (unsigned long long)bio->bi_iter.bi_sector,
         bdevname(tmp_dev->rdev->bdev, b),
@@ -275,7 +279,6 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)

 static void linear_status (struct seq_file *seq, struct mddev *mddev)
 {
     seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
 }
...
This diff is collapsed.
@@ -29,6 +29,16 @@

 #define MaxSector (~(sector_t)0)

+/*
+ * These flags should really be called "NO_RETRY" rather than
+ * "FAILFAST" because they don't make any promise about time lapse,
+ * only about the number of retries, which will be zero.
+ * REQ_FAILFAST_DRIVER is not included because
+ * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
+ * seems to suggest that the errors it avoids retrying should usually
+ * be retried.
+ */
+#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
 /*
  * MD's 'extended' device
  */
@@ -168,6 +178,19 @@ enum flag_bits {
                  * so it is safe to remove without
                  * another synchronize_rcu() call.
                  */
+    ExternalBbl,        /* External metadata provides bad
+                 * block management for a disk
+                 */
+    FailFast,        /* Minimal retries should be attempted on
+                 * this device, so use REQ_FAILFAST_DEV.
+                 * Also don't try to repair failed reads.
+                 * It is expects that no bad block log
+                 * is present.
+                 */
+    LastDev,        /* Seems to be the last working dev as
+                 * it didn't fail, so don't use FailFast
+                 * any more for metadata
+                 */
 };
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -189,6 +212,31 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
                 int is_new);
 struct md_cluster_info;

+enum mddev_flags {
+    MD_ARRAY_FIRST_USE,    /* First use of array, needs initialization */
+    MD_CLOSING,        /* If set, we are closing the array, do not open
+                 * it then */
+    MD_JOURNAL_CLEAN,    /* A raid with journal is already clean */
+    MD_HAS_JOURNAL,        /* The raid array has journal feature set */
+    MD_RELOAD_SB,        /* Reload the superblock because another node
+                 * updated it.
+                 */
+    MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
+                   * already took resync lock, need to
+                   * release the lock */
+    MD_FAILFAST_SUPPORTED,    /* Using MD_FAILFAST on metadata writes is
+                 * supported as calls to md_error() will
+                 * never cause the array to become failed.
+                 */
+};
+
+enum mddev_sb_flags {
+    MD_SB_CHANGE_DEVS,    /* Some device status has changed */
+    MD_SB_CHANGE_CLEAN,    /* transition to or from 'clean' */
+    MD_SB_CHANGE_PENDING,    /* switch from 'clean' to 'active' in progress */
+    MD_SB_NEED_REWRITE,    /* metadata write needs to be repeated */
+};
+
 struct mddev {
     void *private;
     struct md_personality *pers;
@@ -196,21 +244,7 @@ struct mddev {
     int md_minor;
     struct list_head disks;
     unsigned long flags;
-#define MD_CHANGE_DEVS 0    /* Some device status has changed */
-#define MD_CHANGE_CLEAN 1    /* transition to or from 'clean' */
-#define MD_CHANGE_PENDING 2    /* switch from 'clean' to 'active' in progress */
-#define MD_UPDATE_SB_FLAGS (1 | 2 | 4)    /* If these are set, md_update_sb needed */
-#define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
-#define MD_CLOSING 4        /* If set, we are closing the array, do not open
-                 * it then */
-#define MD_JOURNAL_CLEAN 5    /* A raid with journal is already clean */
-#define MD_HAS_JOURNAL 6    /* The raid array has journal feature set */
-#define MD_RELOAD_SB 7        /* Reload the superblock because another node
-                 * updated it.
-                 */
-#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node
-                    * already took resync lock, need to
-                    * release the lock */
+    unsigned long sb_flags;

     int suspended;
     atomic_t active_io;
@@ -304,31 +338,6 @@ struct mddev {
     int parallel_resync;

     int ok_start_degraded;
-    /* recovery/resync flags
-     * NEEDED:   we might need to start a resync/recover
-     * RUNNING:  a thread is running, or about to be started
-     * SYNC:     actually doing a resync, not a recovery
-     * RECOVER:  doing recovery, or need to try it.
-     * INTR:     resync needs to be aborted for some reason
-     * DONE:     thread is done and is waiting to be reaped
-     * REQUEST:  user-space has requested a sync (used with SYNC)
-     * CHECK:    user-space request for check-only, no repair
-     * RESHAPE:  A reshape is happening
-     * ERROR:    sync-action interrupted because io-error
-     *
-     * If neither SYNC or RESHAPE are set, then it is a recovery.
-     */
-#define MD_RECOVERY_RUNNING    0
-#define MD_RECOVERY_SYNC    1
-#define MD_RECOVERY_RECOVER    2
-#define MD_RECOVERY_INTR    3
-#define MD_RECOVERY_DONE    4
-#define MD_RECOVERY_NEEDED    5
-#define MD_RECOVERY_REQUESTED    6
-#define MD_RECOVERY_CHECK    7
-#define MD_RECOVERY_RESHAPE    8
-#define MD_RECOVERY_FROZEN    9
-#define MD_RECOVERY_ERROR    10
-
     unsigned long recovery;
     /* If a RAID personality determines that recovery (of a particular
@@ -442,6 +451,23 @@ struct mddev {
     unsigned int good_device_nr;    /* good device num within cluster raid */
 };
+enum recovery_flags {
+    /*
+     * If neither SYNC or RESHAPE are set, then it is a recovery.
+     */
+    MD_RECOVERY_RUNNING,    /* a thread is running, or about to be started */
+    MD_RECOVERY_SYNC,    /* actually doing a resync, not a recovery */
+    MD_RECOVERY_RECOVER,    /* doing recovery, or need to try it. */
+    MD_RECOVERY_INTR,    /* resync needs to be aborted for some reason */
+    MD_RECOVERY_DONE,    /* thread is done and is waiting to be reaped */
+    MD_RECOVERY_NEEDED,    /* we might need to start a resync/recover */
+    MD_RECOVERY_REQUESTED,    /* user-space has requested a sync (used with SYNC) */
+    MD_RECOVERY_CHECK,    /* user-space request for check-only, no repair */
+    MD_RECOVERY_RESHAPE,    /* A reshape is happening */
+    MD_RECOVERY_FROZEN,    /* User request to abort, and not restart, any action */
+    MD_RECOVERY_ERROR,    /* sync-action interrupted because io-error */
+};
+
 static inline int __must_check mddev_lock(struct mddev *mddev)
 {
     return mutex_lock_interruptible(&mddev->reconfig_mutex);
@@ -623,7 +649,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
                sector_t sector, int size, struct page *page);
-extern void md_super_wait(struct mddev *mddev);
+extern int md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
             struct page *page, int op, int op_flags,
             bool metadata_op);
...
@@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf)
     }
     rcu_read_unlock();

-    printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
+    pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
     return (-1);
 }

@@ -97,7 +97,7 @@ static void multipath_end_request(struct bio *bio)
          */
         char b[BDEVNAME_SIZE];
         md_error (mp_bh->mddev, rdev);
-        printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
+        pr_info("multipath: %s: rescheduling sector %llu\n",
             bdevname(rdev->bdev,b),
             (unsigned long long)bio->bi_iter.bi_sector);
         multipath_reschedule_retry(mp_bh);
@@ -194,8 +194,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
          * first check if this is a queued request for a device
          * which has just failed.
          */
-        printk(KERN_ALERT
-               "multipath: only one IO path left and IO error.\n");
+        pr_warn("multipath: only one IO path left and IO error.\n");
         /* leave it active... it's all we have */
         return;
     }
@@ -209,11 +208,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
         spin_unlock_irqrestore(&conf->device_lock, flags);
     }
     set_bit(Faulty, &rdev->flags);
-    set_bit(MD_CHANGE_DEVS, &mddev->flags);
-    printk(KERN_ALERT "multipath: IO failure on %s,"
-           " disabling IO path.\n"
-           "multipath: Operation continuing"
-           " on %d IO paths.\n",
+    set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+    pr_err("multipath: IO failure on %s, disabling IO path.\n"
+           "multipath: Operation continuing on %d IO paths.\n",
         bdevname(rdev->bdev, b),
         conf->raid_disks - mddev->degraded);
 }
@@ -223,19 +220,19 @@ static void print_multipath_conf (struct mpconf *conf)
     int i;
     struct multipath_info *tmp;

-    printk("MULTIPATH conf printout:\n");
+    pr_debug("MULTIPATH conf printout:\n");
     if (!conf) {
-        printk("(conf==NULL)\n");
+        pr_debug("(conf==NULL)\n");
         return;
     }
-    printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+    pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
         conf->raid_disks);

     for (i = 0; i < conf->raid_disks; i++) {
         char b[BDEVNAME_SIZE];
         tmp = conf->multipaths + i;
         if (tmp->rdev)
-            printk(" disk%d, o:%d, dev:%s\n",
+            pr_debug(" disk%d, o:%d, dev:%s\n",
                 i,!test_bit(Faulty, &tmp->rdev->flags),
                 bdevname(tmp->rdev->bdev,b));
     }
@@ -292,8 +289,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
     if (rdev == p->rdev) {
         if (test_bit(In_sync, &rdev->flags) ||
             atomic_read(&rdev->nr_pending)) {
-            printk(KERN_ERR "hot-remove-disk, slot %d is identified"
-                   " but is still operational!\n", number);
+            pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
             err = -EBUSY;
             goto abort;
         }
bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
if ((mp_bh->path = multipath_map (conf))<0) { if ((mp_bh->path = multipath_map (conf))<0) {
printk(KERN_ALERT "multipath: %s: unrecoverable IO read" pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
" error for block %llu\n",
bdevname(bio->bi_bdev,b), bdevname(bio->bi_bdev,b),
(unsigned long long)bio->bi_iter.bi_sector); (unsigned long long)bio->bi_iter.bi_sector);
multipath_end_bh_io(mp_bh, -EIO); multipath_end_bh_io(mp_bh, -EIO);
} else { } else {
printk(KERN_ERR "multipath: %s: redirecting sector %llu" pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
" to another IO path\n",
bdevname(bio->bi_bdev,b), bdevname(bio->bi_bdev,b),
(unsigned long long)bio->bi_iter.bi_sector); (unsigned long long)bio->bi_iter.bi_sector);
*bio = *(mp_bh->master_bio); *bio = *(mp_bh->master_bio);
...@@ -389,7 +383,7 @@ static int multipath_run (struct mddev *mddev) ...@@ -389,7 +383,7 @@ static int multipath_run (struct mddev *mddev)
return -EINVAL; return -EINVAL;
if (mddev->level != LEVEL_MULTIPATH) { if (mddev->level != LEVEL_MULTIPATH) {
printk("multipath: %s: raid level not set to multipath IO (%d)\n", pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
mdname(mddev), mddev->level); mdname(mddev), mddev->level);
goto out; goto out;
} }
...@@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev) ...@@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev)
conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
mddev->private = conf; mddev->private = conf;
if (!conf) { if (!conf)
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
mdname(mddev));
goto out; goto out;
}
conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
GFP_KERNEL); GFP_KERNEL);
if (!conf->multipaths) { if (!conf->multipaths)
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
mdname(mddev));
goto out_free_conf; goto out_free_conf;
}
working_disks = 0; working_disks = 0;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
...@@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev) ...@@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev)
INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->retry_list);
if (!working_disks) { if (!working_disks) {
printk(KERN_ERR "multipath: no operational IO paths for %s\n", pr_warn("multipath: no operational IO paths for %s\n",
mdname(mddev)); mdname(mddev));
goto out_free_conf; goto out_free_conf;
} }
...@@ -447,25 +433,15 @@ static int multipath_run (struct mddev *mddev) ...@@ -447,25 +433,15 @@ static int multipath_run (struct mddev *mddev)
conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
sizeof(struct multipath_bh)); sizeof(struct multipath_bh));
if (conf->pool == NULL) { if (conf->pool == NULL)
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
mdname(mddev));
goto out_free_conf; goto out_free_conf;
}
{
mddev->thread = md_register_thread(multipathd, mddev, mddev->thread = md_register_thread(multipathd, mddev,
"multipath"); "multipath");
if (!mddev->thread) { if (!mddev->thread)
printk(KERN_ERR "multipath: couldn't allocate thread"
" for %s\n", mdname(mddev));
goto out_free_conf; goto out_free_conf;
}
}
printk(KERN_INFO pr_info("multipath: array %s active with %d out of %d IO paths\n",
"multipath: array %s active with %d out of %d IO paths\n",
mdname(mddev), conf->raid_disks - mddev->degraded, mdname(mddev), conf->raid_disks - mddev->degraded,
mddev->raid_disks); mddev->raid_disks);
/* /*
......
@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "raid0.h"
 #include "raid5.h"
@@ -51,20 +52,21 @@ static void dump_zones(struct mddev *mddev)
     char b[BDEVNAME_SIZE];
     struct r0conf *conf = mddev->private;
     int raid_disks = conf->strip_zone[0].nb_dev;
-    printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n",
+    pr_debug("md: RAID0 configuration for %s - %d zone%s\n",
         mdname(mddev),
         conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
     for (j = 0; j < conf->nr_strip_zones; j++) {
-        printk(KERN_INFO "md: zone%d=[", j);
+        char line[200];
+        int len = 0;
+
         for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
-            printk(KERN_CONT "%s%s", k?"/":"",
+            len += snprintf(line+len, 200-len, "%s%s", k?"/":"",
                 bdevname(conf->devlist[j*raid_disks
                             + k]->bdev, b));
-        printk(KERN_CONT "]\n");
+        pr_debug("md: zone%d=[%s]\n", j, line);

         zone_size = conf->strip_zone[j].zone_end - zone_start;
-        printk(KERN_INFO " zone-offset=%10lluKB, "
-                "device-offset=%10lluKB, size=%10lluKB\n",
+        pr_debug(" zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n",
             (unsigned long long)zone_start>>1,
             (unsigned long long)conf->strip_zone[j].dev_start>>1,
             (unsigned long long)zone_size>>1);
@@ -142,7 +144,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
      * chunk size is a multiple of that sector size
      */
     if ((mddev->chunk_sectors << 9) % blksize) {
-        printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+        pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
             mdname(mddev),
             mddev->chunk_sectors << 9, blksize);
         err = -EINVAL;
@@ -186,19 +188,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
         }

         if (j < 0) {
-            printk(KERN_ERR
-                   "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+            pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n",
                 mdname(mddev));
             goto abort;
         }
         if (j >= mddev->raid_disks) {
-            printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
-                   "aborting!\n", mdname(mddev), j);
+            pr_warn("md/raid0:%s: bad disk number %d - aborting!\n",
+                mdname(mddev), j);
             goto abort;
         }
         if (dev[j]) {
-            printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
-                   "aborting!\n", mdname(mddev), j);
+            pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n",
+                mdname(mddev), j);
             goto abort;
         }
         dev[j] = rdev1;
@@ -208,8 +209,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
         cnt++;
     }
     if (cnt != mddev->raid_disks) {
-        printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
-               "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
+        pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n",
+            mdname(mddev), cnt, mddev->raid_disks);
         goto abort;
     }
     zone->nb_dev = cnt;
@@ -357,8 +358,7 @@ static int raid0_run(struct mddev *mddev)
     int ret;

     if (mddev->chunk_sectors == 0) {
-        printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n",
-               mdname(mddev));
+        pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev));
         return -EINVAL;
     }
     if (md_check_no_bitmap(mddev))
@@ -399,7 +399,7 @@ static int raid0_run(struct mddev *mddev)
     /* calculate array device size */
     md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));

-    printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
+    pr_debug("md/raid0:%s: md_size is %llu sectors.\n",
         mdname(mddev),
         (unsigned long long)mddev->array_sectors);
} }
do { do {
sector_t sector = bio->bi_iter.bi_sector; sector_t bio_sector = bio->bi_iter.bi_sector;
sector_t sector = bio_sector;
unsigned chunk_sects = mddev->chunk_sectors; unsigned chunk_sects = mddev->chunk_sectors;
unsigned sectors = chunk_sects - unsigned sectors = chunk_sects -
...@@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) ...@@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
: sector_div(sector, chunk_sects)); : sector_div(sector, chunk_sects));
/* Restore due to sector_div */ /* Restore due to sector_div */
sector = bio->bi_iter.bi_sector; sector = bio_sector;
if (sectors < bio_sectors(bio)) { if (sectors < bio_sectors(bio)) {
split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
...@@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) ...@@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
!blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
/* Just ignore it */ /* Just ignore it */
bio_endio(split); bio_endio(split);
} else } else {
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
split, disk_devt(mddev->gendisk),
bio_sector);
generic_make_request(split); generic_make_request(split);
}
} while (split != bio); } while (split != bio);
} }
@@ -509,7 +515,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
     struct r0conf *priv_conf;

     if (mddev->degraded != 1) {
-        printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
+        pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
             mdname(mddev),
             mddev->degraded);
         return ERR_PTR(-EINVAL);
@@ -518,7 +524,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
     rdev_for_each(rdev, mddev) {
         /* check slot number for a disk */
         if (rdev->raid_disk == mddev->raid_disks-1) {
-            printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
+            pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n",
                 mdname(mddev));
             return ERR_PTR(-EINVAL);
         }
@@ -533,8 +539,11 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
     mddev->delta_disks = -1;
     /* make sure it will be not marked as dirty */
     mddev->recovery_cp = MaxSector;
+    clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+    clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);

     create_strip_zones(mddev, &priv_conf);
+
     return priv_conf;
 }
@@ -549,18 +558,18 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
      * - all mirrors must be already degraded
      */
     if (mddev->layout != ((1 << 8) + 2)) {
-        printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
+        pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
             mdname(mddev),
             mddev->layout);
         return ERR_PTR(-EINVAL);
     }
     if (mddev->raid_disks & 1) {
-        printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
+        pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
             mdname(mddev));
         return ERR_PTR(-EINVAL);
     }
     if (mddev->degraded != (mddev->raid_disks>>1)) {
-        printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n",
+        pr_warn("md/raid0:%s: All mirrors must be already degraded!\n",
             mdname(mddev));
         return ERR_PTR(-EINVAL);
     }
@@ -574,6 +583,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
     mddev->degraded = 0;
     /* make sure it will be not marked as dirty */
     mddev->recovery_cp = MaxSector;
+    clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

     create_strip_zones(mddev, &priv_conf);
     return priv_conf;
@@ -588,7 +598,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
      * - (N - 1) mirror drives must be already faulty
      */
     if ((mddev->raid_disks - 1) != mddev->degraded) {
-        printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
+        pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
             mdname(mddev));
         return ERR_PTR(-EINVAL);
     }
@@ -616,6 +626,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
     mddev->raid_disks = 1;
     /* make sure it will be not marked as dirty */
     mddev->recovery_cp = MaxSector;
+    clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

     create_strip_zones(mddev, &priv_conf);
     return priv_conf;
@@ -631,7 +642,7 @@ static void *raid0_takeover(struct mddev *mddev)
      */

     if (mddev->bitmap) {
-        printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n",
+        pr_warn("md/raid0: %s: cannot takeover array with bitmap\n",
             mdname(mddev));
         return ERR_PTR(-EBUSY);
     }
@@ -642,7 +653,7 @@ static void *raid0_takeover(struct mddev *mddev)
         if (mddev->layout == ALGORITHM_PARITY_N)
             return raid0_takeover_raid45(mddev);

-        printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
+        pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
             mdname(mddev), ALGORITHM_PARITY_N);
     }
@@ -652,7 +663,7 @@ static void *raid0_takeover(struct mddev *mddev)
     if (mddev->level == 1)
         return raid0_takeover_raid1(mddev);

-    printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n",
+    pr_warn("Takeover from raid%i to raid0 not supported\n",
         mddev->level);

     return ERR_PTR(-EINVAL);
...
This diff is collapsed.
...@@ -161,14 +161,15 @@ struct r1bio { ...@@ -161,14 +161,15 @@ struct r1bio {
}; };
/* bits for r1bio.state */ /* bits for r1bio.state */
#define R1BIO_Uptodate 0 enum r1bio_state {
#define R1BIO_IsSync 1 R1BIO_Uptodate,
#define R1BIO_Degraded 2 R1BIO_IsSync,
#define R1BIO_BehindIO 3 R1BIO_Degraded,
R1BIO_BehindIO,
/* Set ReadError on bios that experience a readerror so that /* Set ReadError on bios that experience a readerror so that
* raid1d knows what to do with them. * raid1d knows what to do with them.
*/ */
#define R1BIO_ReadError 4 R1BIO_ReadError,
/* For write-behind requests, we call bi_end_io when /* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing * the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when * any write was successful. Otherwise we call when
...@@ -176,10 +177,12 @@ struct r1bio { ...@@ -176,10 +177,12 @@ struct r1bio {
* with failure when last write completes (and all failed). * with failure when last write completes (and all failed).
* Record that bi_end_io was called with this flag... * Record that bi_end_io was called with this flag...
*/ */
#define R1BIO_Returned 6 R1BIO_Returned,
/* If a write for this request means we can clear some /* If a write for this request means we can clear some
* known-bad-block records, we set this flag * known-bad-block records, we set this flag
*/ */
#define R1BIO_MadeGood 7 R1BIO_MadeGood,
#define R1BIO_WriteError 8 R1BIO_WriteError,
R1BIO_FailFast,
};
#endif #endif
This diff is collapsed.
@@ -156,5 +156,7 @@ enum r10bio_state {
  * flag is set
  */
     R10BIO_Previous,
+/* failfast devices did receive failfast requests. */
+    R10BIO_FailFast,
 };
 #endif
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
@@ -84,6 +84,10 @@
 #define MD_DISK_CANDIDATE    5 /* disk is added as spare (local) until confirmed
                    * For clustered enviroments only.
                    */
+#define MD_DISK_FAILFAST    10 /* Send REQ_FAILFAST if there are multiple
+                    * devices available - and don't try to
+                    * correct read errors.
+                    */

 #define MD_DISK_WRITEMOSTLY    9 /* disk is "write-mostly" is RAID1 config.
                    * read requests will only be sent here in
@@ -265,8 +269,9 @@ struct mdp_superblock_1 {
     __le32    dev_number;    /* permanent identifier of this device - not role in raid */
     __le32    cnt_corrected_read; /* number of read errors that were corrected by re-writing */
     __u8    device_uuid[16]; /* user-space setable, ignored by kernel */
-    __u8    devflags;    /* per-device flags.  Only one defined...*/
+    __u8    devflags;    /* per-device flags.  Only two defined...*/
 #define WriteMostly1    1    /* mask for writemostly flag in above */
+#define FailFast1    2    /* Should avoid retries and fixups and just fail */

     /* Bad block log. If there are any bad blocks the feature flag is set.
      * If offset and size are non-zero, that space is reserved and available
      */
...
This diff is collapsed.