Linux-御风守护者 / linux
Commit 20737738
Authored Dec 13, 2016 by Shaohua Li

Merge branch 'md-next' into md-linus

Parents: b78b499a, 2953079c

16 changed files, with 3429 additions and 1262 deletions (+3429 −1262)
drivers/md/bitmap.c              +98    −68
drivers/md/dm-raid.c             +2     −2
drivers/md/linear.c              +17    −14
drivers/md/md.c                  +364   −337
drivers/md/md.h                  +67    −41
drivers/md/multipath.c           +34    −58
drivers/md/raid0.c               +59    −48
drivers/md/raid1.c               +165   −82
drivers/md/raid1.h               +11    −8
drivers/md/raid10.c              +189   −106
drivers/md/raid10.h              +2     −0
drivers/md/raid5-cache.c         +1645  −240
drivers/md/raid5.c               +381   −242
drivers/md/raid5.h               +160   −12
include/uapi/linux/raid/md_p.h   +6     −1
lib/raid6/avx2.c                 +229   −3
drivers/md/bitmap.c

@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
 #include <linux/seq_file.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "bitmap.h"

@@ -208,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-        struct md_rdev *rdev = NULL;
+        struct md_rdev *rdev;
         struct block_device *bdev;
         struct mddev *mddev = bitmap->mddev;
         struct bitmap_storage *store = &bitmap->storage;

+restart:
+        rdev = NULL;
         while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
                 int size = PAGE_SIZE;
                 loff_t offset = mddev->bitmap_info.offset;

@@ -268,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
                                page);
         }

-        if (wait)
-                md_super_wait(mddev);
+        if (wait && md_super_wait(mddev) < 0)
+                goto restart;
         return 0;

  bad_alignment:

@@ -405,10 +408,10 @@ static int read_page(struct file *file, unsigned long index,

         ret = -EIO;
 out:
         if (ret)
-                printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
-                        (int)PAGE_SIZE,
-                        (unsigned long long)index << PAGE_SHIFT,
-                        ret);
+                pr_err("md: bitmap read error: (%dB @ %llu): %d\n",
+                       (int)PAGE_SIZE,
+                       (unsigned long long)index << PAGE_SHIFT,
+                       ret);
         return ret;
 }

@@ -416,6 +419,28 @@ static int read_page(struct file *file, unsigned long index,
  * bitmap file superblock operations
  */

+/*
+ * bitmap_wait_writes() should be called before writing any bitmap
+ * blocks, to ensure previous writes, particularly from
+ * bitmap_daemon_work(), have completed.
+ */
+static void bitmap_wait_writes(struct bitmap *bitmap)
+{
+        if (bitmap->storage.file)
+                wait_event(bitmap->write_wait,
+                           atomic_read(&bitmap->pending_writes) == 0);
+        else
+                /* Note that we ignore the return value.  The writes
+                 * might have failed, but that would just mean that
+                 * some bits which should be cleared haven't been,
+                 * which is safe.  The relevant bitmap blocks will
+                 * probably get written again, but there is no great
+                 * loss if they aren't.
+                 */
+                md_super_wait(bitmap->mddev);
+}
+
 /* update the event counter and sync the superblock to disk */
 void bitmap_update_sb(struct bitmap *bitmap)
 {

@@ -455,24 +480,24 @@ void bitmap_print_sb(struct bitmap *bitmap)
         if (!bitmap || !bitmap->storage.sb_page)
                 return;
         sb = kmap_atomic(bitmap->storage.sb_page);
-        printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
-        printk(KERN_DEBUG "         magic: %08x\n", le32_to_cpu(sb->magic));
-        printk(KERN_DEBUG "       version: %d\n", le32_to_cpu(sb->version));
-        printk(KERN_DEBUG "          uuid: %08x.%08x.%08x.%08x\n",
-               *(__u32 *)(sb->uuid+0),
-               *(__u32 *)(sb->uuid+4),
-               *(__u32 *)(sb->uuid+8),
-               *(__u32 *)(sb->uuid+12));
-        printk(KERN_DEBUG "        events: %llu\n",
-               (unsigned long long) le64_to_cpu(sb->events));
-        printk(KERN_DEBUG "events cleared: %llu\n",
-               (unsigned long long) le64_to_cpu(sb->events_cleared));
-        printk(KERN_DEBUG "         state: %08x\n", le32_to_cpu(sb->state));
-        printk(KERN_DEBUG "     chunksize: %d B\n", le32_to_cpu(sb->chunksize));
-        printk(KERN_DEBUG "  daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
-        printk(KERN_DEBUG "     sync size: %llu KB\n",
-               (unsigned long long)le64_to_cpu(sb->sync_size)/2);
-        printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
+        pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
+        pr_debug("         magic: %08x\n", le32_to_cpu(sb->magic));
+        pr_debug("       version: %d\n", le32_to_cpu(sb->version));
+        pr_debug("          uuid: %08x.%08x.%08x.%08x\n",
+                 *(__u32 *)(sb->uuid+0),
+                 *(__u32 *)(sb->uuid+4),
+                 *(__u32 *)(sb->uuid+8),
+                 *(__u32 *)(sb->uuid+12));
+        pr_debug("        events: %llu\n",
+                 (unsigned long long) le64_to_cpu(sb->events));
+        pr_debug("events cleared: %llu\n",
+                 (unsigned long long) le64_to_cpu(sb->events_cleared));
+        pr_debug("         state: %08x\n", le32_to_cpu(sb->state));
+        pr_debug("     chunksize: %d B\n", le32_to_cpu(sb->chunksize));
+        pr_debug("  daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
+        pr_debug("     sync size: %llu KB\n",
+                 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
+        pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
         kunmap_atomic(sb);
 }

@@ -506,14 +531,14 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
         BUG_ON(!chunksize);
         if (!is_power_of_2(chunksize)) {
                 kunmap_atomic(sb);
-                printk(KERN_ERR "bitmap chunksize not a power of 2\n");
+                pr_warn("bitmap chunksize not a power of 2\n");
                 return -EINVAL;
         }
         sb->chunksize = cpu_to_le32(chunksize);

         daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
         if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
-                printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n");
+                pr_debug("Choosing daemon_sleep default (5 sec)\n");
                 daemon_sleep = 5 * HZ;
         }
         sb->daemon_sleep = cpu_to_le32(daemon_sleep);

@@ -584,7 +609,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
                 /* to 4k blocks */
                 bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
                 offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
-                pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+                pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
                         bitmap->cluster_slot, offset);
         }

@@ -634,7 +659,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
         else if (write_behind > COUNTER_MAX)
                 reason = "write-behind limit out of range (0 - 16383)";
         if (reason) {
-                printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
+                pr_warn("%s: invalid bitmap file superblock: %s\n",
                         bmname(bitmap), reason);
                 goto out;
         }

@@ -648,18 +673,15 @@ static int bitmap_read_sb(struct bitmap *bitmap)
          * bitmap's UUID and event counter to the mddev's
          */
         if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
-                printk(KERN_INFO
-                       "%s: bitmap superblock UUID mismatch\n",
-                       bmname(bitmap));
+                pr_warn("%s: bitmap superblock UUID mismatch\n",
+                        bmname(bitmap));
                 goto out;
         }
         events = le64_to_cpu(sb->events);
         if (!nodes && (events < bitmap->mddev->events)) {
-                printk(KERN_INFO
-                       "%s: bitmap file is out of date (%llu < %llu) "
-                       "-- forcing full recovery\n",
-                       bmname(bitmap), events,
-                       (unsigned long long) bitmap->mddev->events);
+                pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
+                        bmname(bitmap), events,
+                        (unsigned long long) bitmap->mddev->events);
                 set_bit(BITMAP_STALE, &bitmap->flags);
         }
 }

@@ -679,8 +701,8 @@ static int bitmap_read_sb(struct bitmap *bitmap)
         if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
                 err = md_setup_cluster(bitmap->mddev, nodes);
                 if (err) {
-                        pr_err("%s: Could not setup cluster service (%d)\n",
+                        pr_warn("%s: Could not setup cluster service (%d)\n",
                                bmname(bitmap), err);
                         goto out_no_sb;
                 }
                 bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);

@@ -847,15 +869,13 @@ static void bitmap_file_kick(struct bitmap *bitmap)
                         ptr = file_path(bitmap->storage.file,
                                         path, PAGE_SIZE);

-                        printk(KERN_ALERT
-                               "%s: kicking failed bitmap file %s from array!\n",
-                               bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
+                        pr_warn("%s: kicking failed bitmap file %s from array!\n",
+                                bmname(bitmap), IS_ERR(ptr) ? "" : ptr);

                         kfree(path);
                 } else
-                        printk(KERN_ALERT
-                               "%s: disabling internal bitmap due to errors\n",
-                               bmname(bitmap));
+                        pr_warn("%s: disabling internal bitmap due to errors\n",
+                                bmname(bitmap));
         }
 }

@@ -983,6 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap)
 {
         unsigned long i;
         int dirty, need_write;
+        int writing = 0;

         if (!bitmap || !bitmap->storage.filemap ||
             test_bit(BITMAP_STALE, &bitmap->flags))

@@ -997,15 +1018,19 @@ void bitmap_unplug(struct bitmap *bitmap)
                 need_write = test_and_clear_page_attr(bitmap, i,
                                                       BITMAP_PAGE_NEEDWRITE);
                 if (dirty || need_write) {
+                        if (!writing) {
+                                bitmap_wait_writes(bitmap);
+                                if (bitmap->mddev->queue)
+                                        blk_add_trace_msg(bitmap->mddev->queue,
+                                                          "md bitmap_unplug");
+                        }
                         clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
                         write_page(bitmap, bitmap->storage.filemap[i], 0);
+                        writing = 1;
                 }
         }
-        if (bitmap->storage.file)
-                wait_event(bitmap->write_wait,
-                           atomic_read(&bitmap->pending_writes) == 0);
-        else
-                md_super_wait(bitmap->mddev);
+        if (writing)
+                bitmap_wait_writes(bitmap);

         if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
                 bitmap_file_kick(bitmap);

@@ -1056,14 +1081,13 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)

         outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
         if (outofdate)
-                printk(KERN_INFO "%s: bitmap file is out of date, doing full "
-                       "recovery\n", bmname(bitmap));
+                pr_warn("%s: bitmap file is out of date, doing full recovery\n",
+                        bmname(bitmap));

         if (file && i_size_read(file->f_mapping->host) < store->bytes) {
-                printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
-                       bmname(bitmap),
-                       (unsigned long) i_size_read(file->f_mapping->host),
-                       store->bytes);
+                pr_warn("%s: bitmap file too short %lu < %lu\n",
+                        bmname(bitmap),
+                        (unsigned long) i_size_read(file->f_mapping->host),
+                        store->bytes);
                 goto err;
         }

@@ -1137,16 +1161,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
                 offset = 0;
         }

-        printk(KERN_INFO "%s: bitmap initialized from disk: "
-               "read %lu pages, set %lu of %lu bits\n",
-               bmname(bitmap), store->file_pages,
-               bit_cnt, chunks);
+        pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
+                 bmname(bitmap), store->file_pages,
+                 bit_cnt, chunks);

         return 0;

  err:
-        printk(KERN_INFO "%s: bitmap initialisation failed: %d\n",
-               bmname(bitmap), ret);
+        pr_warn("%s: bitmap initialisation failed: %d\n",
+                bmname(bitmap), ret);
         return ret;
 }

@@ -1225,6 +1248,10 @@ void bitmap_daemon_work(struct mddev *mddev)
         }
         bitmap->allclean = 1;

+        if (bitmap->mddev->queue)
+                blk_add_trace_msg(bitmap->mddev->queue,
+                                  "md bitmap_daemon_work");
+
         /* Any file-page which is PENDING now needs to be written.
          * So set NEEDWRITE now, then after we make any last-minute changes
          * we will write it.

@@ -1289,6 +1316,7 @@ void bitmap_daemon_work(struct mddev *mddev)
         }
         spin_unlock_irq(&counts->lock);

+        bitmap_wait_writes(bitmap);
         /* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
          * DIRTY pages need to be written by bitmap_unplug so it can wait
          * for them.

@@ -1595,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
                    atomic_read(&bitmap->mddev->recovery_active) == 0);

         bitmap->mddev->curr_resync_completed = sector;
-        set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
+        set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags);
         sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
         s = 0;
         while (s < sector && s < bitmap->mddev->resync_max_sectors) {

@@ -1825,8 +1853,8 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
         if (err)
                 goto error;

-        printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
-               bitmap->counts.pages, bmname(bitmap));
+        pr_debug("created bitmap (%lu pages) for device %s\n",
+                 bitmap->counts.pages, bmname(bitmap));

         err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
         if (err)

@@ -2029,8 +2057,10 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
                                    !bitmap->mddev->bitmap_info.external,
                                    mddev_is_clustered(bitmap->mddev)
                                    ? bitmap->cluster_slot : 0);
-        if (ret)
+        if (ret) {
+                bitmap_file_unmap(&store);
                 goto err;
+        }

         pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);

@@ -2089,7 +2119,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
                                 bitmap->mddev->bitmap_info.chunksize =
                                         1 << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT);
                                 blocks = old_counts.chunks << old_counts.chunkshift;
-                                pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n");
+                                pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
                                 break;
                         } else
                                 bitmap->counts.bp[page].count += 1;

@@ -2266,7 +2296,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
                                 /* Ensure new bitmap info is stored in
                                  * metadata promptly.
                                  */
-                                set_bit(MD_CHANGE_DEVS, &mddev->flags);
+                                set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
                                 md_wakeup_thread(mddev->thread);
                         }
                         rv = 0;
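For context on the bitmap_wait_writes() helper added above: for a file-backed bitmap it sleeps on bitmap->write_wait until the atomic pending_writes counter drains to zero, otherwise it falls back to md_super_wait(). Below is a minimal user-space analogue of that counter-plus-waitqueue pattern, using C11 atomics and a pthread condition variable; the names (write_begin, write_end, wait_writes) are illustrative stand-ins, not kernel APIs.

/* User-space sketch of the pending-writes/wait_event pattern behind
 * bitmap_wait_writes(). Illustrative only; not kernel code. */
#include <pthread.h>
#include <stdatomic.h>

static atomic_int pending_writes;            /* like bitmap->pending_writes */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t write_wait = PTHREAD_COND_INITIALIZER;

static void write_begin(void)                /* before each async write */
{
        atomic_fetch_add(&pending_writes, 1);
}

static void write_end(void)                  /* from each write completion */
{
        if (atomic_fetch_sub(&pending_writes, 1) == 1) {
                /* broadcast under the mutex so a waiter that just checked
                 * the counter cannot miss the wakeup */
                pthread_mutex_lock(&lock);
                pthread_cond_broadcast(&write_wait);
                pthread_mutex_unlock(&lock);
        }
}

static void wait_writes(void)                /* like bitmap_wait_writes() */
{
        pthread_mutex_lock(&lock);
        while (atomic_load(&pending_writes) != 0)
                pthread_cond_wait(&write_wait, &lock);
        pthread_mutex_unlock(&lock);
}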
drivers/md/dm-raid.c

@@ -2011,7 +2011,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
                 sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);

                 /* Force writing of superblocks to disk */
-                set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+                set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags);

         /* Any superblock is better than none, choose that if given */
         return refdev ? 0 : 1;

@@ -3497,7 +3497,7 @@ static void rs_update_sbs(struct raid_set *rs)
         struct mddev *mddev = &rs->md;
         int ro = mddev->ro;

-        set_bit(MD_CHANGE_DEVS, &mddev->flags);
+        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
         mddev->ro = 0;
         md_update_sb(mddev, 1);
         mddev->ro = ro;
drivers/md/linear.c

@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "linear.h"

@@ -101,8 +102,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
                 sector_t sectors;

                 if (j < 0 || j >= raid_disks || disk->rdev) {
-                        printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n",
-                               mdname(mddev));
+                        pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
+                                mdname(mddev));
                         goto out;
                 }

@@ -123,8 +124,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
                         discard_supported = true;
         }
         if (cnt != raid_disks) {
-                printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
-                       mdname(mddev));
+                pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
+                        mdname(mddev));
                 goto out;
         }

@@ -227,22 +228,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
         }

         do {
-                tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
+                sector_t bio_sector = bio->bi_iter.bi_sector;
+                tmp_dev = which_dev(mddev, bio_sector);
                 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
                 end_sector = tmp_dev->end_sector;
                 data_offset = tmp_dev->rdev->data_offset;
                 bio->bi_bdev = tmp_dev->rdev->bdev;

-                if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
-                             bio->bi_iter.bi_sector < start_sector))
+                if (unlikely(bio_sector >= end_sector ||
+                             bio_sector < start_sector))
                         goto out_of_bounds;

                 if (unlikely(bio_end_sector(bio) > end_sector)) {
                         /* This bio crosses a device boundary, so we have to
                          * split it.
                          */
-                        split = bio_split(bio, end_sector -
-                                          bio->bi_iter.bi_sector,
+                        split = bio_split(bio, end_sector - bio_sector,
                                           GFP_NOIO, fs_bio_set);
                         bio_chain(split, bio);
                 } else {

@@ -256,15 +257,18 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
                              !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
                         /* Just ignore it */
                         bio_endio(split);
-                } else
+                } else {
+                        if (mddev->gendisk)
+                                trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
+                                                      split, disk_devt(mddev->gendisk),
+                                                      bio_sector);
                         generic_make_request(split);
+                }
         } while (split != bio);
         return;

 out_of_bounds:
-        printk(KERN_ERR
-               "md/linear:%s: make_request: Sector %llu out of bounds on "
-               "dev %s: %llu sectors, offset %llu\n",
+        pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
                mdname(mddev),
                (unsigned long long)bio->bi_iter.bi_sector,
                bdevname(tmp_dev->rdev->bdev, b),

@@ -275,7 +279,6 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)

 static void linear_status (struct seq_file *seq, struct mddev *mddev)
 {
-
         seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
 }
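The reworked linear_make_request() loop above caches bi_sector in a local bio_sector before splitting, because bio_split()/bio_chain() advance the parent bio's iterator; the saved value is what the new trace_block_bio_remap() call needs. As a plain illustration of the same boundary-splitting walk, here is a hypothetical user-space sketch that cuts a sector range wherever it crosses the end of a member device; all names are illustrative, not kernel APIs.

/* Hypothetical sketch of the device-boundary split walk in
 * linear_make_request(). Not kernel code. */
#include <stdio.h>

struct dev_range { unsigned long long end_sector; }; /* cumulative end */

static void submit(int dev, unsigned long long sector, unsigned long long len)
{
        printf("dev%d: sector %llu, %llu sectors\n", dev, sector, len);
}

static void make_request(const struct dev_range *devs, int ndevs,
                         unsigned long long sector, unsigned long long len)
{
        int i = 0;

        while (len) {
                while (i < ndevs && sector >= devs[i].end_sector)
                        i++;                  /* find the device holding 'sector' */
                if (i == ndevs)
                        return;               /* out of bounds */
                unsigned long long chunk = devs[i].end_sector - sector;
                if (chunk > len)
                        chunk = len;          /* request fits inside this device */
                submit(i, sector, chunk);
                sector += chunk;              /* like bio_split() + bio_chain() */
                len -= chunk;
        }
}

int main(void)
{
        struct dev_range devs[] = { { 100 }, { 250 }, { 400 } };
        make_request(devs, 3, 90, 200);       /* splits at sectors 100 and 250 */
        return 0;
}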
drivers/md/md.c

(Diff collapsed in this capture; not expanded.)
drivers/md/md.h

@@ -29,6 +29,16 @@

 #define MaxSector (~(sector_t)0)

+/*
+ * These flags should really be called "NO_RETRY" rather than
+ * "FAILFAST" because they don't make any promise about time lapse,
+ * only about the number of retries, which will be zero.
+ * REQ_FAILFAST_DRIVER is not included because
+ * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
+ * seems to suggest that the errors it avoids retrying should usually
+ * be retried.
+ */
+#define MD_FAILFAST        (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
 /*
  * MD's 'extended' device
  */

@@ -168,6 +178,19 @@ enum flag_bits {
                                  * so it is safe to remove without
                                  * another synchronize_rcu() call.
                                  */
+        ExternalBbl,            /* External metadata provides bad
+                                 * block management for a disk
+                                 */
+        FailFast,               /* Minimal retries should be attempted on
+                                 * this device, so use REQ_FAILFAST_DEV.
+                                 * Also don't try to repair failed reads.
+                                 * It is expects that no bad block log
+                                 * is present.
+                                 */
+        LastDev,                /* Seems to be the last working dev as
+                                 * it didn't fail, so don't use FailFast
+                                 * any more for metadata
+                                 */
 };

 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,

@@ -189,6 +212,31 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
                                 int is_new);
 struct md_cluster_info;

+enum mddev_flags {
+        MD_ARRAY_FIRST_USE,     /* First use of array, needs initialization */
+        MD_CLOSING,             /* If set, we are closing the array, do not open
+                                 * it then */
+        MD_JOURNAL_CLEAN,       /* A raid with journal is already clean */
+        MD_HAS_JOURNAL,         /* The raid array has journal feature set */
+        MD_RELOAD_SB,           /* Reload the superblock because another node
+                                 * updated it.
+                                 */
+        MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
+                                   * already took resync lock, need to
+                                   * release the lock */
+        MD_FAILFAST_SUPPORTED,  /* Using MD_FAILFAST on metadata writes is
+                                 * supported as calls to md_error() will
+                                 * never cause the array to become failed.
+                                 */
+};
+
+enum mddev_sb_flags {
+        MD_SB_CHANGE_DEVS,      /* Some device status has changed */
+        MD_SB_CHANGE_CLEAN,     /* transition to or from 'clean' */
+        MD_SB_CHANGE_PENDING,   /* switch from 'clean' to 'active' in progress */
+        MD_SB_NEED_REWRITE,     /* metadata write needs to be repeated */
+};
+
 struct mddev {
         void                            *private;
         struct md_personality           *pers;

@@ -196,21 +244,7 @@ struct mddev {
         int                             md_minor;
         struct list_head                disks;
         unsigned long                   flags;
-#define MD_CHANGE_DEVS  0       /* Some device status has changed */
-#define MD_CHANGE_CLEAN 1       /* transition to or from 'clean' */
-#define MD_CHANGE_PENDING 2     /* switch from 'clean' to 'active' in progress */
-#define MD_UPDATE_SB_FLAGS (1 | 2 | 4)  /* If these are set, md_update_sb needed */
-#define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
-#define MD_CLOSING      4       /* If set, we are closing the array, do not open
-                                 * it then */
-#define MD_JOURNAL_CLEAN 5      /* A raid with journal is already clean */
-#define MD_HAS_JOURNAL  6       /* The raid array has journal feature set */
-#define MD_RELOAD_SB    7       /* Reload the superblock because another node
-                                 * updated it.
-                                 */
-#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node
-                                    * already took resync lock, need to
-                                    * release the lock */
+        unsigned long                   sb_flags;

         int                             suspended;
         atomic_t                        active_io;

@@ -304,31 +338,6 @@ struct mddev {
         int                             parallel_resync;

         int                             ok_start_degraded;
-        /* recovery/resync flags
-         * NEEDED:   we might need to start a resync/recover
-         * RUNNING:  a thread is running, or about to be started
-         * SYNC:     actually doing a resync, not a recovery
-         * RECOVER:  doing recovery, or need to try it.
-         * INTR:     resync needs to be aborted for some reason
-         * DONE:     thread is done and is waiting to be reaped
-         * REQUEST:  user-space has requested a sync (used with SYNC)
-         * CHECK:    user-space request for check-only, no repair
-         * RESHAPE:  A reshape is happening
-         * ERROR:    sync-action interrupted because io-error
-         *
-         * If neither SYNC or RESHAPE are set, then it is a recovery.
-         */
-#define MD_RECOVERY_RUNNING     0
-#define MD_RECOVERY_SYNC        1
-#define MD_RECOVERY_RECOVER     2
-#define MD_RECOVERY_INTR        3
-#define MD_RECOVERY_DONE        4
-#define MD_RECOVERY_NEEDED      5
-#define MD_RECOVERY_REQUESTED   6
-#define MD_RECOVERY_CHECK       7
-#define MD_RECOVERY_RESHAPE     8
-#define MD_RECOVERY_FROZEN      9
-#define MD_RECOVERY_ERROR       10

         unsigned long                   recovery;
         /* If a RAID personality determines that recovery (of a particular

@@ -442,6 +451,23 @@ struct mddev {
         unsigned int                    good_device_nr; /* good device num within cluster raid */
 };

+enum recovery_flags {
+        /*
+         * If neither SYNC or RESHAPE are set, then it is a recovery.
+         */
+        MD_RECOVERY_RUNNING,    /* a thread is running, or about to be started */
+        MD_RECOVERY_SYNC,       /* actually doing a resync, not a recovery */
+        MD_RECOVERY_RECOVER,    /* doing recovery, or need to try it. */
+        MD_RECOVERY_INTR,       /* resync needs to be aborted for some reason */
+        MD_RECOVERY_DONE,       /* thread is done and is waiting to be reaped */
+        MD_RECOVERY_NEEDED,     /* we might need to start a resync/recover */
+        MD_RECOVERY_REQUESTED,  /* user-space has requested a sync (used with SYNC) */
+        MD_RECOVERY_CHECK,      /* user-space request for check-only, no repair */
+        MD_RECOVERY_RESHAPE,    /* A reshape is happening */
+        MD_RECOVERY_FROZEN,     /* User request to abort, and not restart, any action */
+        MD_RECOVERY_ERROR,      /* sync-action interrupted because io-error */
+};
+
 static inline int __must_check mddev_lock(struct mddev *mddev)
 {
         return mutex_lock_interruptible(&mddev->reconfig_mutex);

@@ -623,7 +649,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
                            sector_t sector, int size, struct page *page);
-extern void md_super_wait(struct mddev *mddev);
+extern int md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
                         struct page *page, int op, int op_flags,
                         bool metadata_op);
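The md.h hunks above replace ad-hoc #define bit numbers with enums (mddev_flags, mddev_sb_flags, recovery_flags) while keeping the set_bit()/test_bit() call style, and split the superblock-update bits out into the new sb_flags word. A minimal user-space sketch of that flag-word idiom follows; set_bit()/test_and_clear_bit() here are simplified, non-atomic stand-ins for the kernel helpers, and the flag names are illustrative.

/* Minimal sketch of the enum-bit + unsigned-long flag-word idiom. */
#include <stdio.h>

enum sb_flags {                 /* bit numbers, as in enum mddev_sb_flags */
        SB_CHANGE_DEVS,
        SB_CHANGE_CLEAN,
        SB_CHANGE_PENDING,
        SB_NEED_REWRITE,
};

static void set_bit(int nr, unsigned long *addr)
{
        *addr |= 1UL << nr;
}

static int test_and_clear_bit(int nr, unsigned long *addr)
{
        int was_set = (*addr >> nr) & 1;

        *addr &= ~(1UL << nr);
        return was_set;
}

int main(void)
{
        unsigned long sb_flags = 0;     /* like mddev->sb_flags */

        set_bit(SB_CHANGE_DEVS, &sb_flags);
        set_bit(SB_CHANGE_CLEAN, &sb_flags);
        if (test_and_clear_bit(SB_CHANGE_DEVS, &sb_flags))
                printf("superblock update needed: device status changed\n");
        printf("remaining flags: %#lx\n", sb_flags);    /* prints 0x2 */
        return 0;
}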
drivers/md/multipath.c

@@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf)
         }
         rcu_read_unlock();

-        printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
+        pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
         return (-1);
 }

@@ -97,9 +97,9 @@ static void multipath_end_request(struct bio *bio)
                  */
                 char b[BDEVNAME_SIZE];
                 md_error (mp_bh->mddev, rdev);
-                printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
-                       bdevname(rdev->bdev,b),
-                       (unsigned long long)bio->bi_iter.bi_sector);
+                pr_info("multipath: %s: rescheduling sector %llu\n",
+                        bdevname(rdev->bdev,b),
+                        (unsigned long long)bio->bi_iter.bi_sector);
                 multipath_reschedule_retry(mp_bh);
         } else
                 multipath_end_bh_io(mp_bh, bio->bi_error);

@@ -194,8 +194,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
                  * first check if this is a queued request for a device
                  * which has just failed.
                  */
-                printk(KERN_ALERT
-                       "multipath: only one IO path left and IO error.\n");
+                pr_warn("multipath: only one IO path left and IO error.\n");
                 /* leave it active... it's all we have */
                 return;
         }

@@ -209,11 +208,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
                 spin_unlock_irqrestore(&conf->device_lock, flags);
         }
         set_bit(Faulty, &rdev->flags);
-        set_bit(MD_CHANGE_DEVS, &mddev->flags);
-        printk(KERN_ALERT "multipath: IO failure on %s,"
-               " disabling IO path.\n"
-               "multipath: Operation continuing"
-               " on %d IO paths.\n",
+        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+        pr_err("multipath: IO failure on %s, disabling IO path.\n"
+               "multipath: Operation continuing on %d IO paths.\n",
                bdevname(rdev->bdev, b),
                conf->raid_disks - mddev->degraded);
 }

@@ -223,21 +220,21 @@ static void print_multipath_conf (struct mpconf *conf)
         int i;
         struct multipath_info *tmp;

-        printk("MULTIPATH conf printout:\n");
+        pr_debug("MULTIPATH conf printout:\n");
         if (!conf) {
-                printk("(conf==NULL)\n");
+                pr_debug("(conf==NULL)\n");
                 return;
         }
-        printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
-               conf->raid_disks);
+        pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+                 conf->raid_disks);

         for (i = 0; i < conf->raid_disks; i++) {
                 char b[BDEVNAME_SIZE];
                 tmp = conf->multipaths + i;
                 if (tmp->rdev)
-                        printk(" disk%d, o:%d, dev:%s\n",
-                               i, !test_bit(Faulty, &tmp->rdev->flags),
-                               bdevname(tmp->rdev->bdev,b));
+                        pr_debug(" disk%d, o:%d, dev:%s\n",
+                                 i, !test_bit(Faulty, &tmp->rdev->flags),
+                                 bdevname(tmp->rdev->bdev,b));
         }
 }

@@ -292,8 +289,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
         if (rdev == p->rdev) {
                 if (test_bit(In_sync, &rdev->flags) ||
                     atomic_read(&rdev->nr_pending)) {
-                        printk(KERN_ERR "hot-remove-disk, slot %d is identified"
-                               " but is still operational!\n", number);
+                        pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
                         err = -EBUSY;
                         goto abort;
                 }

@@ -346,16 +342,14 @@ static void multipathd(struct md_thread *thread)
                 bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;

                 if ((mp_bh->path = multipath_map (conf))<0) {
-                        printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
-                               " error for block %llu\n",
-                               bdevname(bio->bi_bdev,b),
-                               (unsigned long long)bio->bi_iter.bi_sector);
+                        pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
+                               bdevname(bio->bi_bdev,b),
+                               (unsigned long long)bio->bi_iter.bi_sector);
                         multipath_end_bh_io(mp_bh, -EIO);
                 } else {
-                        printk(KERN_ERR "multipath: %s: redirecting sector %llu"
-                               " to another IO path\n",
-                               bdevname(bio->bi_bdev,b),
-                               (unsigned long long)bio->bi_iter.bi_sector);
+                        pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
+                               bdevname(bio->bi_bdev,b),
+                               (unsigned long long)bio->bi_iter.bi_sector);
                         *bio = *(mp_bh->master_bio);
                         bio->bi_iter.bi_sector +=
                                 conf->multipaths[mp_bh->path].rdev->data_offset;

@@ -389,8 +383,8 @@ static int multipath_run (struct mddev *mddev)
                 return -EINVAL;

         if (mddev->level != LEVEL_MULTIPATH) {
-                printk("multipath: %s: raid level not set to multipath IO (%d)\n",
-                       mdname(mddev), mddev->level);
+                pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
+                        mdname(mddev), mddev->level);
                 goto out;
         }
         /*

@@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev)
         conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
         mddev->private = conf;
-        if (!conf) {
-                printk(KERN_ERR
-                       "multipath: couldn't allocate memory for %s\n",
-                       mdname(mddev));
+        if (!conf)
                 goto out;
-        }

         conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
                                    GFP_KERNEL);
-        if (!conf->multipaths) {
-                printk(KERN_ERR
-                       "multipath: couldn't allocate memory for %s\n",
-                       mdname(mddev));
+        if (!conf->multipaths)
                 goto out_free_conf;
-        }

         working_disks = 0;
         rdev_for_each(rdev, mddev) {

@@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev)
         INIT_LIST_HEAD(&conf->retry_list);

         if (!working_disks) {
-                printk(KERN_ERR "multipath: no operational IO paths for %s\n",
+                pr_warn("multipath: no operational IO paths for %s\n",
                        mdname(mddev));
                 goto out_free_conf;
         }

@@ -447,27 +433,17 @@ static int multipath_run (struct mddev *mddev)
         conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
                                                  sizeof(struct multipath_bh));
-        if (conf->pool == NULL) {
-                printk(KERN_ERR
-                       "multipath: couldn't allocate memory for %s\n",
-                       mdname(mddev));
+        if (conf->pool == NULL)
                 goto out_free_conf;
-        }

-        {
-                mddev->thread = md_register_thread(multipathd, mddev,
-                                                   "multipath");
-                if (!mddev->thread) {
-                        printk(KERN_ERR "multipath: couldn't allocate thread"
-                               " for %s\n", mdname(mddev));
-                        goto out_free_conf;
-                }
-        }
+        mddev->thread = md_register_thread(multipathd, mddev, "multipath");
+        if (!mddev->thread)
+                goto out_free_conf;

-        printk(KERN_INFO
-               "multipath: array %s active with %d out of %d IO paths\n",
+        pr_info("multipath: array %s active with %d out of %d IO paths\n",
                 mdname(mddev), conf->raid_disks - mddev->degraded,
-               mddev->raid_disks);
+                mddev->raid_disks);
         /*
          * Ok, everything is just fine now
          */
drivers/md/raid0.c

@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "raid0.h"
 #include "raid5.h"

@@ -51,20 +52,21 @@ static void dump_zones(struct mddev *mddev)
         char b[BDEVNAME_SIZE];
         struct r0conf *conf = mddev->private;
         int raid_disks = conf->strip_zone[0].nb_dev;
-        printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n",
-               mdname(mddev),
-               conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
+        pr_debug("md: RAID0 configuration for %s - %d zone%s\n",
+                 mdname(mddev),
+                 conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
         for (j = 0; j < conf->nr_strip_zones; j++) {
-                printk(KERN_INFO "md: zone%d=[", j);
+                char line[200];
+                int len = 0;
+
                 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
-                        printk(KERN_CONT "%s%s", k?"/":"",
-                        bdevname(conf->devlist[j*raid_disks
-                                                + k]->bdev, b));
-                printk(KERN_CONT "]\n");
+                        len += snprintf(line+len, 200-len, "%s%s", k?"/":"",
+                                        bdevname(conf->devlist[j*raid_disks
+                                                               + k]->bdev, b));
+                pr_debug("md: zone%d=[%s]\n", j, line);

                 zone_size  = conf->strip_zone[j].zone_end - zone_start;
-                printk(KERN_INFO "      zone-offset=%10lluKB, "
-                                "device-offset=%10lluKB, size=%10lluKB\n",
+                pr_debug("      zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n",
                         (unsigned long long)zone_start>>1,
                         (unsigned long long)conf->strip_zone[j].dev_start>>1,
                         (unsigned long long)zone_size>>1);

@@ -142,9 +144,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
          * chunk size is a multiple of that sector size
          */
         if ((mddev->chunk_sectors << 9) % blksize) {
-                printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
-                       mdname(mddev),
-                       mddev->chunk_sectors << 9, blksize);
+                pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+                        mdname(mddev),
+                        mddev->chunk_sectors << 9, blksize);
                 err = -EINVAL;
                 goto abort;
         }

@@ -186,19 +188,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
         }
         if (j < 0) {
-                printk(KERN_ERR
-                       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
-                       mdname(mddev));
+                pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n",
+                        mdname(mddev));
                 goto abort;
         }
         if (j >= mddev->raid_disks) {
-                printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
-                       "aborting!\n", mdname(mddev), j);
+                pr_warn("md/raid0:%s: bad disk number %d - aborting!\n",
+                        mdname(mddev), j);
                 goto abort;
         }
         if (dev[j]) {
-                printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
-                       "aborting!\n", mdname(mddev), j);
+                pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n",
+                        mdname(mddev), j);
                 goto abort;
         }
         dev[j] = rdev1;

@@ -208,8 +209,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
                 cnt++;
         }
         if (cnt != mddev->raid_disks) {
-                printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
-                       "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
+                pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n",
+                        mdname(mddev), cnt, mddev->raid_disks);
                 goto abort;
         }
         zone->nb_dev = cnt;

@@ -357,8 +358,7 @@ static int raid0_run(struct mddev *mddev)
         int ret;

         if (mddev->chunk_sectors == 0) {
-                printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n",
-                       mdname(mddev));
+                pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev));
                 return -EINVAL;
         }
         if (md_check_no_bitmap(mddev))

@@ -399,9 +399,9 @@ static int raid0_run(struct mddev *mddev)
         /* calculate array device size */
         md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));

-        printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
-               mdname(mddev),
-               (unsigned long long)mddev->array_sectors);
+        pr_debug("md/raid0:%s: md_size is %llu sectors.\n",
+                 mdname(mddev),
+                 (unsigned long long)mddev->array_sectors);

         if (mddev->queue) {
                 /* calculate the max read-ahead size.

@@ -464,7 +464,8 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
         }

         do {
-                sector_t sector = bio->bi_iter.bi_sector;
+                sector_t bio_sector = bio->bi_iter.bi_sector;
+                sector_t sector = bio_sector;
                 unsigned chunk_sects = mddev->chunk_sectors;

                 unsigned sectors = chunk_sects -

@@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
                         : sector_div(sector, chunk_sects));

                 /* Restore due to sector_div */
-                sector = bio->bi_iter.bi_sector;
+                sector = bio_sector;

                 if (sectors < bio_sectors(bio)) {
                         split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);

@@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
                              !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
                         /* Just ignore it */
                         bio_endio(split);
-                } else
+                } else {
+                        if (mddev->gendisk)
+                                trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
+                                                      split, disk_devt(mddev->gendisk),
+                                                      bio_sector);
                         generic_make_request(split);
+                }
         } while (split != bio);
 }

@@ -509,17 +515,17 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
         struct r0conf *priv_conf;

         if (mddev->degraded != 1) {
-                printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
-                       mdname(mddev),
-                       mddev->degraded);
+                pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
+                        mdname(mddev),
+                        mddev->degraded);
                 return ERR_PTR(-EINVAL);
         }

         rdev_for_each(rdev, mddev) {
                 /* check slot number for a disk */
                 if (rdev->raid_disk == mddev->raid_disks-1) {
-                        printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
-                               mdname(mddev));
+                        pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n",
+                                mdname(mddev));
                         return ERR_PTR(-EINVAL);
                 }
                 rdev->sectors = mddev->dev_sectors;

@@ -533,8 +539,11 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
         mddev->delta_disks = -1;
         /* make sure it will be not marked as dirty */
         mddev->recovery_cp = MaxSector;
+        clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+        clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);

         create_strip_zones(mddev, &priv_conf);
+
         return priv_conf;
 }

@@ -549,19 +558,19 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
          *  - all mirrors must be already degraded
          */
         if (mddev->layout != ((1 << 8) + 2)) {
-                printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
-                       mdname(mddev),
-                       mddev->layout);
+                pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
+                        mdname(mddev),
+                        mddev->layout);
                 return ERR_PTR(-EINVAL);
         }
         if (mddev->raid_disks & 1) {
-                printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
-                       mdname(mddev));
+                pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
+                        mdname(mddev));
                 return ERR_PTR(-EINVAL);
         }
         if (mddev->degraded != (mddev->raid_disks>>1)) {
-                printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n",
-                       mdname(mddev));
+                pr_warn("md/raid0:%s: All mirrors must be already degraded!\n",
+                        mdname(mddev));
                 return ERR_PTR(-EINVAL);
         }

@@ -574,6 +583,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
         mddev->degraded = 0;
         /* make sure it will be not marked as dirty */
         mddev->recovery_cp = MaxSector;
+        clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

         create_strip_zones(mddev, &priv_conf);
         return priv_conf;

@@ -588,7 +598,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
          *  - (N - 1) mirror drives must be already faulty
          */
         if ((mddev->raid_disks - 1) != mddev->degraded) {
-                printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
+                pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
                        mdname(mddev));
                 return ERR_PTR(-EINVAL);
         }

@@ -616,6 +626,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
         mddev->raid_disks = 1;
         /* make sure it will be not marked as dirty */
         mddev->recovery_cp = MaxSector;
+        clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

         create_strip_zones(mddev, &priv_conf);
         return priv_conf;

@@ -631,8 +642,8 @@ static void *raid0_takeover(struct mddev *mddev)
          */

         if (mddev->bitmap) {
-                printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n",
-                       mdname(mddev));
+                pr_warn("md/raid0: %s: cannot takeover array with bitmap\n",
+                        mdname(mddev));
                 return ERR_PTR(-EBUSY);
         }
         if (mddev->level == 4)

@@ -642,8 +653,8 @@ static void *raid0_takeover(struct mddev *mddev)
                 if (mddev->layout == ALGORITHM_PARITY_N)
                         return raid0_takeover_raid45(mddev);

-                printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
-                       mdname(mddev), ALGORITHM_PARITY_N);
+                pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
+                        mdname(mddev), ALGORITHM_PARITY_N);
         }

         if (mddev->level == 10)

@@ -652,7 +663,7 @@ static void *raid0_takeover(struct mddev *mddev)
         if (mddev->level == 1)
                 return raid0_takeover_raid1(mddev);

-        printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n",
+        pr_warn("Takeover from raid%i to raid0 not supported\n",
                 mddev->level);

         return ERR_PTR(-EINVAL);
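dump_zones() above now assembles each zone's device list into a local buffer with snprintf() and emits it with a single pr_debug() call, instead of a chain of KERN_CONT continuations that could interleave with other messages. A stand-alone sketch of that accumulation idiom, with illustrative data:

/* Stand-alone sketch of the len += snprintf(line+len, size-len, ...)
 * accumulation idiom used in dump_zones(). */
#include <stdio.h>

int main(void)
{
        const char *devs[] = { "sda1", "sdb1", "sdc1" };
        char line[200];
        int len = 0;
        unsigned k;

        /* Guard on len so the remaining-size argument can never go
         * negative (snprintf returns the would-be length, so len can
         * grow past the buffer size on truncation). */
        for (k = 0; k < sizeof(devs)/sizeof(devs[0]) &&
                    len < (int)sizeof(line); k++)
                len += snprintf(line + len, sizeof(line) - len, "%s%s",
                                k ? "/" : "", devs[k]);
        printf("md: zone0=[%s]\n", line);     /* md: zone0=[sda1/sdb1/sdc1] */
        return 0;
}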
drivers/md/raid1.c

(Diff collapsed in this capture; not expanded.)
drivers/md/raid1.h

@@ -161,14 +161,15 @@ struct r1bio {
 };

 /* bits for r1bio.state */
-#define R1BIO_Uptodate  0
-#define R1BIO_IsSync    1
-#define R1BIO_Degraded  2
-#define R1BIO_BehindIO  3
+enum r1bio_state {
+        R1BIO_Uptodate,
+        R1BIO_IsSync,
+        R1BIO_Degraded,
+        R1BIO_BehindIO,
 /* Set ReadError on bios that experience a readerror so that
  * raid1d knows what to do with them.
  */
-#define R1BIO_ReadError 4
+        R1BIO_ReadError,
 /* For write-behind requests, we call bi_end_io when
  * the last non-write-behind device completes, providing
  * any write was successful.  Otherwise we call when

@@ -176,10 +177,12 @@ struct r1bio {
  * with failure when last write completes (and all failed).
  * Record that bi_end_io was called with this flag...
  */
-#define R1BIO_Returned 6
+        R1BIO_Returned,
 /* If a write for this request means we can clear some
  * known-bad-block records, we set this flag
  */
-#define R1BIO_MadeGood 7
-#define R1BIO_WriteError 8
+        R1BIO_MadeGood,
+        R1BIO_WriteError,
+        R1BIO_FailFast,
+};
 #endif
drivers/md/raid10.c

(Diff collapsed in this capture; not expanded.)
drivers/md/raid10.h

@@ -156,5 +156,7 @@ enum r10bio_state {
          * flag is set
          */
         R10BIO_Previous,
+/* failfast devices did receive failfast requests. */
+        R10BIO_FailFast,
 };
 #endif
drivers/md/raid5-cache.c

(Diff collapsed in this capture; not expanded.)
drivers/md/raid5.c

(Diff collapsed in this capture; not expanded.)
drivers/md/raid5.h

@@ -226,6 +226,8 @@ struct stripe_head {
         struct r5l_io_unit      *log_io;
         struct list_head        log_list;
+        sector_t                log_start; /* first meta block on the journal */
+        struct list_head        r5c; /* for r5c_cache->stripe_in_journal */
         /**
          * struct stripe_operations
          * @target - STRIPE_OP_COMPUTE_BLK target

@@ -264,6 +266,7 @@ struct stripe_head_state {
         int syncing, expanding, expanded, replacing;
         int locked, uptodate, to_read, to_write, failed, written;
         int to_fill, compute, req_compute, non_overwrite;
+        int injournal, just_cached;
         int failed_num[2];
         int p_failed, q_failed;
         int dec_preread_active;

@@ -273,6 +276,7 @@ struct stripe_head_state {
         struct md_rdev *blocked_rdev;
         int handle_bad_blocks;
         int log_failed;
+        int waiting_extra_page;
 };

 /* Flags for struct r5dev.flags */

@@ -313,6 +317,11 @@ enum r5dev_flags {
                          */
         R5_Discard,     /* Discard the stripe */
         R5_SkipCopy,    /* Don't copy data from bio to stripe cache */
+        R5_InJournal,   /* data being written is in the journal device.
+                         * if R5_InJournal is set for parity pd_idx, all the
+                         * data and parity being written are in the journal
+                         * device
+                         */
 };

 /*

@@ -345,7 +354,30 @@ enum {
         STRIPE_BITMAP_PENDING,  /* Being added to bitmap, don't add
                                  * to batch yet.
                                  */
-        STRIPE_LOG_TRAPPED, /* trapped into log */
+        STRIPE_LOG_TRAPPED,     /* trapped into log (see raid5-cache.c)
+                                 * this bit is used in two scenarios:
+                                 *
+                                 * 1. write-out phase
+                                 *  set in first entry of r5l_write_stripe
+                                 *  clear in second entry of r5l_write_stripe
+                                 *  used to bypass logic in handle_stripe
+                                 *
+                                 * 2. caching phase
+                                 *  set in r5c_try_caching_write()
+                                 *  clear when journal write is done
+                                 *  used to initiate r5c_cache_data()
+                                 *  also used to bypass logic in handle_stripe
+                                 */
+        STRIPE_R5C_CACHING,     /* the stripe is in caching phase
+                                 * see more detail in the raid5-cache.c
+                                 */
+        STRIPE_R5C_PARTIAL_STRIPE,      /* in r5c cache (to-be/being handled or
+                                         * in conf->r5c_partial_stripe_list)
+                                         */
+        STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or
+                                 * in conf->r5c_full_stripe_list)
+                                 */
+        STRIPE_R5C_PREFLUSH,    /* need to flush journal device */
 };

 #define STRIPE_EXPAND_SYNC_FLAGS \

@@ -408,8 +440,86 @@ enum {
 struct disk_info {
         struct md_rdev  *rdev, *replacement;
+        struct page     *extra_page; /* extra page to use in prexor */
 };

+/*
+ * Stripe cache
+ */
+
+#define NR_STRIPES              256
+#define STRIPE_SIZE             PAGE_SIZE
+#define STRIPE_SHIFT            (PAGE_SHIFT - 9)
+#define STRIPE_SECTORS          (STRIPE_SIZE>>9)
+#define IO_THRESHOLD            1
+#define BYPASS_THRESHOLD        1
+#define NR_HASH                 (PAGE_SIZE / sizeof(struct hlist_head))
+#define HASH_MASK               (NR_HASH - 1)
+#define MAX_STRIPE_BATCH        8
+
+/* bio's attached to a stripe+device for I/O are linked together in bi_sector
+ * order without overlap.  There may be several bio's per stripe+device, and
+ * a bio could span several devices.
+ * When walking this list for a particular stripe+device, we must never proceed
+ * beyond a bio that extends past this device, as the next bio might no longer
+ * be valid.
+ * This function is used to determine the 'next' bio in the list, given the
+ * sector of the current stripe+device
+ */
+static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
+{
+        int sectors = bio_sectors(bio);
+        if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
+                return bio->bi_next;
+        else
+                return NULL;
+}
+
+/*
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ */
+static inline int raid5_bi_processed_stripes(struct bio *bio)
+{
+        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+        return (atomic_read(segments) >> 16) & 0xffff;
+}
+
+static inline int raid5_dec_bi_active_stripes(struct bio *bio)
+{
+        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+        return atomic_sub_return(1, segments) & 0xffff;
+}
+
+static inline void raid5_inc_bi_active_stripes(struct bio *bio)
+{
+        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+        atomic_inc(segments);
+}
+
+static inline void raid5_set_bi_processed_stripes(struct bio *bio,
+                                                  unsigned int cnt)
+{
+        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+        int old, new;
+
+        do {
+                old = atomic_read(segments);
+                new = (old & 0xffff) | (cnt << 16);
+        } while (atomic_cmpxchg(segments, old, new) != old);
+}
+
+static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
+{
+        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+        atomic_set(segments, cnt);
+}
+
 /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
  * This is because we sometimes take all the spinlocks
  * and creating that much locking depth can cause

@@ -432,6 +542,30 @@ struct r5worker_group {
         int stripes_cnt;
 };

+enum r5_cache_state {
+        R5_INACTIVE_BLOCKED,    /* release of inactive stripes blocked,
+                                 * waiting for 25% to be free
+                                 */
+        R5_ALLOC_MORE,          /* It might help to allocate another
+                                 * stripe.
+                                 */
+        R5_DID_ALLOC,           /* A stripe was allocated, don't allocate
+                                 * more until at least one has been
+                                 * released.  This avoids flooding
+                                 * the cache.
+                                 */
+        R5C_LOG_TIGHT,          /* log device space tight, need to
+                                 * prioritize stripes at last_checkpoint
+                                 */
+        R5C_LOG_CRITICAL,       /* log device is running out of space,
+                                 * only process stripes that are already
+                                 * occupying the log
+                                 */
+        R5C_EXTRA_PAGE_IN_USE,  /* a stripe is using disk_info.extra_page
+                                 * for prexor
+                                 */
+};
+
 struct r5conf {
         struct hlist_head       *stripe_hashtbl;
         /* only protect corresponding hash list and inactive_list */

@@ -519,23 +653,18 @@ struct r5conf {
                                           */
         atomic_t                active_stripes;
         struct list_head        inactive_list[NR_STRIPE_HASH_LOCKS];
+
+        atomic_t                r5c_cached_full_stripes;
+        struct list_head        r5c_full_stripe_list;
+        atomic_t                r5c_cached_partial_stripes;
+        struct list_head        r5c_partial_stripe_list;
+
         atomic_t                empty_inactive_list_nr;
         struct llist_head       released_stripes;
         wait_queue_head_t       wait_for_quiescent;
         wait_queue_head_t       wait_for_stripe;
         wait_queue_head_t       wait_for_overlap;
         unsigned long           cache_state;
-#define R5_INACTIVE_BLOCKED     1       /* release of inactive stripes blocked,
-                                         * waiting for 25% to be free
-                                         */
-#define R5_ALLOC_MORE           2       /* It might help to allocate another
-                                         * stripe.
-                                         */
-#define R5_DID_ALLOC            4       /* A stripe was allocated, don't allocate
-                                         * more until at least one has been
-                                         * released.  This avoids flooding
-                                         * the cache.
-                                         */
         struct shrinker         shrinker;
         int                     pool_size; /* number of disks in stripeheads in pool */
         spinlock_t              device_lock;

@@ -633,4 +762,23 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
 extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
 extern void r5l_quiesce(struct r5l_log *log, int state);
 extern bool r5l_log_disk_error(struct r5conf *conf);
+extern bool r5c_is_writeback(struct r5l_log *log);
+extern int
+r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+                      struct stripe_head_state *s, int disks);
+extern void
+r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+                            struct stripe_head_state *s);
+extern void r5c_release_extra_page(struct stripe_head *sh);
+extern void r5c_use_extra_page(struct stripe_head *sh);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern void r5c_handle_cached_data_endio(struct r5conf *conf,
+                struct stripe_head *sh, int disks, struct bio_list *return_bi);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+                          struct stripe_head_state *s);
+extern void r5c_make_stripe_write_out(struct stripe_head *sh);
+extern void r5c_flush_cache(struct r5conf *conf, int num);
+extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
+extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_journal_mode;
 #endif
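The raid5_*_stripes() helpers moved into raid5.h above pack two 16-bit counters into the one 32-bit bi_phys_segments field: active stripes in the low half, processed stripes in the high half, updated through atomic operations. A user-space re-implementation of the same packing with C11 atomics, illustrative only; the function names mirror the kernel helpers but are not them.

/* User-space sketch of the two-counters-in-one-word scheme behind
 * the raid5 bi_phys_segments helpers. Illustrative only. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint segments;    /* low 16 bits: active, high 16: processed */

static unsigned processed_stripes(void)
{
        return (atomic_load(&segments) >> 16) & 0xffff;
}

static unsigned dec_active_stripes(void)
{
        /* subtracting 1 only touches the low half while it is non-zero */
        return (atomic_fetch_sub(&segments, 1) - 1) & 0xffff;
}

static void set_processed_stripes(unsigned cnt)
{
        unsigned old = atomic_load(&segments), val;

        do {    /* CAS loop: replace the high half, keep the low half */
                val = (old & 0xffff) | (cnt << 16);
        } while (!atomic_compare_exchange_weak(&segments, &old, val));
}

int main(void)
{
        atomic_store(&segments, 3);     /* three active stripes */
        set_processed_stripes(2);
        printf("processed=%u active-after-dec=%u\n",
               processed_stripes(), dec_active_stripes());  /* 2 and 2 */
        return 0;
}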
include/uapi/linux/raid/md_p.h

@@ -84,6 +84,10 @@
 #define MD_DISK_CANDIDATE       5 /* disk is added as spare (local) until confirmed
                                    * For clustered enviroments only.
                                    */
+#define MD_DISK_FAILFAST        10 /* Send REQ_FAILFAST if there are multiple
+                                    * devices available - and don't try to
+                                    * correct read errors.
+                                    */

 #define MD_DISK_WRITEMOSTLY     9 /* disk is "write-mostly" is RAID1 config.
                                    * read requests will only be sent here in

@@ -265,8 +269,9 @@ struct mdp_superblock_1 {
         __le32  dev_number;     /* permanent identifier of this  device - not role in raid */
         __le32  cnt_corrected_read; /* number of read errors that were corrected by re-writing */
         __u8    device_uuid[16]; /* user-space setable, ignored by kernel */
-        __u8    devflags;       /* per-device flags.  Only one defined...*/
+        __u8    devflags;       /* per-device flags.  Only two defined...*/
 #define WriteMostly1    1       /* mask for writemostly flag in above */
+#define FailFast1       2       /* Should avoid retries and fixups and just fail */

         /* Bad block log. If there are any bad blocks the feature flag is set.
          * If offset and size are non-zero, that space is reserved and available
          */
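Note that FailFast1 above is a mask within the single devflags byte of the v1 superblock, whereas MD_DISK_FAILFAST is a bit number in the disk state word. A hypothetical snippet showing how a tool might test the devflags mask; sb_stub is a stand-in, not the full mdp_superblock_1 layout.

/* Hypothetical check of the FailFast1 devflags mask. */
#include <stdio.h>

#define WriteMostly1    1       /* mask for writemostly flag */
#define FailFast1       2       /* avoid retries and fixups, just fail */

struct sb_stub { unsigned char devflags; };     /* stand-in struct */

int main(void)
{
        struct sb_stub sb = { .devflags = FailFast1 };

        if (sb.devflags & FailFast1)
                printf("device marked failfast\n");
        if (!(sb.devflags & WriteMostly1))
                printf("device is not write-mostly\n");
        return 0;
}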
lib/raid6/avx2.c

@@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
         kernel_fpu_end();
 }

+static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
+                                     size_t bytes, void **ptrs)
+{
+        u8 **dptr = (u8 **)ptrs;
+        u8 *p, *q;
+        int d, z, z0;
+
+        z0 = stop;              /* P/Q right side optimization */
+        p = dptr[disks-2];      /* XOR parity */
+        q = dptr[disks-1];      /* RS syndrome */
+
+        kernel_fpu_begin();
+
+        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
+
+        for (d = 0 ; d < bytes ; d += 32) {
+                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+                /* P/Q data pages */
+                for (z = z0-1 ; z >= start ; z--) {
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                }
+                /* P/Q left side optimization */
+                for (z = start-1 ; z >= 0 ; z--) {
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                }
+                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+                /* Don't use movntdq for r/w memory area < cache line */
+                asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+                asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+        }
+
+        asm volatile("sfence" : : : "memory");
+        kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx2x1 = {
         raid6_avx21_gen_syndrome,
-        NULL,                   /* XOR not yet implemented */
+        raid6_avx21_xor_syndrome,
         raid6_have_avx2,
         "avx2x1",
         1                       /* Has cache hints */

@@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
         kernel_fpu_end();
 }

+static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
+                                     size_t bytes, void **ptrs)
+{
+        u8 **dptr = (u8 **)ptrs;
+        u8 *p, *q;
+        int d, z, z0;
+
+        z0 = stop;              /* P/Q right side optimization */
+        p = dptr[disks-2];      /* XOR parity */
+        q = dptr[disks-1];      /* RS syndrome */
+
+        kernel_fpu_begin();
+
+        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
+
+        for (d = 0 ; d < bytes ; d += 64) {
+                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+                asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+                asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+                asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+                /* P/Q data pages */
+                for (z = z0-1 ; z >= start ; z--) {
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+                        asm volatile("vmovdqa %0,%%ymm7" :: "m" (dptr[z][d+32]));
+                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                }
+                /* P/Q left side optimization */
+                for (z = start-1 ; z >= 0 ; z--) {
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                }
+                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+                asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+                /* Don't use movntdq for r/w memory area < cache line */
+                asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+                asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
+                asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+                asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
+        }
+
+        asm volatile("sfence" : : : "memory");
+        kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx2x2 = {
         raid6_avx22_gen_syndrome,
-        NULL,                   /* XOR not yet implemented */
+        raid6_avx22_xor_syndrome,
         raid6_have_avx2,
         "avx2x2",
         1                       /* Has cache hints */

@@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
         kernel_fpu_end();
 }

+static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
+                                     size_t bytes, void **ptrs)
+{
+        u8 **dptr = (u8 **)ptrs;
+        u8 *p, *q;
+        int d, z, z0;
+
+        z0 = stop;              /* P/Q right side optimization */
+        p = dptr[disks-2];      /* XOR parity */
+        q = dptr[disks-1];      /* RS syndrome */
+
+        kernel_fpu_begin();
+
+        asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
+
+        for (d = 0 ; d < bytes ; d += 128) {
+                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+                asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+                asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
+                asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
+                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+                asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+                asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
+                asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
+                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+                asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+                asm volatile("vpxor %ymm12,%ymm10,%ymm10");
+                asm volatile("vpxor %ymm14,%ymm11,%ymm11");
+                /* P/Q data pages */
+                for (z = z0-1 ; z >= start ; z--) {
+                        asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
+                        asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+                        asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+                        asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+                        asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+                        asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
+                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
+                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+                        asm volatile("vmovdqa %0,%%ymm7" :: "m" (dptr[z][d+32]));
+                        asm volatile("vmovdqa %0,%%ymm13" :: "m" (dptr[z][d+64]));
+                        asm volatile("vmovdqa %0,%%ymm15" :: "m" (dptr[z][d+96]));
+                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+                        asm volatile("vpxor %ymm13,%ymm10,%ymm10");
+                        asm volatile("vpxor %ymm15,%ymm11,%ymm11");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+                }
+                asm volatile("prefetchnta %0" :: "m" (q[d]));
+                asm volatile("prefetchnta %0" :: "m" (q[d+64]));
+                /* P/Q left side optimization */
+                for (z = start-1 ; z >= 0 ; z--) {
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+                        asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+                        asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+                        asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+                        asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
+                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
+                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+                }
+                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
+                asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
+                asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
+                asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
+                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+                asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+                asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
+                asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
+                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
+                asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
+                asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
+                asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
+        }
+
+        asm volatile("sfence" : : : "memory");
+        kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx2x4 = {
         raid6_avx24_gen_syndrome,
-        NULL,                   /* XOR not yet implemented */
+        raid6_avx24_xor_syndrome,
         raid6_have_avx2,
         "avx2x4",
         1                       /* Has cache hints */
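In the xor_syndrome routines above, the vpcmpgtb/vpaddb/vpand/vpxor sequence multiplies each byte of the Q accumulator by 2 in GF(2^8) with the RAID-6 polynomial 0x1d: vpcmpgtb against zero yields 0xff for bytes whose top bit is set (bytes compare as signed), vpaddb doubles each byte, and the masked XOR folds in the reduction constant. A scalar C sketch of the same step, with an illustrative one-byte example:

/* Scalar equivalent of the AVX2 GF(2^8) multiply-by-2 step used in
 * the xor_syndrome loops (RAID-6 polynomial 0x1d). */
#include <stdint.h>
#include <stdio.h>

static uint8_t gf2_mul2(uint8_t x)
{
        uint8_t mask = (x & 0x80) ? 0xff : 0x00;  /* vpcmpgtb: byte sign */
        return (uint8_t)(x << 1) ^ (mask & 0x1d); /* vpaddb + vpand + vpxor */
}

int main(void)
{
        /* One Q-syndrome update: Q = 2*Q ^ data, as in the inner loop. */
        uint8_t q = 0x90, data = 0x0f;

        q = gf2_mul2(q) ^ data;
        printf("q = 0x%02x\n", q);   /* 0x90 -> 0x3d, ^0x0f -> 0x32 */
        return 0;
}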