Merge branch 'for-linus2' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe: "Round 2 of this. I cut back to the bare necessities, the patch is still larger than it usually would be at this time, due to the number of NVMe fixes in there. This pull request contains: - The 4 core fixes from Ming, that fix both problems with exceeding the virtual boundary limit in case of merging, and the gap checking for cloned bio's. - NVMe fixes from Keith and Christoph: - Regression on larger user commands, causing problems with reading log pages (for instance). This touches both NVMe, and the block core since that is now generally utilized also for these types of commands. - Hot removal fixes. - User exploitable issue with passthrough IO commands, if !length is given, causing us to fault on writing to the zero page. - Fix for a hang under error conditions - And finally, the current series regression for umount with cgroup writeback, where the final flush would happen async and hence open up window after umount where the device wasn't consistent. fsck right after umount would show this. From Tejun" * 'for-linus2' of git://git.kernel.dk/linux-block: block: support large requests in blk_rq_map_user_iov block: fix blk_rq_get_max_sectors for driver private requests nvme: fix max_segments integer truncation nvme: set queue limits for the admin queue writeback: flush inode cgroup wb switches instead of pinning super_block NVMe: Fix 0-length integrity payload NVMe: Don't allow unsupported flags NVMe: Move error handling to failed reset handler NVMe: Simplify device reset failure NVMe: Fix namespace removal deadlock NVMe: Use IDA for namespace disk naming NVMe: Don't unmap controller registers on reset block: merge: get the 1st and last bvec via helpers block: get the 1st and last bvec via helpers block: check virt boundary in bio_will_gap() block: bio: introduce helpers to get the 1st and last bvec

Merge branch 'for-linus2' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe: "Round 2 of this. I cut back to the bare necessities, the patch is still larger than it usually would be at this time, due to the number of NVMe fixes in there. This pull request contains: - The 4 core fixes from Ming, that fix both problems with exceeding the virtual boundary limit in case of merging, and the gap checking for cloned bio's. - NVMe fixes from Keith and Christoph: - Regression on larger user commands, causing problems with reading log pages (for instance). This touches both NVMe, and the block core since that is now generally utilized also for these types of commands. - Hot removal fixes. - User exploitable issue with passthrough IO commands, if !length is given, causing us to fault on writing to the zero page. - Fix for a hang under error conditions - And finally, the current series regression for umount with cgroup writeback, where the final flush would happen async and hence open up window after umount where the device wasn't consistent. fsck right after umount would show this. From Tejun" * 'for-linus2' of git://git.kernel.dk/linux-block: block: support large requests in blk_rq_map_user_iov block: fix blk_rq_get_max_sectors for driver private requests nvme: fix max_segments integer truncation nvme: set queue limits for the admin queue writeback: flush inode cgroup wb switches instead of pinning super_block NVMe: Fix 0-length integrity payload NVMe: Don't allow unsupported flags NVMe: Move error handling to failed reset handler NVMe: Simplify device reset failure NVMe: Fix namespace removal deadlock NVMe: Use IDA for namespace disk naming NVMe: Don't unmap controller registers on reset block: merge: get the 1st and last bvec via helpers block: get the 1st and last bvec via helpers block: check virt boundary in bio_will_gap() block: bio: introduce helpers to get the 1st and last bvec
e5322c54 · Linus Torvalds · bdf9d297 · 4d6af73d · e5322c54 · e5322c54
10 changed file
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio)
 	return ret;
 }

+static int __blk_rq_map_user_iov(struct request *rq,
+		struct rq_map_data *map_data, struct iov_iter *iter,
+		gfp_t gfp_mask, bool copy)
+{
+	struct request_queue *q = rq->q;
+	struct bio *bio, *orig_bio;
+	int ret;
+
+	if (copy)
+		bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
+	else
+		bio = bio_map_user_iov(q, iter, gfp_mask);
+
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	if (map_data && map_data->null_mapped)
+		bio_set_flag(bio, BIO_NULL_MAPPED);
+
+	iov_iter_advance(iter, bio->bi_iter.bi_size);
+	if (map_data)
+		map_data->offset += bio->bi_iter.bi_size;
+
+	orig_bio = bio;
+	blk_queue_bounce(q, &bio);
+
+	/*
+	 * We link the bounce buffer in and could have to traverse it
+	 * later so we have to get a ref to prevent it from being freed
+	 */
+	bio_get(bio);
+
+	ret = blk_rq_append_bio(q, rq, bio);
+	if (ret) {
+		bio_endio(bio);
+		__blk_rq_unmap_user(orig_bio);
+		bio_put(bio);
+		return ret;
+	}
+
+	return 0;
+}
+
 /**
 * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
 * @q:		request queue where request should be inserted
@@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 			struct rq_map_data *map_data,
 			const struct iov_iter *iter, gfp_t gfp_mask)
 {
-	struct bio *bio;
-	int unaligned = 0;
-	struct iov_iter i;
 	struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
+	bool copy = (q->dma_pad_mask & iter->count) || map_data;
+	struct bio *bio = NULL;
+	struct iov_iter i;
+	int ret;

 	if (!iter || !iter->count)
 		return -EINVAL;
@@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		 */
 		if ((uaddr & queue_dma_alignment(q)) ||
 		    iovec_gap_to_prv(q, &prv, &iov))
-			unaligned = 1;
+			copy = true;

 		prv.iov_base = iov.iov_base;
 		prv.iov_len = iov.iov_len;
 	}

-	if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
-		bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
-	else
-		bio = bio_map_user_iov(q, iter, gfp_mask);
-
-	if (IS_ERR(bio))
-		return PTR_ERR(bio);
-
-	if (map_data && map_data->null_mapped)
-		bio_set_flag(bio, BIO_NULL_MAPPED);
-
-	if (bio->bi_iter.bi_size != iter->count) {
-		/*
-		 * Grab an extra reference to this bio, as bio_unmap_user()
-		 * expects to be able to drop it twice as it happens on the
-		 * normal IO completion path
-		 */
-		bio_get(bio);
-		bio_endio(bio);
-		__blk_rq_unmap_user(bio);
-		return -EINVAL;
-	}
+	i = *iter;
+	do {
+		ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
+		if (ret)
+			goto unmap_rq;
+		if (!bio)
+			bio = rq->bio;
+	} while (iov_iter_count(&i));

 	if (!bio_flagged(bio, BIO_USER_MAPPED))
 		rq->cmd_flags |= REQ_COPY_USER;
-
-	blk_queue_bounce(q, &bio);
-	bio_get(bio);
-	blk_rq_bio_prep(q, rq, bio);
 	return 0;
+
+unmap_rq:
+	__blk_rq_unmap_user(bio);
+	rq->bio = NULL;
+	return -EINVAL;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);


--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -304,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 				   struct bio *nxt)
 {
 	struct bio_vec end_bv = { NULL }, nxt_bv;
-	struct bvec_iter iter;

 	if (!blk_queue_cluster(q))
 		return 0;
@@ -316,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 	if (!bio_has_data(bio))
 		return 1;

-	bio_for_each_segment(end_bv, bio, iter)
-		if (end_bv.bv_len == iter.bi_size)
-			break;
-
-	nxt_bv = bio_iovec(nxt);
+	bio_get_last_bvec(bio, &end_bv);
+	bio_get_first_bvec(nxt, &nxt_bv);

 	if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
 		return 0;

--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -55,8 +55,9 @@ static void nvme_free_ns(struct kref *kref)
 	ns->disk->private_data = NULL;
 	spin_unlock(&dev_list_lock);

-	nvme_put_ctrl(ns->ctrl);
 	put_disk(ns->disk);
+	ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
+	nvme_put_ctrl(ns->ctrl);
 	kfree(ns);
 }

@@ -183,7 +184,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
 			goto out_unmap;
 		}

-		if (meta_buffer) {
+		if (meta_buffer && meta_len) {
 			struct bio_integrity_payload *bip;

 			meta = kmalloc(meta_len, GFP_KERNEL);
@@ -373,6 +374,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)

 	if (copy_from_user(&io, uio, sizeof(io)))
 		return -EFAULT;
+	if (io.flags)
+		return -EINVAL;

 	switch (io.opcode) {
 	case nvme_cmd_write:
@@ -424,6 +427,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		return -EACCES;
 	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
 		return -EFAULT;
+	if (cmd.flags)
+		return -EINVAL;

 	memset(&c, 0, sizeof(c));
 	c.common.opcode = cmd.opcode;
@@ -556,6 +561,10 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	u16 old_ms;
 	unsigned short bs;

+	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
+		set_capacity(disk, 0);
+		return -ENODEV;
+	}
 	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
 		dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
 				__func__, ns->ctrl->instance, ns->ns_id);
@@ -831,6 +840,23 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
 	return ret;
 }

+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
+		struct request_queue *q)
+{
+	if (ctrl->max_hw_sectors) {
+		u32 max_segments =
+			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
+
+		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
+		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+	}
+	if (ctrl->stripe_size)
+		blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9);
+	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+		blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+	blk_queue_virt_boundary(q, ctrl->page_size - 1);
+}
+
 /*
 * Initialize the cached copies of the Identify data and various controller
 * register in our nvme_ctrl structure.  This should be called as soon as
@@ -888,6 +914,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		}
 	}

+	nvme_set_queue_limits(ctrl, ctrl->admin_q);
+
 	kfree(id);
 	return 0;
 }
@@ -1118,9 +1146,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	if (!ns)
 		return;

+	ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
+	if (ns->instance < 0)
+		goto out_free_ns;
+
 	ns->queue = blk_mq_init_queue(ctrl->tagset);
 	if (IS_ERR(ns->queue))
-		goto out_free_ns;
+		goto out_release_instance;
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
 	ns->queue->queuedata = ns;
 	ns->ctrl = ctrl;
@@ -1134,17 +1166,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	ns->disk = disk;
 	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */

+
 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
-	if (ctrl->max_hw_sectors) {
-		blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
-		blk_queue_max_segments(ns->queue,
-			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
-	}
-	if (ctrl->stripe_size)
-		blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
-	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
-		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
-	blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
+	nvme_set_queue_limits(ctrl, ns->queue);

 	disk->major = nvme_major;
 	disk->first_minor = 0;
@@ -1153,7 +1177,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	disk->queue = ns->queue;
 	disk->driverfs_dev = ctrl->device;
 	disk->flags = GENHD_FL_EXT_DEVT;
-	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
+	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);

 	if (nvme_revalidate_disk(ns->disk))
 		goto out_free_disk;
@@ -1173,40 +1197,29 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	kfree(disk);
 out_free_queue:
 	blk_cleanup_queue(ns->queue);
+ out_release_instance:
+	ida_simple_remove(&ctrl->ns_ida, ns->instance);
 out_free_ns:
 	kfree(ns);
 }

 static void nvme_ns_remove(struct nvme_ns *ns)
 {
-	bool kill = nvme_io_incapable(ns->ctrl) &&
-			!blk_queue_dying(ns->queue);
-
-	lockdep_assert_held(&ns->ctrl->namespaces_mutex);
-
-	if (kill) {
-		blk_set_queue_dying(ns->queue);
+	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
+		return;

-		/*
-		 * The controller was shutdown first if we got here through
-		 * device removal. The shutdown may requeue outstanding
-		 * requests. These need to be aborted immediately so
-		 * del_gendisk doesn't block indefinitely for their completion.
-		 */
-		blk_mq_abort_requeue_list(ns->queue);
-	}
 	if (ns->disk->flags & GENHD_FL_UP) {
 		if (blk_get_integrity(ns->disk))
 			blk_integrity_unregister(ns->disk);
 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_attr_group);
 		del_gendisk(ns->disk);
-	}
-	if (kill || !blk_queue_dying(ns->queue)) {
 		blk_mq_abort_requeue_list(ns->queue);
 		blk_cleanup_queue(ns->queue);
 	}
+	mutex_lock(&ns->ctrl->namespaces_mutex);
 	list_del_init(&ns->list);
+	mutex_unlock(&ns->ctrl->namespaces_mutex);
 	nvme_put_ns(ns);
 }

@@ -1300,10 +1313,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns, *next;

-	mutex_lock(&ctrl->namespaces_mutex);
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
 		nvme_ns_remove(ns);
-	mutex_unlock(&ctrl->namespaces_mutex);
 }

 static DEFINE_IDA(nvme_instance_ida);
@@ -1350,6 +1361,7 @@ static void nvme_free_ctrl(struct kref *kref)

 	put_device(ctrl->device);
 	nvme_release_instance(ctrl);
+	ida_destroy(&ctrl->ns_ida);

 	ctrl->ops->free_ctrl(ctrl);
 }
@@ -1390,6 +1402,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	}
 	get_device(ctrl->device);
 	dev_set_drvdata(ctrl->device, ctrl);
+	ida_init(&ctrl->ns_ida);

 	spin_lock(&dev_list_lock);
 	list_add_tail(&ctrl->node, &nvme_ctrl_list);
@@ -1402,6 +1415,38 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	return ret;
 }

+/**
+ * nvme_kill_queues(): Ends all namespace queues
+ * @ctrl: the dead controller that needs to end
+ *
+ * Call this function when the driver determines it is unable to get the
+ * controller in a state capable of servicing IO.
+ */
+void nvme_kill_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->namespaces_mutex);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		if (!kref_get_unless_zero(&ns->kref))
+			continue;
+
+		/*
+		 * Revalidating a dead namespace sets capacity to 0. This will
+		 * end buffered writers dirtying pages that can't be synced.
+		 */
+		if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+			revalidate_disk(ns->disk);
+
+		blk_set_queue_dying(ns->queue);
+		blk_mq_abort_requeue_list(ns->queue);
+		blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+		nvme_put_ns(ns);
+	}
+	mutex_unlock(&ctrl->namespaces_mutex);
+}
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;

--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -72,6 +72,7 @@ struct nvme_ctrl {
 	struct mutex namespaces_mutex;
 	struct device *device;	/* char device */
 	struct list_head node;
+	struct ida ns_ida;

 	char name[12];
 	char serial[20];
@@ -102,6 +103,7 @@ struct nvme_ns {
 	struct request_queue *queue;
 	struct gendisk *disk;
 	struct kref kref;
+	int instance;

 	u8 eui[8];
 	u8 uuid[16];
@@ -112,6 +114,11 @@ struct nvme_ns {
 	bool ext;
 	u8 pi_type;
 	int type;
+	unsigned long flags;
+
+#define NVME_NS_REMOVING 0
+#define NVME_NS_DEAD     1
+
 	u64 mode_select_num_blocks;
 	u32 mode_select_block_len;
 };
@@ -240,6 +247,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl);

 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_kill_queues(struct nvme_ctrl *ctrl);

 struct request *nvme_alloc_request(struct request_queue *q,
 		struct nvme_command *cmd, unsigned int flags);

--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -86,7 +86,6 @@ struct nvme_queue;

 static int nvme_reset(struct nvme_dev *dev);
 static void nvme_process_cq(struct nvme_queue *nvmeq);
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);

 /*
@@ -120,6 +119,7 @@ struct nvme_dev {
 	unsigned long flags;

 #define NVME_CTRL_RESETTING    0
+#define NVME_CTRL_REMOVING     1

 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
@@ -286,6 +286,17 @@ static int nvme_init_request(void *data, struct request *req,
 	return 0;
 }

+static void nvme_queue_scan(struct nvme_dev *dev)
+{
+	/*
+	 * Do not queue new scan work when a controller is reset during
+	 * removal.
+	 */
+	if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
+		return;
+	queue_work(nvme_workq, &dev->scan_work);
+}
+
 static void nvme_complete_async_event(struct nvme_dev *dev,
 		struct nvme_completion *cqe)
 {
@@ -300,7 +311,7 @@ static void nvme_complete_async_event(struct nvme_dev *dev,
 	switch (result & 0xff07) {
 	case NVME_AER_NOTICE_NS_CHANGED:
 		dev_info(dev->dev, "rescanning\n");
-		queue_work(nvme_workq, &dev->scan_work);
+		nvme_queue_scan(dev);
 	default:
 		dev_warn(dev->dev, "async event result %08x\n", result);
 	}
@@ -679,7 +690,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,

 	spin_lock_irq(&nvmeq->q_lock);
 	if (unlikely(nvmeq->cq_vector < 0)) {
-		ret = BLK_MQ_RQ_QUEUE_BUSY;
+		if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
+			ret = BLK_MQ_RQ_QUEUE_BUSY;
+		else
+			ret = BLK_MQ_RQ_QUEUE_ERROR;
 		spin_unlock_irq(&nvmeq->q_lock);
 		goto out;
 	}
@@ -1250,6 +1264,12 @@ static struct blk_mq_ops nvme_mq_ops = {
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
 {
 	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+		/*
+		 * If the controller was reset during removal, it's possible
+		 * user requests may be waiting on a stopped queue. Start the
+		 * queue to flush these to completion.
+		 */
+		blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
 		blk_cleanup_queue(dev->ctrl.admin_q);
 		blk_mq_free_tag_set(&dev->admin_tagset);
 	}
@@ -1690,14 +1710,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
 			return 0;
 		dev->ctrl.tagset = &dev->tagset;
 	}
-	queue_work(nvme_workq, &dev->scan_work);
+	nvme_queue_scan(dev);
 	return 0;
 }

-static int nvme_dev_map(struct nvme_dev *dev)
+static int nvme_pci_enable(struct nvme_dev *dev)
 {
 	u64 cap;
-	int bars, result = -ENOMEM;
+	int result = -ENOMEM;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);

 	if (pci_enable_device_mem(pdev))
@@ -1705,24 +1725,14 @@ static int nvme_dev_map(struct nvme_dev *dev)

 	dev->entry[0].vector = pdev->irq;
 	pci_set_master(pdev);
-	bars = pci_select_bars(pdev, IORESOURCE_MEM);
-	if (!bars)
-		goto disable_pci;
-
-	if (pci_request_selected_regions(pdev, bars, "nvme"))
-		goto disable_pci;

 	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
 	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
 		goto disable;

-	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
-	if (!dev->bar)
-		goto disable;
-
 	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
 		result = -ENODEV;
-		goto unmap;
+		goto disable;
 	}

 	/*
@@ -1732,7 +1742,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	if (!pdev->irq) {
 		result = pci_enable_msix(pdev, dev->entry, 1);
 		if (result < 0)
-			goto unmap;
+			goto disable;
 	}

 	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
@@ -1759,17 +1769,19 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	pci_save_state(pdev);
 	return 0;

- unmap:
-	iounmap(dev->bar);
-	dev->bar = NULL;
 disable:
-	pci_release_regions(pdev);
- disable_pci:
 	pci_disable_device(pdev);
 	return result;
 }

 static void nvme_dev_unmap(struct nvme_dev *dev)
+{
+	if (dev->bar)
+		iounmap(dev->bar);
+	pci_release_regions(to_pci_dev(dev->dev));
+}
+
+static void nvme_pci_disable(struct nvme_dev *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev->dev);

@@ -1778,12 +1790,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
 	else if (pdev->msix_enabled)
 		pci_disable_msix(pdev);

-	if (dev->bar) {
-		iounmap(dev->bar);
-		dev->bar = NULL;
-		pci_release_regions(pdev);
-	}
-
 	if (pci_is_enabled(pdev)) {
 		pci_disable_pcie_error_reporting(pdev);
 		pci_disable_device(pdev);
@@ -1842,7 +1848,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	nvme_dev_list_remove(dev);

 	mutex_lock(&dev->shutdown_lock);
-	if (dev->bar) {
+	if (pci_is_enabled(to_pci_dev(dev->dev))) {
 		nvme_stop_queues(&dev->ctrl);
 		csts = readl(dev->bar + NVME_REG_CSTS);
 	}
@@ -1855,7 +1861,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 		nvme_disable_io_queues(dev);
 		nvme_disable_admin_queue(dev, shutdown);
 	}
-	nvme_dev_unmap(dev);
+	nvme_pci_disable(dev);

 	for (i = dev->queue_count - 1; i >= 0; i--)
 		nvme_clear_queue(dev->queues[i]);
@@ -1899,10 +1905,20 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }

+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+	dev_warn(dev->dev, "Removing after probe failure status: %d\n", status);
+
+	kref_get(&dev->ctrl.kref);
+	nvme_dev_disable(dev, false);
+	if (!schedule_work(&dev->remove_work))
+		nvme_put_ctrl(&dev->ctrl);
+}
+
 static void nvme_reset_work(struct work_struct *work)
 {
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
-	int result;
+	int result = -ENODEV;

 	if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
 		goto out;
@@ -1911,37 +1927,37 @@ static void nvme_reset_work(struct work_struct *work)
 	 * If we're called to reset a live controller first shut it down before
 	 * moving on.
 	 */
-	if (dev->bar)
+	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);

 	set_bit(NVME_CTRL_RESETTING, &dev->flags);

-	result = nvme_dev_map(dev);
+	result = nvme_pci_enable(dev);
 	if (result)
 		goto out;

 	result = nvme_configure_admin_queue(dev);
 	if (result)
-		goto unmap;
+		goto out;

 	nvme_init_queue(dev->queues[0], 0);
 	result = nvme_alloc_admin_tags(dev);
 	if (result)
-		goto disable;
+		goto out;

 	result = nvme_init_identify(&dev->ctrl);
 	if (result)
-		goto free_tags;
+		goto out;

 	result = nvme_setup_io_queues(dev);
 	if (result)
-		goto free_tags;
+		goto out;

 	dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;

 	result = nvme_dev_list_add(dev);
 	if (result)
-		goto remove;
+		goto out;

 	/*
 	 * Keep the controller around but remove all namespaces if we don't have
@@ -1958,19 +1974,8 @@ static void nvme_reset_work(struct work_struct *work)
 	clear_bit(NVME_CTRL_RESETTING, &dev->flags);
 	return;

- remove:
-	nvme_dev_list_remove(dev);
- free_tags:
-	nvme_dev_remove_admin(dev);
-	blk_put_queue(dev->ctrl.admin_q);
-	dev->ctrl.admin_q = NULL;
-	dev->queues[0]->tags = NULL;
- disable:
-	nvme_disable_admin_queue(dev, false);
- unmap:
-	nvme_dev_unmap(dev);
 out:
-	nvme_remove_dead_ctrl(dev);
+	nvme_remove_dead_ctrl(dev, result);
 }

 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -1978,19 +1983,12 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
 	struct pci_dev *pdev = to_pci_dev(dev->dev);

+	nvme_kill_queues(&dev->ctrl);
 	if (pci_get_drvdata(pdev))
 		pci_stop_and_remove_bus_device_locked(pdev);
 	nvme_put_ctrl(&dev->ctrl);
 }

-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
-	dev_warn(dev->dev, "Removing after probe failure\n");
-	kref_get(&dev->ctrl.kref);
-	if (!schedule_work(&dev->remove_work))
-		nvme_put_ctrl(&dev->ctrl);
-}
-
 static int nvme_reset(struct nvme_dev *dev)
 {
 	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
@@ -2042,6 +2040,27 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.free_ctrl		= nvme_pci_free_ctrl,
 };

+static int nvme_dev_map(struct nvme_dev *dev)
+{
+	int bars;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	bars = pci_select_bars(pdev, IORESOURCE_MEM);
+	if (!bars)
+		return -ENODEV;
+	if (pci_request_selected_regions(pdev, bars, "nvme"))
+		return -ENODEV;
+
+	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+	if (!dev->bar)
+		goto release;
+
+       return 0;
+  release:
+       pci_release_regions(pdev);
+       return -ENODEV;
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	int node, result = -ENOMEM;
@@ -2066,6 +2085,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	dev->dev = get_device(&pdev->dev);
 	pci_set_drvdata(pdev, dev);

+	result = nvme_dev_map(dev);
+	if (result)
+		goto free;
+
 	INIT_LIST_HEAD(&dev->node);
 	INIT_WORK(&dev->scan_work, nvme_dev_scan);
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
@@ -2089,6 +2112,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	nvme_release_prp_pools(dev);
 put_pci:
 	put_device(dev->dev);
+	nvme_dev_unmap(dev);
 free:
 	kfree(dev->queues);
 	kfree(dev->entry);
@@ -2112,10 +2136,16 @@ static void nvme_shutdown(struct pci_dev *pdev)
 	nvme_dev_disable(dev, true);
 }

+/*
+ * The driver's remove may be called on a device in a partially initialized
+ * state. This function must not have any dependencies on the device state in
+ * order to proceed.
+ */
 static void nvme_remove(struct pci_dev *pdev)
 {
 	struct nvme_dev *dev = pci_get_drvdata(pdev);

+	set_bit(NVME_CTRL_REMOVING, &dev->flags);
 	pci_set_drvdata(pdev, NULL);
 	flush_work(&dev->scan_work);
 	nvme_remove_namespaces(&dev->ctrl);
@@ -2126,6 +2156,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_free_queues(dev, 0);
 	nvme_release_cmb(dev);
 	nvme_release_prp_pools(dev);
+	nvme_dev_unmap(dev);
 	nvme_put_ctrl(&dev->ctrl);
 }


--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 #define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
 					/* one round can affect upto 5 slots */

+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
-	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
@@ -424,8 +426,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	wb_put(new_wb);

 	iput(inode);
-	deactivate_super(sb);
 	kfree(isw);
+
+	atomic_dec(&isw_nr_in_flight);
 }

 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)

 	/* needs to grab bh-unsafe locks, bounce to work item */
 	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
-	schedule_work(&isw->work);
+	queue_work(isw_wq, &isw->work);
 }

 /**
@@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)

 	/* while holding I_WB_SWITCH, no one else can update the association */
 	spin_lock(&inode->i_lock);
-
-	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
-	    inode_to_wb(inode) == isw->new_wb)
-		goto out_unlock;
-
-	if (!atomic_inc_not_zero(&inode->i_sb->s_active))
-		goto out_unlock;
-
+	if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+	    inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
 	inode->i_state |= I_WB_SWITCH;
 	spin_unlock(&inode->i_lock);

 	ihold(inode);
 	isw->inode = inode;

+	atomic_inc(&isw_nr_in_flight);
+
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
 	 * the RCU protected stat update paths to grab the mapping's
@@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
 	return;

-out_unlock:
-	spin_unlock(&inode->i_lock);
 out_free:
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
@@ -847,6 +848,33 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 		wb_put(last_wb);
 }

+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches.  An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished.  As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+	if (atomic_read(&isw_nr_in_flight)) {
+		synchronize_rcu();
+		flush_workqueue(isw_wq);
+	}
+}
+
+static int __init cgroup_writeback_init(void)
+{
+	isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+	if (!isw_wq)
+		return -ENOMEM;
+	return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
 #else	/* CONFIG_CGROUP_WRITEBACK */

 static struct bdi_writeback *

--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
 		sb->s_flags &= ~MS_ACTIVE;

 		fsnotify_unmount_inodes(sb);
+		cgroup_writeback_umount();

 		evict_inodes(sb);


--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -310,6 +310,43 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
 	bio->bi_flags &= ~(1U << bit);
 }

+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+	*bv = bio_iovec(bio);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+	struct bvec_iter iter = bio->bi_iter;
+	int idx;
+
+	if (!bio_flagged(bio, BIO_CLONED)) {
+		*bv = bio->bi_io_vec[bio->bi_vcnt - 1];
+		return;
+	}
+
+	if (unlikely(!bio_multiple_segments(bio))) {
+		*bv = bio_iovec(bio);
+		return;
+	}
+
+	bio_advance_iter(bio, &iter, iter.bi_size);
+
+	if (!iter.bi_bvec_done)
+		idx = iter.bi_idx - 1;
+	else	/* in the middle of bvec */
+		idx = iter.bi_idx;
+
+	*bv = bio->bi_io_vec[idx];
+
+	/*
+	 * iter.bi_bvec_done records actual length of the last bvec
+	 * if this bio ends in the middle of one io vector
+	 */
+	if (iter.bi_bvec_done)
+		bv->bv_len = iter.bi_bvec_done;
+}
+
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
 	BIP_MAPPED_INTEGRITY	= 1 << 1, /* ref tag has been remapped */

--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -895,7 +895,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
 {
 	struct request_queue *q = rq->q;

-	if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
+	if (unlikely(rq->cmd_type != REQ_TYPE_FS))
 		return q->limits.max_hw_sectors;

 	if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD))
@@ -1372,6 +1372,13 @@ static inline void put_dev_sector(Sector p)
 	page_cache_release(p.v);
 }

+static inline bool __bvec_gap_to_prev(struct request_queue *q,
+				struct bio_vec *bprv, unsigned int offset)
+{
+	return offset ||
+		((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+}
+
 /*
 * Check if adding a bio_vec after bprv with offset would create a gap in
 * the SG list. Most drivers don't care about this, but some do.
@@ -1381,18 +1388,22 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
 {
 	if (!queue_virt_boundary(q))
 		return false;
-	return offset ||
-		((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+	return __bvec_gap_to_prev(q, bprv, offset);
 }

 static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
 			 struct bio *next)
 {
-	if (!bio_has_data(prev))
-		return false;
+	if (bio_has_data(prev) && queue_virt_boundary(q)) {
+		struct bio_vec pb, nb;
+
+		bio_get_last_bvec(prev, &pb);
+		bio_get_first_bvec(next, &nb);

-	return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1],
-				next->bi_io_vec[0].bv_offset);
+		return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+	}
+
+	return false;
 }

 static inline bool req_gap_back_merge(struct request *req, struct bio *bio)

--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_io(struct writeback_control *wbc, struct page *page,
 		    size_t bytes);
+void cgroup_writeback_umount(void);

 /**
 * inode_attach_wb - associate an inode with its wb
@@ -301,6 +302,10 @@ static inline void wbc_account_io(struct writeback_control *wbc,
 {
 }

+static inline void cgroup_writeback_umount(void)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */

 /*