From df1d15a65b5af195bc34c8aab5f6e8085dc999b9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 30 Nov 2018 08:47:03 -0700
Subject: [PATCH] block: implement bio helper to add iter bvec pages to bio

commit 6d0c48aede85e38316d0251564cab39cbc2422f6 upstream.

For an ITER_BVEC, we can just iterate the iov and add the pages
to the bio directly. For now, we grab a reference to those pages,
and release them normally on IO completion. This isn't really needed
for the normal case of O_DIRECT from/to a file, but some of the more
esoteric use cases (like splice(2)) will unconditionally put the
pipe buffer pages when the buffers are released. Until we can manage
that case properly, ITER_BVEC pages are treated like normal pages
in terms of reference counting.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Acked-by: Caspar Zhang <caspar@linux.alibaba.com>
---
 block/bio.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 54 insertions(+), 8 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 24269e5d5fa7..8513a12f2c65 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -828,6 +828,40 @@ int bio_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_add_page);
 
+static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
+{
+	const struct bio_vec *bv = iter->bvec;
+	unsigned int len;
+	size_t size;
+
+	if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len))
+		return -EINVAL;
+
+	len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count);
+	size = bio_add_page(bio, bv->bv_page, len,
+				bv->bv_offset + iter->iov_offset);
+	if (size == len) {
+		struct page *page;
+		int i;
+
+		/*
+		 * For the normal O_DIRECT case, we could skip grabbing this
+		 * reference and then not have to put them again when IO
+		 * completes. But this breaks some in-kernel users, like
+		 * splicing to/from a loop device, where we release the pipe
+		 * pages unconditionally. If we can fix that case, we can
+		 * get rid of the get here and the need to call
+		 * bio_release_pages() at IO completion time.
+		 */
+		mp_bvec_for_each_page(page, bv, i)
+			get_page(page);
+		iov_iter_advance(iter, size);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 /**
  * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
  * @bio: bio to add pages to
@@ -877,23 +911,35 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 }
 
 /**
- * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+ * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
- * @iter: iov iterator describing the region to be mapped
+ * @iter: iov iterator describing the region to be added
+ *
+ * This takes either an iterator pointing to user memory, or one pointing to
+ * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
+ * map them into the kernel. On IO completion, the caller should put those
+ * pages. For now, when adding kernel pages, we still grab a reference to the
+ * page. This isn't strictly needed for the common case, but some call paths
+ * end up releasing pages from eg a pipe and we can't easily control these.
+ * See comment in __bio_iov_bvec_add_pages().
  *
- * Pins pages from *iter and appends them to @bio's bvec array. The
- * pages will have to be released using put_page() when done.
  * The function tries, but does not guarantee, to pin as many pages as
- * fit into the bio, or are requested in *iter, whatever is smaller.
- * If MM encounters an error pinning the requested pages, it stops.
- * Error is returned only if 0 pages could be pinned.
+ * fit into the bio, or are requested in *iter, whatever is smaller. If
+ * MM encounters an error pinning the requested pages, it stops. Error
+ * is returned only if 0 pages could be pinned.
  */
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
+	const bool is_bvec = iov_iter_is_bvec(iter);
 	unsigned short orig_vcnt = bio->bi_vcnt;
 
 	do {
-		int ret = __bio_iov_iter_get_pages(bio, iter);
+		int ret;
+
+		if (is_bvec)
+			ret = __bio_iov_bvec_add_pages(bio, iter);
+		else
+			ret = __bio_iov_iter_get_pages(bio, iter);
 
 		if (unlikely(ret))
 			return bio->bi_vcnt > orig_vcnt ? 0 : ret;
-- 
GitLab