Commit e10de372 authored by Dave Chinner, committed by Dave Chinner

xfs: don't chain ioends during writepage submission

Currently we can build a long ioend chain during ->writepages that
gets attached to the writepage context. IO submission only then
occurs when we finish all the writepage processing. This means we
can have many ioends allocated and pending, and this violates the
mempool guarantees that we need to give about forwards progress.
i.e. we really should only have one ioend being built at a time,
otherwise we may drain the mempool trying to allocate a new ioend
and that blocks submission, completion and freeing of ioends that
are already in progress.
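
To make that hazard concrete, here is a minimal illustrative sketch, not code from this patch: the pool name, MAX_CHAIN and the helper are hypothetical. It shows why holding a long chain of mempool-backed allocations defeats the pool's forward-progress guarantee.

/*
 * Illustrative only -- not part of this patch. mempool_alloc() may sleep
 * waiting for an element to be freed back to the pool, but every element
 * we have allocated so far is parked on our own unsubmitted chain, so
 * nothing will ever be freed: the allocation can block forever.
 */
#include <linux/gfp.h>
#include <linux/mempool.h>

#define MAX_CHAIN	16

static mempool_t *writepage_ioend_pool;	/* assume a small reserve, e.g. 4 */

static void chained_submission_antipattern(void)
{
	void *chain[MAX_CHAIN];
	int i;

	for (i = 0; i < MAX_CHAIN; i++) {
		/* Can block indefinitely once the pool reserve is exhausted. */
		chain[i] = mempool_alloc(writepage_ioend_pool, GFP_NOFS);
	}

	/* The submit-and-free pass never runs if we blocked above. */
	for (i = 0; i < MAX_CHAIN; i++)
		mempool_free(chain[i], writepage_ioend_pool);
}

Submitting each ioend as soon as it is complete keeps at most one allocation outstanding per writeback thread, so the pool reserve is always sufficient.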

To prevent this situation from happening, we need to submit ioends
for IO as soon as they are ready for dispatch rather than queuing
them for later submission. This means the ioends have bios built
immediately and they get queued on any plug that is currently active.
Hence if we schedule away from writeback, the ioends that have been
built will make forwards progress due to the plug flushing on
context switch. This will also prevent context switches from
creating unnecessary IO submission latency.
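
The plugging behaviour relied on here comes from the generic block layer rather than anything added by this patch. A minimal sketch of the pattern follows; submit_one_ioend() is a hypothetical stand-in for the real per-ioend submission path.

#include <linux/blkdev.h>

/* Hypothetical stand-in for the real per-ioend submission routine. */
void submit_one_ioend(void *ioend);

static void writeback_pass(void **ioends, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_one_ioend(ioends[i]);	/* bios queue on the per-task plug */
	blk_finish_plug(&plug);		/* flushed here, or earlier if the task schedules */
}

Because the plug is flushed automatically on a context switch, submitting ioends immediately does not lose the batching that the old deferred-submission scheme provided.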

We can't completely avoid having nested IO allocation - when we have
a block size smaller than a page size, we still need to hold the
ioend submission until after we have marked the current page dirty.
Hence we may need multiple ioends to be held while the current page
is completely mapped and made ready for IO dispatch. We cannot avoid
this problem - the current code already has this ioend chaining
within a page so we can mostly ignore that it occurs.
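
In the new code this per-page chaining takes the shape of a small local list on the stack of xfs_writepage_map(): ioends that can no longer be extended are parked on it while the page's buffers are still being mapped, and only pushed to xfs_submit_ioend() once the page has been marked for writeback. A condensed sketch of the pattern, with simplified types and error handling omitted:

#include <linux/list.h>

struct ioend {				/* stand-in for xfs_ioend with its list-based io_list */
	struct list_head	io_list;
	/* ... */
};

static void writepage_map_sketch(void)
{
	LIST_HEAD(submit_list);		/* ioends finished while mapping this page */
	struct ioend *ioend, *next;

	/*
	 * Per-buffer loop: when the current ioend cannot be extended, it is
	 * parked on submit_list instead of being submitted straight away,
	 * because the page is not yet marked for writeback.
	 */

	/* Page is now under writeback: drain the held ioends. */
	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
		list_del_init(&ioend->io_list);
		/* submit the ioend here */
	}
}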
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Parent bfce7d2e
@@ -43,7 +43,6 @@ struct xfs_writepage_ctx {
 	struct xfs_bmbt_irec	imap;
 	bool			imap_valid;
 	unsigned int		io_type;
-	struct xfs_ioend	*iohead;
 	struct xfs_ioend	*ioend;
 	sector_t		last_block;
 };
@@ -277,7 +276,7 @@ xfs_alloc_ioend(
 	 */
 	atomic_set(&ioend->io_remaining, 1);
 	ioend->io_error = 0;
-	ioend->io_list = NULL;
+	INIT_LIST_HEAD(&ioend->io_list);
 	ioend->io_type = type;
 	ioend->io_inode = inode;
 	ioend->io_buffer_head = NULL;
@@ -420,8 +419,7 @@ xfs_start_buffer_writeback(
 STATIC void
 xfs_start_page_writeback(
 	struct page		*page,
-	int			clear_dirty,
-	int			buffers)
+	int			clear_dirty)
 {
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
@@ -440,10 +438,6 @@ xfs_start_page_writeback(
 		set_page_writeback_keepwrite(page);

 	unlock_page(page);
-
-	/* If no buffers on the page are to be written, finish it here */
-	if (!buffers)
-		end_page_writeback(page);
 }

 static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -452,110 +446,90 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 }

 /*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
+ * Submit all of the bios for an ioend. We are only passed a single ioend at a
+ * time; the caller is responsible for chaining prior to submission.
  *
  * If @fail is non-zero, it means that we have a situation where some part of
  * the submission process has failed after we have marked paged for writeback
  * and unlocked them. In this situation, we need to fail the ioend chain rather
  * than submit it to IO. This typically only happens on a filesystem shutdown.
  */
-STATIC void
+STATIC int
 xfs_submit_ioend(
 	struct writeback_control *wbc,
 	xfs_ioend_t		*ioend,
-	int			fail)
+	int			status)
 {
-	xfs_ioend_t		*head = ioend;
-	xfs_ioend_t		*next;
 	struct buffer_head	*bh;
 	struct bio		*bio;
 	sector_t		lastblock = 0;

-	/* Pass 1 - start writeback */
-	do {
-		next = ioend->io_list;
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
-			xfs_start_buffer_writeback(bh);
-	} while ((ioend = next) != NULL);
+	/* Reserve log space if we might write beyond the on-disk inode size. */
+	if (!status &&
+	    ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+		status = xfs_setfilesize_trans_alloc(ioend);
+
+	/*
+	 * If we are failing the IO now, just mark the ioend with an
+	 * error and finish it. This will run IO completion immediately
+	 * as there is only one reference to the ioend at this point in
+	 * time.
+	 */
+	if (status) {
+		ioend->io_error = status;
+		xfs_finish_ioend(ioend);
+		return status;
+	}

-	/* Pass 2 - submit I/O */
-	ioend = head;
-	do {
-		next = ioend->io_list;
-		bio = NULL;
-
-		/*
-		 * If we are failing the IO now, just mark the ioend with an
-		 * error and finish it. This will run IO completion immediately
-		 * as there is only one reference to the ioend at this point in
-		 * time.
-		 */
-		if (fail) {
-			ioend->io_error = fail;
-			xfs_finish_ioend(ioend);
-			continue;
-		}
-
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
-			if (!bio) {
-retry:
-				bio = xfs_alloc_ioend_bio(bh);
-			} else if (bh->b_blocknr != lastblock + 1) {
-				xfs_submit_ioend_bio(wbc, ioend, bio);
-				goto retry;
-			}
-
-			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
-				xfs_submit_ioend_bio(wbc, ioend, bio);
-				goto retry;
-			}
-
-			lastblock = bh->b_blocknr;
-		}
-		if (bio)
-			xfs_submit_ioend_bio(wbc, ioend, bio);
-		xfs_finish_ioend(ioend);
-	} while ((ioend = next) != NULL);
+	bio = NULL;
+	for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+		if (!bio) {
+retry:
+			bio = xfs_alloc_ioend_bio(bh);
+		} else if (bh->b_blocknr != lastblock + 1) {
+			xfs_submit_ioend_bio(wbc, ioend, bio);
+			goto retry;
+		}
+
+		if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
+			xfs_submit_ioend_bio(wbc, ioend, bio);
+			goto retry;
+		}
+
+		lastblock = bh->b_blocknr;
+	}
+	if (bio)
+		xfs_submit_ioend_bio(wbc, ioend, bio);
+	xfs_finish_ioend(ioend);
+	return 0;
 }

 /*
  * Test to see if we've been building up a completion structure for
  * earlier buffers -- if so, we try to append to this ioend if we
  * can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
+ * Return the ioend we finished off so that the caller can submit it
+ * once it has finished processing the dirty page.
  */
 STATIC void
 xfs_add_to_ioend(
 	struct inode		*inode,
 	struct buffer_head	*bh,
 	xfs_off_t		offset,
-	struct xfs_writepage_ctx *wpc)
+	struct xfs_writepage_ctx *wpc,
+	struct list_head	*iolist)
 {
 	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 	    bh->b_blocknr != wpc->last_block + 1) {
 		struct xfs_ioend	*new;

+		if (wpc->ioend)
+			list_add(&wpc->ioend->io_list, iolist);
+
 		new = xfs_alloc_ioend(inode, wpc->io_type);
 		new->io_offset = offset;
 		new->io_buffer_head = bh;
 		new->io_buffer_tail = bh;
-		if (wpc->ioend)
-			wpc->ioend->io_list = new;
 		wpc->ioend = new;
 	} else {
 		wpc->ioend->io_buffer_tail->b_private = bh;
@@ -565,6 +539,7 @@ xfs_add_to_ioend(
 	bh->b_private = NULL;
 	wpc->ioend->io_size += bh->b_size;
 	wpc->last_block = bh->b_blocknr;
+	xfs_start_buffer_writeback(bh);
 }

 STATIC void
@@ -726,44 +701,41 @@ xfs_aops_discard_page(
 	return;
 }

-static int
-xfs_writepage_submit(
-	struct xfs_writepage_ctx *wpc,
-	struct writeback_control *wbc,
-	int			status)
-{
-	struct blk_plug		plug;
-
-	/* Reserve log space if we might write beyond the on-disk inode size. */
-	if (!status && wpc->ioend && wpc->ioend->io_type != XFS_IO_UNWRITTEN &&
-	    xfs_ioend_is_append(wpc->ioend))
-		status = xfs_setfilesize_trans_alloc(wpc->ioend);
-
-	if (wpc->iohead) {
-		blk_start_plug(&plug);
-		xfs_submit_ioend(wbc, wpc->iohead, status);
-		blk_finish_plug(&plug);
-	}
-
-	return status;
-}
-
+/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding buffers to is cached on the writepage context, and if the new buffer
+ * does not append to the cached ioend it will create a new ioend and cache that
+ * instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has been
+ * detected. While ioends are submitted immediately after they are completed,
+ * batching optimisations are provided by higher level block plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
 static int
 xfs_writepage_map(
 	struct xfs_writepage_ctx *wpc,
+	struct writeback_control *wbc,
 	struct inode		*inode,
 	struct page		*page,
 	loff_t			offset,
 	__uint64_t		end_offset)
 {
+	LIST_HEAD(submit_list);
+	struct xfs_ioend	*ioend, *next;
 	struct buffer_head	*bh, *head;
 	ssize_t			len = 1 << inode->i_blkbits;
 	int			error = 0;
-	int			uptodate = 1;
 	int			count = 0;
+	int			uptodate = 1;

 	bh = head = page_buffers(page);
 	offset = page_offset(page);
 	do {
 		if (offset >= end_offset)
 			break;
@@ -816,7 +788,7 @@ xfs_writepage_map(
 			error = xfs_map_blocks(inode, offset, &wpc->imap,
 					     wpc->io_type);
 			if (error)
-				goto out_error;
+				goto out;
 			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 							 offset);
 		}
@@ -824,46 +796,65 @@ xfs_writepage_map(
 			lock_buffer(bh);
 			if (wpc->io_type != XFS_IO_OVERWRITE)
 				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, wpc);
+			xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
 			count++;
 		}

-		if (!wpc->iohead)
-			wpc->iohead = wpc->ioend;
-
 	} while (offset += len, ((bh = bh->b_this_page) != head));

 	if (uptodate && bh == head)
 		SetPageUptodate(page);

-	xfs_start_page_writeback(page, 1, count);
-	ASSERT(wpc->iohead || !count);
-	return 0;
+	ASSERT(wpc->ioend || list_empty(&submit_list));

-out_error:
+out:
 	/*
-	 * On error, we have to fail the iohead here because we locked buffers
-	 * in the ioend chain. If we don't do this, we'll deadlock invalidating
-	 * the page as that tries to lock the buffers on the page. Also, because
-	 * we may have set pages under writeback, we have to make sure we run
-	 * IO completion to mark the error state of the IO appropriately, so we
-	 * can't cancel the ioend directly here. That means we have to mark this
-	 * page as under writeback if we included any buffers from it in the
-	 * ioend chain so that completion treats it correctly.
+	 * On error, we have to fail the ioend here because we have locked
+	 * buffers in the ioend. If we don't do this, we'll deadlock
+	 * invalidating the page as that tries to lock the buffers on the page.
+	 * Also, because we may have set pages under writeback, we have to make
+	 * sure we run IO completion to mark the error state of the IO
+	 * appropriately, so we can't cancel the ioend directly here. That means
+	 * we have to mark this page as under writeback if we included any
+	 * buffers from it in the ioend chain so that completion treats it
+	 * correctly.
 	 *
-	 * If we didn't include the page in the ioend, then we can simply
-	 * discard and unlock it as there are no other users of the page or it's
-	 * buffers right now. The caller will still need to trigger submission
-	 * of outstanding ioends on the writepage context so they are treated
-	 * correctly on error.
+	 * If we didn't include the page in the ioend, the on error we can
+	 * simply discard and unlock it as there are no other users of the page
+	 * or it's buffers right now. The caller will still need to trigger
+	 * submission of outstanding ioends on the writepage context so they are
+	 * treated correctly on error.
 	 */
-	if (count)
-		xfs_start_page_writeback(page, 0, count);
-	else {
+	if (count) {
+		xfs_start_page_writeback(page, !error);
+
+		/*
+		 * Preserve the original error if there was one, otherwise catch
+		 * submission errors here and propagate into subsequent ioend
+		 * submissions.
+		 */
+		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+			int error2;
+
+			list_del_init(&ioend->io_list);
+			error2 = xfs_submit_ioend(wbc, ioend, error);
+			if (error2 && !error)
+				error = error2;
+		}
+	} else if (error) {
 		xfs_aops_discard_page(page);
 		ClearPageUptodate(page);
 		unlock_page(page);
+	} else {
+		/*
+		 * We can end up here with no error and nothing to write if we
+		 * race with a partial page truncate on a sub-page block sized
+		 * filesystem. In that case we need to mark the page clean.
+		 */
+		xfs_start_page_writeback(page, 1);
+		end_page_writeback(page);
 	}
+
 	mapping_set_error(page->mapping, error);
 	return error;
 }
@@ -979,7 +970,7 @@ xfs_do_writepage(
 		end_offset = offset;
 	}

-	return xfs_writepage_map(wpc, inode, page, offset, end_offset);
+	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);

 redirty:
 	redirty_page_for_writepage(wbc, page);
@@ -998,7 +989,9 @@ xfs_vm_writepage(
 	int			ret;

 	ret = xfs_do_writepage(page, wbc, &wpc);
-	return xfs_writepage_submit(&wpc, wbc, ret);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
 }

 STATIC int
@@ -1013,7 +1006,9 @@ xfs_vm_writepages(
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);

 	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
-	return xfs_writepage_submit(&wpc, wbc, ret);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
 }

 /*
......
@@ -41,7 +41,7 @@ enum {
  * It can manage several multi-page bio's at once.
  */
 typedef struct xfs_ioend {
-	struct xfs_ioend	*io_list;	/* next ioend in chain */
+	struct list_head	io_list;	/* next ioend in chain */
 	unsigned int		io_type;	/* delalloc / unwritten */
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
......