提交 4f06dd92 编写于 作者: V Vivek Goyal 提交者: Miklos Szeredi

fuse: fix write deadlock

There are two modes for write(2) and friends in fuse:

a) write through (update page cache, send sync WRITE request to userspace)

b) buffered write (update page cache, async writeout later)

The write through method kept all the page cache pages locked that were
used for the request.  Keeping more than one page locked is deadlock prone
and Qian Cai demonstrated this with trinity fuzzing.

The reason for keeping the pages locked is that concurrent mapped reads
shouldn't try to pull possibly stale data into the page cache.

For full page writes, the easy way to fix this is to make the cached page
be the authoritative source by marking the page PG_uptodate immediately.
After this the page can be safely unlocked, since mapped/cached reads will
take the written data from the cache.

Concurrent mapped writes will now cause data in the original WRITE request
to be updated; this however doesn't cause any data inconsistency and this
scenario should be exceedingly rare anyway.

If the WRITE request returns with an error in the above case, currently the
page is not marked uptodate; this means that a concurrent read will always
read consistent data.  After this patch the page is uptodate between
writing to the cache and receiving the error: there's window where a cached
read will read the wrong data.  While theoretically this could be a
regression, it is unlikely to be one in practice, since this is normal for
buffered writes.

In case of a partial page write to an already uptodate page the locking is
also unnecessary, with the above caveats.

Partial write of a not uptodate page still needs to be handled.  One way
would be to read the complete page before doing the write.  This is not
possible, since it might break filesystems that don't expect any READ
requests when the file was opened O_WRONLY.

The other solution is to serialize the synchronous write with reads from
the partial pages.  The easiest way to do this is to keep the partial pages
locked.  The problem is that a write() may involve two such pages (one head
and one tail).  This patch fixes it by only locking the partial tail page.
If there's a partial head page as well, then split that off as a separate
WRITE request.
Reported-by: NQian Cai <cai@lca.pw>
Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/
Fixes: ea9b9907 ("fuse: implement perform_write")
Cc: <stable@vger.kernel.org> # v2.6.26
Signed-off-by: NVivek Goyal <vgoyal@redhat.com>
Signed-off-by: NMiklos Szeredi <mszeredi@redhat.com>
上级 e49d033b
...@@ -1099,6 +1099,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, ...@@ -1099,6 +1099,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
struct fuse_file *ff = file->private_data; struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm; struct fuse_mount *fm = ff->fm;
unsigned int offset, i; unsigned int offset, i;
bool short_write;
int err; int err;
for (i = 0; i < ap->num_pages; i++) for (i = 0; i < ap->num_pages; i++)
...@@ -1113,32 +1114,38 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, ...@@ -1113,32 +1114,38 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
if (!err && ia->write.out.size > count) if (!err && ia->write.out.size > count)
err = -EIO; err = -EIO;
short_write = ia->write.out.size < count;
offset = ap->descs[0].offset; offset = ap->descs[0].offset;
count = ia->write.out.size; count = ia->write.out.size;
for (i = 0; i < ap->num_pages; i++) { for (i = 0; i < ap->num_pages; i++) {
struct page *page = ap->pages[i]; struct page *page = ap->pages[i];
if (!err && !offset && count >= PAGE_SIZE) if (err) {
SetPageUptodate(page); ClearPageUptodate(page);
} else {
if (count > PAGE_SIZE - offset) if (count >= PAGE_SIZE - offset)
count -= PAGE_SIZE - offset; count -= PAGE_SIZE - offset;
else else {
count = 0; if (short_write)
offset = 0; ClearPageUptodate(page);
count = 0;
unlock_page(page); }
offset = 0;
}
if (ia->write.page_locked && (i == ap->num_pages - 1))
unlock_page(page);
put_page(page); put_page(page);
} }
return err; return err;
} }
static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
struct address_space *mapping, struct address_space *mapping,
struct iov_iter *ii, loff_t pos, struct iov_iter *ii, loff_t pos,
unsigned int max_pages) unsigned int max_pages)
{ {
struct fuse_args_pages *ap = &ia->ap;
struct fuse_conn *fc = get_fuse_conn(mapping->host); struct fuse_conn *fc = get_fuse_conn(mapping->host);
unsigned offset = pos & (PAGE_SIZE - 1); unsigned offset = pos & (PAGE_SIZE - 1);
size_t count = 0; size_t count = 0;
...@@ -1191,6 +1198,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, ...@@ -1191,6 +1198,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
if (offset == PAGE_SIZE) if (offset == PAGE_SIZE)
offset = 0; offset = 0;
/* If we copied full page, mark it uptodate */
if (tmp == PAGE_SIZE)
SetPageUptodate(page);
if (PageUptodate(page)) {
unlock_page(page);
} else {
ia->write.page_locked = true;
break;
}
if (!fc->big_writes) if (!fc->big_writes)
break; break;
} while (iov_iter_count(ii) && count < fc->max_write && } while (iov_iter_count(ii) && count < fc->max_write &&
...@@ -1234,7 +1251,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, ...@@ -1234,7 +1251,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
break; break;
} }
count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
if (count <= 0) { if (count <= 0) {
err = count; err = count;
} else { } else {
......
...@@ -912,6 +912,7 @@ struct fuse_io_args { ...@@ -912,6 +912,7 @@ struct fuse_io_args {
struct { struct {
struct fuse_write_in in; struct fuse_write_in in;
struct fuse_write_out out; struct fuse_write_out out;
bool page_locked;
} write; } write;
}; };
struct fuse_args_pages ap; struct fuse_args_pages ap;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册