diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index d4f5731dcbbb51e672a02026b160d0196fffbe79..94677e7dcb1311ac504c7f171fd6139deecddb5a 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -401,11 +401,16 @@ otherwise noted. started might not be in the page cache at the end of the walk). - truncate: called by the VFS to change the size of a file. The + truncate: Deprecated. This will not be called if ->setsize is defined. + Called by the VFS to change the size of a file. The i_size field of the inode is set to the desired size by the VFS before this method is called. This method is called by the truncate(2) system call and related functionality. + Note: ->truncate and vmtruncate are deprecated. Do not add new + instances/calls of these. Filesystems should be converted to do their + truncate sequence via ->setattr(). + permission: called by the VFS to check for access rights on a POSIX-like filesystem. diff --git a/fs/attr.c b/fs/attr.c index 0815e93bb487e0651619d942b6bd78d7fe6a2e89..b4fa3b0aa59691ac4ca33d30e9afc308d5df98bd 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok); * @offset: the new size to assign to the inode * @Returns: 0 on success, -ve errno on failure * + * inode_newsize_ok must be called with i_mutex held. + * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ * when necessary. Caller must not proceed with inode size change if failure is * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). - * - * inode_newsize_ok must be called with i_mutex held. 
*/ int inode_newsize_ok(const struct inode *inode, loff_t offset) { @@ -104,17 +104,25 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset) } EXPORT_SYMBOL(inode_newsize_ok); -int inode_setattr(struct inode * inode, struct iattr * attr) +/** + * generic_setattr - copy simple metadata updates into the generic inode + * @inode: the inode to be updated + * @attr: the new attributes + * + * generic_setattr must be called with i_mutex held. + * + * generic_setattr updates the inode's metadata with that specified + * in attr. Noticeably missing is inode size update, which is more complex + * as it requires pagecache updates. See simple_setsize. + * + * The inode is not marked as dirty after this operation. The rationale is + * that for "simple" filesystems, the struct inode is the inode storage. + * The caller is free to mark the inode dirty afterwards if needed. + */ +void generic_setattr(struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_SIZE && - attr->ia_size != i_size_read(inode)) { - int error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - if (ia_valid & ATTR_UID) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) @@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr) mode &= ~S_ISGID; inode->i_mode = mode; } +} +EXPORT_SYMBOL(generic_setattr); + +/* + * note this function is deprecated, the new truncate sequence should be + * used instead -- see eg. simple_setsize, generic_setattr.
+ */ +int inode_setattr(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_SIZE && + attr->ia_size != i_size_read(inode)) { + int error; + + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, attr); + mark_inode_dirty(inode); return 0; diff --git a/fs/buffer.c b/fs/buffer.c index e8aa7081d25c533d0548da7755114802b3706a9b..d54812b198e9d968007e407448a898cbc83fddb9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, } /* - * block_write_begin takes care of the basic task of block allocation and - * bringing partial write blocks uptodate first. - * - * If *pagep is not NULL, then block_write_begin uses the locked page - * at *pagep rather than allocating its own. In this case, the page will - * not be unlocked or deallocated on failure. + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int block_write_begin(struct file *file, struct address_space *mapping, +int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); } } out: return status; } +EXPORT_SYMBOL(block_write_begin_newtrunc); + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. 
+ * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Filesystems which pass down their own page also cannot + * call into vmtruncate here because it would lead to lock + * inversion problems (*pagep is locked). This is a further + * example of where the old truncate sequence is inadequate. + */ + if (unlikely(ret) && *pagep == NULL) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, @@ -2324,7 +2351,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, * For moronic filesystems that do not allow holes in file. * We may have to extend the file. 
*/ -int cont_write_begin(struct file *file, struct address_space *mapping, +int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block, loff_t *bytes) @@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } *pagep = NULL; - err = block_write_begin(file, mapping, pos, len, + err = block_write_begin_newtrunc(file, mapping, pos, len, flags, pagep, fsdata, get_block); out: return err; } +EXPORT_SYMBOL(cont_write_begin_newtrunc); + +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + int ret; + + ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block, bytes); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(cont_write_begin); int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write); * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because - * vmtruncate() writes the inode size before removing pages, once we have the + * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. @@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) } /* - * On entry, the page is fully not uptodate. 
- * On exit the page is fully uptodate in the areas outside (from,to) + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int nobh_write_begin(struct file *file, struct address_space *mapping, +int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, - fsdata, get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, + flags, pagep, fsdata, get_block); } if (PageMappedToDisk(page)) @@ -2605,8 +2652,34 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, page_cache_release(page); *pagep = NULL; - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + return ret; +} +EXPORT_SYMBOL(nobh_write_begin_newtrunc); + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. 
+ */ + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } return ret; } diff --git a/fs/direct-io.c b/fs/direct-io.c index da111aacb46eed8e0ba2d94ffcba137d4fcb009a..7600aacf531dc8ed16ccfb727878d13834a16503 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1134,27 +1134,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, return ret; } -/* - * This is a library function for use by filesystem drivers. - * - * The locking rules are governed by the flags parameter: - * - if the flags value contains DIO_LOCKING we use a fancy locking - * scheme for dumb filesystems. - * For writes this function is called under i_mutex and returns with - * i_mutex held, for reads, i_mutex is not held on entry, but it is - * taken and dropped again before returning. - * For reads and writes i_alloc_sem is taken in shared mode and released - * on I/O completion (which may happen asynchronously after returning to - * the caller). - * - * - if the flags value does NOT contain DIO_LOCKING we don't use any - * internal locking but rather rely on the filesystem to synchronize - * direct I/O reads/writes versus each other and truncate. - * For reads and writes both i_mutex and i_alloc_sem are not held on - * entry and are never taken. - */ ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1247,9 +1228,46 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, nr_segs, blkbits, get_block, end_io, submit_io, dio); +out: + return retval; +} +EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc); + +/* + * This is a library function for use by filesystem drivers. 
+ * + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. + * For writes this function is called under i_mutex and returns with + * i_mutex held, for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. + * For reads and writes i_alloc_sem is taken in shared mode and released + * on I/O completion (which may happen asynchronously after returning to + * the caller). + * + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * For reads and writes both i_mutex and i_alloc_sem are not held on + * entry and are never taken. + */ +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + ssize_t retval; + + retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, + offset, nr_segs, get_block, end_io, submit_io, flags); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again for DIO_LOCKING. + * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in + * their own manner. This is a further example of where the old + * truncate sequence is inadequate. * * NOTE: filesystems with their own locking have to handle this * on their own. 
@@ -1257,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (flags & DIO_LOCKING) { if (unlikely((rw & WRITE) && retval < 0)) { loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + if (end > isize) vmtruncate(inode, isize); } } -out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/libfs.c b/fs/libfs.c index b84d0a7a2204372fd8b55aa713fca8234b74b939..09e1016eb774556c0e69a5a4129da275410565c5 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -325,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; } +/** + * simple_setsize - handle core mm and vfs requirements for file size change + * @inode: inode + * @newsize: new file size + * + * Returns 0 on success, -error on failure. + * + * simple_setsize must be called with i_mutex held. + * + * simple_setsize will check that the requested new size is OK (see + * inode_newsize_ok), and then will perform the necessary i_size update + * and pagecache truncation (if necessary). It will typically be called + * from the filesystem's setattr function when ATTR_SIZE is passed in. + * + * The inode itself must have correct permissions and attributes to allow + * i_size to be changed, this function then just checks that the new size + * requested is valid. + * + * In the case of simple in-memory filesystems with inodes stored solely + * in the inode cache, and file data in the pagecache, nothing more needs + * to be done to satisfy a truncate request. Filesystems with on-disk + * blocks for example will need to free them in the case of truncate, in + * that case it may be easier not to use simple_setsize (but each of its + * components will likely be required at some point to update pagecache + * and inode etc).
+ */ +int simple_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, newsize); + if (error) + return error; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + truncate_pagecache(inode, oldsize, newsize); + + return error; +} +EXPORT_SYMBOL(simple_setsize); + +/** + * simple_setattr - setattr for simple in-memory filesystem + * @dentry: dentry + * @iattr: iattr structure + * + * Returns 0 on success, -error on failure. + * + * simple_setattr implements setattr for an in-memory filesystem which + * does not store its own file data or metadata (eg. uses the page cache + * and inode cache as its data store). + */ +int simple_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) { + error = simple_setsize(inode, iattr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, iattr); + + return error; +} +EXPORT_SYMBOL(simple_setattr); + int simple_readpage(struct file *file, struct page *page) { clear_highpage(page); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 05e5f599621659313774d35c209b20b46cf3cea0..1b9ba193b7893784088b54b00667ce5fbc808e44 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -203,6 +203,9 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, int block_read_full_page(struct page*, get_block_t*); int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, unsigned long from); +int block_write_begin_newtrunc(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page **, void **, get_block_t*); int block_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t*); @@ -214,6 +217,9 @@ int generic_write_end(struct file *, struct 
address_space *, struct page *, void *); void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); +int cont_write_begin_newtrunc(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page **, void **, + get_block_t *, loff_t *); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t *, loff_t *); @@ -225,6 +231,9 @@ void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); int file_fsync(struct file *, int); +int nobh_write_begin_newtrunc(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page **, void **, get_block_t*); int nobh_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t*); diff --git a/include/linux/fs.h b/include/linux/fs.h index acf6c52a50dd76ed6f4f02fb72a39b8280d26caa..3428393942a642f877dc065d86c05044305af399 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2257,6 +2257,10 @@ typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, loff_t file_offset); void dio_end_io(struct bio *bio, int error); +ssize_t __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int lock_type); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, @@ -2270,6 +2274,24 @@ enum { DIO_SKIP_HOLES = 0x02, }; +static inline ssize_t blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, + struct inode *inode, 
struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_block_t get_block, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, NULL, + DIO_LOCKING | DIO_SKIP_HOLES); +} + +static inline ssize_t blockdev_direct_IO_no_locking_newtrunc(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_block_t get_block, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, NULL, 0); +} static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, @@ -2342,12 +2364,14 @@ extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, void *, filldir_t); +extern int simple_setattr(struct dentry *, struct iattr *); extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +extern int simple_setsize(struct inode *, loff_t); extern int noop_fsync(struct file *, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); @@ -2384,7 +2408,8 @@ extern int buffer_migrate_page(struct address_space *, extern int inode_change_ok(const struct inode *, struct iattr *); extern int inode_newsize_ok(const struct 
inode *, loff_t offset); -extern int __must_check inode_setattr(struct inode *, struct iattr *); +extern int __must_check inode_setattr(struct inode *, const struct iattr *); +extern void generic_setattr(struct inode *inode, const struct iattr *attr); extern void file_update_time(struct file *file); diff --git a/mm/truncate.c b/mm/truncate.c index f42675a3615d81ab59e722cc3e3d18b1e1e0525f..937571b8b23370afdcb607ae27a3a897464e46bb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -548,18 +548,18 @@ EXPORT_SYMBOL(truncate_pagecache); * NOTE! We have to be ready to update the memory sharing * between the file and the memory map for a potential last * incomplete page. Ugly, but necessary. + * + * This function is deprecated and simple_setsize or truncate_pagecache + * should be used instead. */ int vmtruncate(struct inode *inode, loff_t offset) { - loff_t oldsize; int error; - error = inode_newsize_ok(inode, offset); + error = simple_setsize(inode, offset); if (error) return error; - oldsize = inode->i_size; - i_size_write(inode, offset); - truncate_pagecache(inode, oldsize, offset); + if (inode->i_op->truncate) inode->i_op->truncate(inode);