提交 149b3060 编写于 作者: L Linus Torvalds

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Mostly performance and bug fixes, plus some cleanups.  The one new
  feature this merge window is a new ioctl EXT4_IOC_SWAP_BOOT which
  allows installation of a hidden inode designed for boot loaders."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (50 commits)
  ext4: fix type-widening bug in inode table readahead code
  ext4: add check for inodes_count overflow in new resize ioctl
  ext4: fix Kconfig documentation for CONFIG_EXT4_DEBUG
  ext4: fix online resizing for ext3-compat file systems
  jbd2: trace when lock_buffer in do_get_write_access takes a long time
  ext4: mark metadata blocks using bh flags
  buffer: add BH_Prio and BH_Meta flags
  ext4: mark all metadata I/O with REQ_META
  ext4: fix readdir error in case inline_data+^dir_index.
  ext4: fix readdir error in the case of inline_data+dir_index
  jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset
  ext4: mext_insert_extents should update extent block checksum
  ext4: move quota initialization out of inode allocation transaction
  ext4: reserve xattr index for Rich ACL support
  jbd2: reduce journal_head size
  ext4: clear buffer_uninit flag when submitting IO
  ext4: use io_end for multiple bios
  ext4: make ext4_bio_write_page() use BH_Async_Write flags
  ext4: Use kstrtoul() instead of parse_strtoul()
  ext4: defragmentation code cleanup
  ...
......@@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname>
session_write_kbytes This file is read-only and shows the number of
kilobytes of data that have been written to this
filesystem since it was mounted.
reserved_clusters This is RW file and contains number of reserved
clusters in the file system which will be used
in the specific situations to avoid costly
zeroout, unexpected ENOSPC, or possible data
loss. The default is 2% or 4096 clusters,
whichever is smaller and this can be changed
however it can never exceed number of clusters
in the file system. If there is not enough space
for the reserved space when mounting the file
mount will _not_ fail.
..............................................................................
Ioctls
......@@ -587,6 +598,16 @@ Table of Ext4 specific ioctls
bitmaps and inode table, the userspace tool thus
just passes the new number of blocks.
EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes
(like i_blocks, i_size, i_flags, ...) from
the specified inode with inode
EXT4_BOOT_LOADER_INO (#5). This is typically
used to store a boot loader in a secure part of
the filesystem, where it can't be changed by a
normal user by accident.
The data blocks of the previous boot loader
will be associated with the given inode.
..............................................................................
References
......
......@@ -2987,6 +2987,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
/* Take care of bh's that straddle the end of the device */
guard_bh_eod(rw, bio, bh);
if (buffer_meta(bh))
rw |= REQ_META;
if (buffer_prio(bh))
rw |= REQ_PRIO;
bio_get(bio);
submit_bio(rw, bio);
......
......@@ -71,4 +71,5 @@ config EXT4_DEBUG
Enables run-time debugging support for the ext4 filesystem.
If you select Y here, then you will be able to turn on debugging
with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
with a command such as:
echo 1 > /sys/module/ext4/parameters/mballoc_debug
......@@ -29,6 +29,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
* balloc.c contains the blocks allocation and deallocation routines
*/
/*
* Calculate block group number for a given block number
*/
ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block)
{
ext4_group_t group;
if (test_opt2(sb, STD_GROUP_SIZE))
group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
block) >>
(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
else
ext4_get_group_no_and_offset(sb, block, &group, NULL);
return group;
}
/*
* Calculate the block group number and offset into the block/cluster
* allocation bitmap, given a block number
......@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
}
static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
ext4_group_t block_group)
/*
* Check whether the 'block' lives within the 'block_group'. Returns 1 if so
* and 0 otherwise.
*/
static inline int ext4_block_in_group(struct super_block *sb,
ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
if (actual_group == block_group)
return 1;
return 0;
actual_group = ext4_get_group_number(sb, block);
return (actual_group == block_group) ? 1 : 0;
}
/* Return the number of clusters used for file system metadata; this
......@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
trace_ext4_read_block_bitmap_load(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
submit_bh(READ, bh);
submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
verify:
ext4_validate_block_bitmap(sb, desc, block_group, bh);
......@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags)
{
s64 free_clusters, dirty_clusters, root_clusters;
s64 free_clusters, dirty_clusters, rsv, resv_clusters;
struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
free_clusters = percpu_counter_read_positive(fcc);
dirty_clusters = percpu_counter_read_positive(dcc);
resv_clusters = atomic64_read(&sbi->s_resv_clusters);
/*
* r_blocks_count should always be multiple of the cluster ratio so
* we are safe to do a plane bit shift only.
*/
root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
resv_clusters;
if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
if (free_clusters - (nclusters + rsv + dirty_clusters) <
EXT4_FREECLUSTERS_WATERMARK) {
free_clusters = percpu_counter_sum_positive(fcc);
dirty_clusters = percpu_counter_sum_positive(dcc);
......@@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Check whether we have space after accounting for current
* dirty clusters & root reserved clusters.
*/
if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
if (free_clusters >= (rsv + nclusters + dirty_clusters))
return 1;
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE) ||
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
return 1;
}
/* No free blocks. Let's see if we can dip into reserved pool */
if (flags & EXT4_MB_USE_RESERVED) {
if (free_clusters >= (nclusters + dirty_clusters))
return 1;
}
......
......@@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode)
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1)))
((inode->i_size >> sb->s_blocksize_bits) == 1) ||
ext4_has_inline_data(inode)))
return 1;
return 0;
......@@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp,
int ret = 0;
int dir_has_error = 0;
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
ret = ext4_read_inline_dir(filp, dirent, filldir,
&has_inline_data);
if (has_inline_data)
return ret;
}
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
......@@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp,
ext4_clear_inode_flag(file_inode(filp),
EXT4_INODE_INDEX);
}
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
ret = ext4_read_inline_dir(filp, dirent, filldir,
&has_inline_data);
if (has_inline_data)
return ret;
}
stored = 0;
offset = filp->f_pos & (sb->s_blocksize - 1);
......
......@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
#define EXT4_MB_STREAM_ALLOC 0x0800
/* Use reserved root blocks if needed */
#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
/* Use blocks from reserved pool */
#define EXT4_MB_USE_RESERVED 0x2000
struct ext4_allocation_request {
/* target inode for block we're allocating */
......@@ -196,19 +198,8 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_DIRECT 0x0004
struct ext4_io_page {
struct page *p_page;
atomic_t p_count;
};
#define MAX_IO_PAGES 128
/*
* For converting uninitialized extents on a work queue.
*
* 'page' is only used from the writepage() path; 'pages' is only used for
* buffered writes; they are used to keep page references until conversion
* takes place. For AIO/DIO, neither field is filled in.
*/
typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
......@@ -218,15 +209,13 @@ typedef struct ext4_io_end {
ssize_t size; /* size of the extent */
struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */
int num_io_pages; /* for writepages() */
struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */
atomic_t count; /* reference counter */
} ext4_io_end_t;
struct ext4_io_submit {
int io_op;
struct bio *io_bio;
ext4_io_end_t *io_end;
struct ext4_io_page *io_page;
sector_t io_next_block;
};
......@@ -403,7 +392,7 @@ struct flex_groups {
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
......@@ -557,9 +546,8 @@ enum {
#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
EXT4_GET_BLOCKS_CREATE)
/* Caller is from the delayed allocation writeout path,
so set the magic i_delalloc_reserve_flag after taking the
inode allocation semaphore for */
/* Caller is from the delayed allocation writeout path
* finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
/* caller is from the direct IO path, request to creation of an
unitialized extents if not allocated, split the uninitialized
......@@ -571,8 +559,9 @@ enum {
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
/* Punch out blocks of an extent */
#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/* Request will not result in inode size update (user for fallocate) */
......@@ -616,6 +605,7 @@ enum {
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
......@@ -949,7 +939,7 @@ struct ext4_inode_info {
#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
/*
* Mount flags
* Mount flags set via mount options or defaults
*/
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
......@@ -981,8 +971,16 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
/*
* Mount flags set either automatically (could not be set by mount option)
* based on per file system feature or property or in special cases such as
* distinguishing between explicit mount option definition and default.
*/
#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly
specified delalloc */
#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
size of blocksize * 8
blocks */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
......@@ -1179,6 +1177,7 @@ struct ext4_sb_info {
unsigned int s_mount_flags;
unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
atomic64_t s_resv_clusters;
kuid_t s_resuid;
kgid_t s_resgid;
unsigned short s_mount_state;
......@@ -1333,6 +1332,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
return ino == EXT4_ROOT_INO ||
ino == EXT4_USR_QUOTA_INO ||
ino == EXT4_GRP_QUOTA_INO ||
ino == EXT4_BOOT_LOADER_INO ||
ino == EXT4_JOURNAL_INO ||
ino == EXT4_RESIZE_INO ||
(ino >= EXT4_FIRST_INO(sb) &&
......@@ -1374,6 +1374,7 @@ enum {
EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
......@@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
*/
#define ERR_BAD_DX_DIR -75000
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
/*
* Timeout and state flag for lazy initialization inode thread.
*/
......@@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct buffer_head *bh);
/* balloc.c */
extern void ext4_get_group_no_and_offset(struct super_block *sb,
ext4_fsblk_t blocknr,
ext4_group_t *blockgrpp,
ext4_grpblk_t *offsetp);
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block);
extern void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
unsigned int block_group,
......@@ -2108,8 +2113,9 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
unsigned long nr_segs);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
extern void ext4_ind_truncate(struct inode *inode);
extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t first, ext4_lblk_t stop);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
......@@ -2117,6 +2123,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
extern int ext4_dirent_csum_verify(struct inode *inode,
......@@ -2511,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
extern int ext4_read_inline_dir(struct file *filp,
void *dirent, filldir_t filldir,
int *has_inline_data);
extern int htree_inlinedir_to_tree(struct file *dir_file,
struct inode *dir, ext4_lblk_t block,
struct dx_hash_info *hinfo,
__u32 start_hash, __u32 start_minor_hash,
int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
......@@ -2547,6 +2559,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
extern int ext4_handle_dirty_dirent_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
#define S_SHIFT 12
static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
[S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
[S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
[S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
[S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
[S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
};
static inline void ext4_set_de_type(struct super_block *sb,
struct ext4_dir_entry_2 *de,
umode_t mode) {
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
/* symlink.c */
extern const struct inode_operations ext4_symlink_inode_operations;
......@@ -2573,9 +2603,9 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
int chunk);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern void ext4_ext_truncate(struct inode *);
extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
loff_t length);
extern void ext4_ext_truncate(handle_t *, struct inode *);
extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
......@@ -2609,17 +2639,26 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
struct inode *second);
extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
struct inode *donor_inode);
void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
__u64 len, __u64 *moved_len);
/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_shutdown(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
extern int ext4_put_io_end(ext4_io_end_t *io_end);
extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc);
extern void ext4_end_io_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
......
......@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
#define ext4_ext_dirty(handle, inode, path) \
__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
struct inode *inode, struct ext4_ext_path *path);
#endif /* _EXT4_EXTENTS */
......@@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
{
journal_t *journal;
might_sleep();
trace_ext4_journal_start(sb, nblocks, _RET_IP_);
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);
......@@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
{
int err = 0;
might_sleep();
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err)
......@@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
{
int err = 0;
might_sleep();
set_buffer_meta(bh);
set_buffer_prio(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
if (err) {
......
......@@ -29,11 +29,13 @@
* block to complete the transaction.
*
* For extents-enabled fs we may have to allocate and modify up to
* 5 levels of tree + root which are stored in the inode. */
* 5 levels of tree, data block (for each of these we need bitmap + group
* summaries), root which is stored in the inode, sb
*/
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
? 27U : 8U)
? 20U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
......@@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
* ext4_journal_callback_del: delete a registered callback
* @handle: active journal transaction handle on which callback was registered
* @jce: registered journal callback entry to unregister
* Return true if object was sucessfully removed
*/
static inline void ext4_journal_callback_del(handle_t *handle,
static inline bool ext4_journal_callback_try_del(handle_t *handle,
struct ext4_journal_cb_entry *jce)
{
bool deleted;
struct ext4_sb_info *sbi =
EXT4_SB(handle->h_transaction->t_journal->j_private);
spin_lock(&sbi->s_md_lock);
deleted = !list_empty(&jce->jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
return deleted;
}
int
......
此差异已折叠。
......@@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
needs_barrier = true;
jbd2_log_start_commit(journal, commit_tid);
ret = jbd2_log_wait_commit(journal, commit_tid);
ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
......
......@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
trace_ext4_load_inode_bitmap(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
submit_bh(READ, bh);
submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
put_bh(bh);
......@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ei = EXT4_I(inode);
sbi = EXT4_SB(sb);
/*
* Initalize owners and quota early so that we don't have to account
* for quota initialization worst case in standard inode creating
* transaction
*/
if (owner) {
inode->i_mode = mode;
i_uid_write(inode, owner[0]);
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = dir->i_gid;
} else
inode_init_owner(inode, dir, mode);
dquot_initialize(inode);
if (!goal)
goal = sbi->s_inode_goal;
......@@ -697,7 +714,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp)
goto fail;
goto out;
/*
* Check free inodes count before loading bitmap.
......@@ -711,7 +728,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (!inode_bitmap_bh)
goto fail;
goto out;
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
......@@ -733,13 +750,16 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
handle_type, nblocks);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto fail;
ext4_std_error(sb, err);
goto out;
}
}
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
ext4_lock_group(sb, group);
ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
ext4_unlock_group(sb, group);
......@@ -755,8 +775,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
got:
BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
......@@ -768,7 +790,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
brelse(block_bitmap_bh);
goto fail;
ext4_std_error(sb, err);
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
......@@ -787,14 +810,18 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_unlock_group(sb, group);
brelse(block_bitmap_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
}
BUFFER_TRACE(group_desc_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, group_desc_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
......@@ -840,8 +867,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
......@@ -851,16 +880,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
flex_group = ext4_flex_group(sbi, group);
atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
}
if (owner) {
inode->i_mode = mode;
i_uid_write(inode, owner[0]);
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = dir->i_gid;
} else
inode_init_owner(inode, dir, mode);
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
......@@ -889,7 +908,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
* twice.
*/
err = -EIO;
goto fail;
ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
inode->i_ino);
goto out;
}
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
......@@ -899,7 +920,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
......@@ -918,7 +938,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
dquot_initialize(inode);
err = dquot_alloc_inode(inode);
if (err)
goto fail_drop;
......@@ -952,24 +971,17 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_debug("allocating inode %lu\n", inode->i_ino);
trace_ext4_allocate_inode(inode, dir, mode);
goto really_out;
fail:
ext4_std_error(sb, err);
out:
iput(inode);
ret = ERR_PTR(err);
really_out:
brelse(inode_bitmap_bh);
return ret;
fail_free_drop:
dquot_free_inode(inode);
fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
clear_nlink(inode);
unlock_new_inode(inode);
out:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
iput(inode);
brelse(inode_bitmap_bh);
return ERR_PTR(err);
......
......@@ -291,131 +291,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
return count;
}
/**
* ext4_alloc_blocks: multiple allocate blocks needed for a branch
* @handle: handle for this transaction
* @inode: inode which needs allocated blocks
* @iblock: the logical block to start allocated at
* @goal: preferred physical block of allocation
* @indirect_blks: the number of blocks need to allocate for indirect
* blocks
* @blks: number of desired blocks
* @new_blocks: on return it will store the new block numbers for
* the indirect blocks(if needed) and the first direct block,
* @err: on return it will store the error code
*
* This function will return the number of blocks allocated as
* requested by the passed-in parameters.
*/
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, ext4_fsblk_t goal,
int indirect_blks, int blks,
ext4_fsblk_t new_blocks[4], int *err)
{
struct ext4_allocation_request ar;
int target, i;
unsigned long count = 0, blk_allocated = 0;
int index = 0;
ext4_fsblk_t current_block = 0;
int ret = 0;
/*
* Here we try to allocate the requested multiple blocks at once,
* on a best-effort basis.
* To build a branch, we should allocate blocks for
* the indirect blocks(if not allocated yet), and at least
* the first direct block of this branch. That's the
* minimum number of blocks need to allocate(required)
*/
/* first we try to allocate the indirect blocks */
target = indirect_blks;
while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
current_block = ext4_new_meta_blocks(handle, inode, goal,
0, &count, err);
if (*err)
goto failed_out;
if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
EXT4_ERROR_INODE(inode,
"current_block %llu + count %lu > %d!",
current_block, count,
EXT4_MAX_BLOCK_FILE_PHYS);
*err = -EIO;
goto failed_out;
}
target -= count;
/* allocate blocks for indirect blocks */
while (index < indirect_blks && count) {
new_blocks[index++] = current_block++;
count--;
}
if (count > 0) {
/*
* save the new block number
* for the first direct block
*/
new_blocks[index] = current_block;
WARN(1, KERN_INFO "%s returned more blocks than "
"requested\n", __func__);
break;
}
}
target = blks - count ;
blk_allocated = count;
if (!target)
goto allocated;
/* Now allocate data blocks */
memset(&ar, 0, sizeof(ar));
ar.inode = inode;
ar.goal = goal;
ar.len = target;
ar.logical = iblock;
if (S_ISREG(inode->i_mode))
/* enable in-core preallocation only for regular files */
ar.flags = EXT4_MB_HINT_DATA;
current_block = ext4_mb_new_blocks(handle, &ar, err);
if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
EXT4_ERROR_INODE(inode,
"current_block %llu + ar.len %d > %d!",
current_block, ar.len,
EXT4_MAX_BLOCK_FILE_PHYS);
*err = -EIO;
goto failed_out;
}
if (*err && (target == blks)) {
/*
* if the allocation failed and we didn't allocate
* any blocks before
*/
goto failed_out;
}
if (!*err) {
if (target == blks) {
/*
* save the new block number
* for the first direct block
*/
new_blocks[index] = current_block;
}
blk_allocated += ar.len;
}
allocated:
/* total number of blocks allocated for direct blocks */
ret = blk_allocated;
*err = 0;
return ret;
failed_out:
for (i = 0; i < index; i++)
ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
return ret;
}
/**
* ext4_alloc_branch - allocate and set up a chain of blocks.
* @handle: handle for this transaction
......@@ -448,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
int *blks, ext4_fsblk_t goal,
ext4_lblk_t *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
int i, n = 0;
int err = 0;
struct buffer_head *bh;
int num;
ext4_fsblk_t new_blocks[4];
ext4_fsblk_t current_block;
num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
*blks, new_blocks, &err);
if (err)
return err;
struct ext4_allocation_request ar;
struct buffer_head * bh;
ext4_fsblk_t b, new_blocks[4];
__le32 *p;
int i, j, err, len = 1;
branch[0].key = cpu_to_le32(new_blocks[0]);
/*
* metadata blocks and data blocks are allocated.
* Set up for the direct block allocation
*/
for (n = 1; n <= indirect_blks; n++) {
/*
* Get buffer_head for parent block, zero it out
* and set the pointer to new one, then send
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
memset(&ar, 0, sizeof(ar));
ar.inode = inode;
ar.len = *blks;
ar.logical = iblock;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
for (i = 0; i <= indirect_blks; i++) {
if (i == indirect_blks) {
ar.goal = goal;
new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
} else
goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
goal, 0, NULL, &err);
if (err) {
i--;
goto failed;
}
branch[i].key = cpu_to_le32(new_blocks[i]);
if (i == 0)
continue;
bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
if (unlikely(!bh)) {
err = -ENOMEM;
goto failed;
}
branch[n].bh = bh;
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
err = ext4_journal_get_create_access(handle, bh);
if (err) {
/* Don't brelse(bh) here; it's done in
* ext4_journal_forget() below */
unlock_buffer(bh);
goto failed;
}
memset(bh->b_data, 0, blocksize);
branch[n].p = (__le32 *) bh->b_data + offsets[n];
branch[n].key = cpu_to_le32(new_blocks[n]);
*branch[n].p = branch[n].key;
if (n == indirect_blks) {
current_block = new_blocks[n];
/*
* End of chain, update the last new metablock of
* the chain to point to the new allocated
* data blocks numbers
*/
for (i = 1; i < num; i++)
*(branch[n].p + i) = cpu_to_le32(++current_block);
}
memset(bh->b_data, 0, bh->b_size);
p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
b = new_blocks[i];
if (i == indirect_blks)
len = ar.len;
for (j = 0; j < len; j++)
*p++ = cpu_to_le32(b++);
BUFFER_TRACE(bh, "marking uptodate");
set_buffer_uptodate(bh);
unlock_buffer(bh);
......@@ -511,25 +385,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
if (err)
goto failed;
}
*blks = num;
return err;
*blks = ar.len;
return 0;
failed:
/* Allocation failed, free what we already allocated */
ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
for (i = 1; i <= n ; i++) {
/*
* branch[i].bh is newly allocated, so there is no
* need to revoke the block, which is why we don't
* need to set EXT4_FREE_BLOCKS_METADATA.
*/
ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
EXT4_FREE_BLOCKS_FORGET);
for (; i >= 0; i--) {
if (i != indirect_blks && branch[i].bh)
ext4_forget(handle, 1, inode, branch[i].bh,
branch[i].bh->b_blocknr);
ext4_free_blocks(handle, inode, NULL, new_blocks[i],
(i == indirect_blks) ? ar.len : 1, 0);
}
for (i = n+1; i < indirect_blks; i++)
ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
return err;
}
......@@ -941,26 +806,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* be able to restart the transaction at a conventient checkpoint to make
* sure we don't overflow the journal.
*
* start_transaction gets us a new handle for a truncate transaction,
* and extend_transaction tries to extend the existing one a bit. If
* Try to extend this transaction for the purposes of truncation. If
* extend fails, we need to propagate the failure up and restart the
* transaction in the top-level truncate loop. --sct
*/
static handle_t *start_transaction(struct inode *inode)
{
handle_t *result;
result = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
ext4_blocks_for_truncate(inode));
if (!IS_ERR(result))
return result;
ext4_std_error(inode->i_sb, PTR_ERR(result));
return result;
}
/*
* Try to extend this transaction for the purposes of truncation.
*
* Returns 0 if we managed to create more room. If we can't create more
* room, and the transaction must be restarted we return 1.
......@@ -1353,68 +1201,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
void ext4_ind_truncate(struct inode *inode)
void ext4_ind_truncate(handle_t *handle, struct inode *inode)
{
handle_t *handle;
struct ext4_inode_info *ei = EXT4_I(inode);
__le32 *i_data = ei->i_data;
int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
ext4_lblk_t offsets[4];
Indirect chain[4];
Indirect *partial;
__le32 nr = 0;
int n = 0;
ext4_lblk_t last_block, max_block;
loff_t page_len;
unsigned blocksize = inode->i_sb->s_blocksize;
int err;
handle = start_transaction(inode);
if (IS_ERR(handle))
return; /* AKPM: return what? */
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
if (inode->i_size % PAGE_CACHE_SIZE != 0) {
page_len = PAGE_CACHE_SIZE -
(inode->i_size & (PAGE_CACHE_SIZE - 1));
err = ext4_discard_partial_page_buffers(handle,
mapping, inode->i_size, page_len, 0);
if (err)
goto out_stop;
}
if (last_block != max_block) {
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
goto out_stop; /* error */
return;
}
/*
* OK. This truncate is going to happen. We add the inode to the
* orphan list, so that if this truncate spans multiple transactions,
* and we crash, we will resume the truncate when the filesystem
* recovers. It also marks the inode dirty, to catch the new size.
*
* Implication: the file must always be in a sane, consistent
* truncatable state while each transaction commits.
*/
if (ext4_orphan_add(handle, inode))
goto out_stop;
/*
* From here we block out all ext4_get_block() callers who want to
* modify the block allocation tree.
*/
down_write(&ei->i_data_sem);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
/*
......@@ -1431,7 +1241,7 @@ void ext4_ind_truncate(struct inode *inode)
* It is unnecessary to free any data blocks if last_block is
* equal to the indirect block limit.
*/
goto out_unlock;
return;
} else if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
......@@ -1491,31 +1301,6 @@ void ext4_ind_truncate(struct inode *inode)
case EXT4_TIND_BLOCK:
;
}
out_unlock:
up_write(&ei->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
/*
* In a multi-transaction truncate, we only make the final transaction
* synchronous
*/
if (IS_SYNC(inode))
ext4_handle_sync(handle);
out_stop:
/*
* If this was a simple ftruncate(), and the file will remain alive
* then we need to clear up the orphan record which we created above.
* However, if this was a real unlink then we were called by
* ext4_delete_inode(), and we allow that function to clean up the
* orphan info for us.
*/
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
trace_ext4_truncate_exit(inode);
}
static int free_hole_blocks(handle_t *handle, struct inode *inode,
......@@ -1569,8 +1354,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,
return ret;
}
static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t first, ext4_lblk_t stop)
int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t first, ext4_lblk_t stop)
{
int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
int level, ret = 0;
......@@ -1604,157 +1389,3 @@ static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
return ret;
}
int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
handle_t *handle = NULL;
loff_t first_page, last_page, page_len;
loff_t first_page_offset, last_page_offset;
int err = 0;
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
*/
if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
err = filemap_write_and_wait_range(mapping,
offset, offset + length - 1);
if (err)
return err;
}
mutex_lock(&inode->i_mutex);
/* It's not possible punch hole on append only file */
if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
err = -EPERM;
goto out_mutex;
}
if (IS_SWAPFILE(inode)) {
err = -ETXTBSY;
goto out_mutex;
}
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
goto out_mutex;
/*
* If the hole extents beyond i_size, set the hole
* to end after the page that contains i_size
*/
if (offset + length > inode->i_size) {
length = inode->i_size +
PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
offset;
}
first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
last_page = (offset + length) >> PAGE_CACHE_SHIFT;
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
last_page_offset - 1);
}
/* Wait all existing dio works, newcomers will block on i_mutex */
inode_dio_wait(inode);
handle = start_transaction(inode);
if (IS_ERR(handle))
goto out_mutex;
/*
* Now we need to zero out the non-page-aligned data in the
* pages at the start and tail of the hole, and unmap the buffer
* heads for the block aligned regions of the page that were
* completely zerod.
*/
if (first_page > last_page) {
/*
* If the file space being truncated is contained within a page
* just zero out and unmap the middle of that page
*/
err = ext4_discard_partial_page_buffers(handle,
mapping, offset, length, 0);
if (err)
goto out;
} else {
/*
* Zero out and unmap the paritial page that contains
* the start of the hole
*/
page_len = first_page_offset - offset;
if (page_len > 0) {
err = ext4_discard_partial_page_buffers(handle, mapping,
offset, page_len, 0);
if (err)
goto out;
}
/*
* Zero out and unmap the partial page that contains
* the end of the hole
*/
page_len = offset + length - last_page_offset;
if (page_len > 0) {
err = ext4_discard_partial_page_buffers(handle, mapping,
last_page_offset, page_len, 0);
if (err)
goto out;
}
}
/*
* If i_size contained in the last page, we need to
* unmap and zero the paritial page after i_size
*/
if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
inode->i_size % PAGE_CACHE_SIZE != 0) {
page_len = PAGE_CACHE_SIZE -
(inode->i_size & (PAGE_CACHE_SIZE - 1));
if (page_len > 0) {
err = ext4_discard_partial_page_buffers(handle,
mapping, inode->i_size, page_len, 0);
if (err)
goto out;
}
}
first_block = (offset + sb->s_blocksize - 1) >>
EXT4_BLOCK_SIZE_BITS(sb);
stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
if (first_block >= stop_block)
goto out;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
err = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);
ext4_discard_preallocations(inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
up_write(&EXT4_I(inode)->i_data_sem);
out:
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_mutex:
mutex_unlock(&inode->i_mutex);
return err;
}
......@@ -19,7 +19,8 @@
#define EXT4_XATTR_SYSTEM_DATA "data"
#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
#define EXT4_INLINE_DOTDOT_SIZE 4
#define EXT4_INLINE_DOTDOT_OFFSET 2
#define EXT4_INLINE_DOTDOT_SIZE 4
int ext4_get_inline_size(struct inode *inode)
{
......@@ -1289,6 +1290,120 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
return ret;
}
/*
* This function fills a red-black tree with information from an
* inlined dir. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
*/
int htree_inlinedir_to_tree(struct file *dir_file,
struct inode *dir, ext4_lblk_t block,
struct dx_hash_info *hinfo,
__u32 start_hash, __u32 start_minor_hash,
int *has_inline_data)
{
int err = 0, count = 0;
unsigned int parent_ino;
int pos;
struct ext4_dir_entry_2 *de;
struct inode *inode = file_inode(dir_file);
int ret, inline_size = 0;
struct ext4_iloc iloc;
void *dir_buf = NULL;
struct ext4_dir_entry_2 fake;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
up_read(&EXT4_I(inode)->xattr_sem);
*has_inline_data = 0;
goto out;
}
inline_size = ext4_get_inline_size(inode);
dir_buf = kmalloc(inline_size, GFP_NOFS);
if (!dir_buf) {
ret = -ENOMEM;
up_read(&EXT4_I(inode)->xattr_sem);
goto out;
}
ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
up_read(&EXT4_I(inode)->xattr_sem);
if (ret < 0)
goto out;
pos = 0;
parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
while (pos < inline_size) {
/*
* As inlined dir doesn't store any information about '.' and
* only the inode number of '..' is stored, we have to handle
* them differently.
*/
if (pos == 0) {
fake.inode = cpu_to_le32(inode->i_ino);
fake.name_len = 1;
strcpy(fake.name, ".");
fake.rec_len = ext4_rec_len_to_disk(
EXT4_DIR_REC_LEN(fake.name_len),
inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_OFFSET;
} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
fake.inode = cpu_to_le32(parent_ino);
fake.name_len = 2;
strcpy(fake.name, "..");
fake.rec_len = ext4_rec_len_to_disk(
EXT4_DIR_REC_LEN(fake.name_len),
inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_SIZE;
} else {
de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
if (ext4_check_dir_entry(inode, dir_file, de,
iloc.bh, dir_buf,
inline_size, pos)) {
ret = count;
goto out;
}
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
continue;
if (de->inode == 0)
continue;
err = ext4_htree_store_dirent(dir_file,
hinfo->hash, hinfo->minor_hash, de);
if (err) {
count = err;
goto out;
}
count++;
}
ret = count;
out:
kfree(dir_buf);
brelse(iloc.bh);
return ret;
}
/*
* So this function is called when the volume is mkfsed with
* dir_index disabled. In order to keep f_pos persistent
* after we convert from an inlined dir to a blocked based,
* we just pretend that we are a normal dir and return the
* offset as if '.' and '..' really take place.
*
*/
int ext4_read_inline_dir(struct file *filp,
void *dirent, filldir_t filldir,
int *has_inline_data)
......@@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp,
int ret, inline_size = 0;
struct ext4_iloc iloc;
void *dir_buf = NULL;
int dotdot_offset, dotdot_size, extra_offset, extra_size;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
......@@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp,
sb = inode->i_sb;
stored = 0;
parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
offset = filp->f_pos;
while (!error && !stored && filp->f_pos < inode->i_size) {
/*
* dotdot_offset and dotdot_size is the real offset and
* size for ".." and "." if the dir is block based while
* the real size for them are only EXT4_INLINE_DOTDOT_SIZE.
* So we will use extra_offset and extra_size to indicate them
* during the inline dir iteration.
*/
dotdot_offset = EXT4_DIR_REC_LEN(1);
dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
extra_size = extra_offset + inline_size;
while (!error && !stored && filp->f_pos < extra_size) {
revalidate:
/*
* If the version has changed since the last call to
......@@ -1340,15 +1469,23 @@ int ext4_read_inline_dir(struct file *filp,
* dir to make sure.
*/
if (filp->f_version != inode->i_version) {
for (i = 0;
i < inode->i_size && i < offset;) {
for (i = 0; i < extra_size && i < offset;) {
/*
* "." is with offset 0 and
* ".." is dotdot_offset.
*/
if (!i) {
/* skip "." and ".." if needed. */
i += EXT4_INLINE_DOTDOT_SIZE;
i = dotdot_offset;
continue;
} else if (i == dotdot_offset) {
i = dotdot_size;
continue;
}
/* for other entry, the real offset in
* the buf has to be tuned accordingly.
*/
de = (struct ext4_dir_entry_2 *)
(dir_buf + i);
(dir_buf + i - extra_offset);
/* It's too expensive to do a full
* dirent test each time round this
* loop, but we do have to test at
......@@ -1356,43 +1493,47 @@ int ext4_read_inline_dir(struct file *filp,
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len,
inline_size) < EXT4_DIR_REC_LEN(1))
extra_size) < EXT4_DIR_REC_LEN(1))
break;
i += ext4_rec_len_from_disk(de->rec_len,
inline_size);
extra_size);
}
offset = i;
filp->f_pos = offset;
filp->f_version = inode->i_version;
}
while (!error && filp->f_pos < inode->i_size) {
while (!error && filp->f_pos < extra_size) {
if (filp->f_pos == 0) {
error = filldir(dirent, ".", 1, 0, inode->i_ino,
DT_DIR);
if (error)
break;
stored++;
filp->f_pos = dotdot_offset;
continue;
}
error = filldir(dirent, "..", 2, 0, parent_ino,
DT_DIR);
if (filp->f_pos == dotdot_offset) {
error = filldir(dirent, "..", 2,
dotdot_offset,
parent_ino, DT_DIR);
if (error)
break;
stored++;
filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
filp->f_pos = dotdot_size;
continue;
}
de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
de = (struct ext4_dir_entry_2 *)
(dir_buf + filp->f_pos - extra_offset);
if (ext4_check_dir_entry(inode, filp, de,
iloc.bh, dir_buf,
inline_size, offset)) {
extra_size, filp->f_pos)) {
ret = stored;
goto out;
}
offset += ext4_rec_len_from_disk(de->rec_len,
inline_size);
if (le32_to_cpu(de->inode)) {
/* We might block in the next section
* if the data destination is
......@@ -1415,9 +1556,8 @@ int ext4_read_inline_dir(struct file *filp,
stored++;
}
filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
inline_size);
extra_size);
}
offset = 0;
}
out:
kfree(dir_buf);
......
此差异已折叠。
......@@ -17,9 +17,201 @@
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"
#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
/**
* Swap memory between @a and @b for @len bytes.
*
* @a: pointer to first memory area
* @b: pointer to second memory area
* @len: number of bytes to swap
*
*/
static void memswap(void *a, void *b, size_t len)
{
unsigned char *ap, *bp;
unsigned char tmp;
ap = (unsigned char *)a;
bp = (unsigned char *)b;
while (len-- > 0) {
tmp = *ap;
*ap = *bp;
*bp = tmp;
ap++;
bp++;
}
}
/**
* Swap i_data and associated attributes between @inode1 and @inode2.
* This function is used for the primary swap between inode1 and inode2
* and also to revert this primary swap in case of errors.
*
* Therefore you have to make sure, that calling this method twice
* will revert all changes.
*
* @inode1: pointer to first inode
* @inode2: pointer to second inode
*/
static void swap_inode_data(struct inode *inode1, struct inode *inode2)
{
loff_t isize;
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
memswap(&inode1->i_version, &inode2->i_version,
sizeof(inode1->i_version));
memswap(&inode1->i_blocks, &inode2->i_blocks,
sizeof(inode1->i_blocks));
memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
isize = i_size_read(inode1);
i_size_write(inode1, i_size_read(inode2));
i_size_write(inode2, isize);
}
/**
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
* important fields of the inodes.
*
* @sb: the super block of the filesystem
* @inode: the inode to swap with EXT4_BOOT_LOADER_INO
*
*/
static long swap_inode_boot_loader(struct super_block *sb,
struct inode *inode)
{
handle_t *handle;
int err;
struct inode *inode_bl;
struct ext4_inode_info *ei;
struct ext4_inode_info *ei_bl;
struct ext4_sb_info *sbi;
if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
err = -EINVAL;
goto swap_boot_out;
}
if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
err = -EPERM;
goto swap_boot_out;
}
sbi = EXT4_SB(sb);
ei = EXT4_I(inode);
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
if (IS_ERR(inode_bl)) {
err = PTR_ERR(inode_bl);
goto swap_boot_out;
}
ei_bl = EXT4_I(inode_bl);
filemap_flush(inode->i_mapping);
filemap_flush(inode_bl->i_mapping);
/* Protect orig inodes against a truncate and make sure,
* that only 1 swap_inode_boot_loader is running. */
ext4_inode_double_lock(inode, inode_bl);
truncate_inode_pages(&inode->i_data, 0);
truncate_inode_pages(&inode_bl->i_data, 0);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(inode);
ext4_inode_block_unlocked_dio(inode_bl);
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
goto swap_boot_out;
}
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
if (inode_bl->i_nlink == 0) {
/* this inode has never been used as a BOOT_LOADER */
set_nlink(inode_bl, 1);
i_uid_write(inode_bl, 0);
i_gid_write(inode_bl, 0);
inode_bl->i_flags = 0;
ei_bl->i_flags = 0;
inode_bl->i_version = 1;
i_size_write(inode_bl, 0);
inode_bl->i_mode = S_IFREG;
if (EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_EXTENTS)) {
ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode_bl);
} else
memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
}
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
inode_bl->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
ext4_discard_preallocations(inode);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
ext4_warning(inode->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
} else {
err = ext4_mark_inode_dirty(handle, inode_bl);
if (err < 0) {
ext4_warning(inode_bl->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode_bl->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
}
}
ext4_journal_stop(handle);
ext4_double_up_write_data_sem(inode, inode_bl);
ext4_inode_resume_unlocked_dio(inode);
ext4_inode_resume_unlocked_dio(inode_bl);
ext4_inode_double_unlock(inode, inode_bl);
iput(inode_bl);
swap_boot_out:
return err;
}
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
......@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
if (oldflags & EXT4_EXTENTS_FL) {
/* We don't support clearning extent flags */
if (!(flags & EXT4_EXTENTS_FL)) {
err = -EOPNOTSUPP;
goto flags_out;
}
} else if (flags & EXT4_EXTENTS_FL) {
/* migrate the file */
if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
flags &= ~EXT4_EXTENTS_FL;
}
if (flags & EXT4_EOFBLOCKS_FL) {
/* we don't support adding EOFBLOCKS flag */
......@@ -137,8 +320,13 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = ext4_change_inode_journal_flag(inode, jflag);
if (err)
goto flags_out;
if (migrate)
err = ext4_ext_migrate(inode);
if (migrate) {
if (flags & EXT4_EXTENTS_FL)
err = ext4_ext_migrate(inode);
else
err = ext4_ind_migrate(inode);
}
flags_out:
mutex_unlock(&inode->i_mutex);
mnt_drop_write_file(filp);
......@@ -357,9 +545,13 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return err;
}
case EXT4_IOC_SWAP_BOOT:
if (!(filp->f_mode & FMODE_WRITE))
return -EBADF;
return swap_inode_boot_loader(sb, inode);
case EXT4_IOC_RESIZE_FS: {
ext4_fsblk_t n_blocks_count;
struct super_block *sb = inode->i_sb;
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
......
......@@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
ext4_clear_bit(bit, addr);
}
static inline int mb_test_and_clear_bit(int bit, void *addr)
{
addr = mb_correct_addr_and_bit(&bit, addr);
return ext4_test_and_clear_bit(bit, addr);
}
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
int fix = 0, ret, tmpmax;
......@@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}
static void mb_regenerate_buddy(struct ext4_buddy *e4b)
{
int count;
int order = 1;
void *buddy;
while ((buddy = mb_find_buddy(e4b, order++, &count))) {
ext4_set_bits(buddy, 0, count);
}
e4b->bd_info->bb_fragments = 0;
memset(e4b->bd_info->bb_counters, 0,
sizeof(*e4b->bd_info->bb_counters) *
(e4b->bd_sb->s_blocksize_bits + 2));
ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
e4b->bd_bitmap, e4b->bd_group);
}
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
......@@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
int group;
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
......@@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
struct page *page;
int ret = 0;
might_sleep();
mb_debug(1, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
/*
......@@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
might_sleep();
mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
......@@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len)
}
}
/* clear bits in given range
* will return first found zero bit if any, -1 otherwise
*/
static int mb_test_and_clear_bits(void *bm, int cur, int len)
{
__u32 *addr;
int zero_bit = -1;
len = cur + len;
while (cur < len) {
if ((cur & 31) == 0 && (len - cur) >= 32) {
/* fast path: clear whole word at once */
addr = bm + (cur >> 3);
if (*addr != (__u32)(-1) && zero_bit == -1)
zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
*addr = 0;
cur += 32;
continue;
}
if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
zero_bit = cur;
cur++;
}
return zero_bit;
}
void ext4_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
......@@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len)
}
}
/*
* _________________________________________________________________ */
static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
{
if (mb_test_bit(*bit + side, bitmap)) {
mb_clear_bit(*bit, bitmap);
(*bit) -= side;
return 1;
}
else {
(*bit) += side;
mb_set_bit(*bit, bitmap);
return -1;
}
}
static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
{
int max;
int order = 1;
void *buddy = mb_find_buddy(e4b, order, &max);
while (buddy) {
void *buddy2;
/* Bits in range [first; last] are known to be set since
* corresponding blocks were allocated. Bits in range
* (first; last) will stay set because they form buddies on
* upper layer. We just deal with borders if they don't
* align with upper layer and then go up.
* Releasing entire group is all about clearing
* single bit of highest order buddy.
*/
/* Example:
* ---------------------------------
* | 1 | 1 | 1 | 1 |
* ---------------------------------
* | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
* ---------------------------------
* 0 1 2 3 4 5 6 7
* \_____________________/
*
* Neither [1] nor [6] is aligned to above layer.
* Left neighbour [0] is free, so mark it busy,
* decrease bb_counters and extend range to
* [0; 6]
* Right neighbour [7] is busy. It can't be coaleasced with [6], so
* mark [6] free, increase bb_counters and shrink range to
* [0; 5].
* Then shift range to [0; 2], go up and do the same.
*/
if (first & 1)
e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
if (!(last & 1))
e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
if (first > last)
break;
order++;
if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
mb_clear_bits(buddy, first, last - first + 1);
e4b->bd_info->bb_counters[order - 1] += last - first + 1;
break;
}
first >>= 1;
last >>= 1;
buddy = buddy2;
}
}
static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
int first, int count)
{
int block = 0;
int max = 0;
int order;
void *buddy;
void *buddy2;
int left_is_free = 0;
int right_is_free = 0;
int block;
int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
BUG_ON(first + count > (sb->s_blocksize << 3));
BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
......@@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
/* let's maintain fragments counter */
/* access memory sequentially: check left neighbour,
* clear range and then check right neighbour
*/
if (first != 0)
block = !mb_test_bit(first - 1, e4b->bd_bitmap);
if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
max = !mb_test_bit(first + count, e4b->bd_bitmap);
if (block && max)
e4b->bd_info->bb_fragments--;
else if (!block && !max)
e4b->bd_info->bb_fragments++;
left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
/* let's maintain buddy itself */
while (count-- > 0) {
block = first++;
order = 0;
if (unlikely(block != -1)) {
ext4_fsblk_t blocknr;
if (!mb_test_bit(block, e4b->bd_bitmap)) {
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(EXT4_SB(sb), block);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
"freeing already freed block "
"(bit %u)", block);
}
mb_clear_bit(block, e4b->bd_bitmap);
e4b->bd_info->bb_counters[order]++;
/* start of the buddy */
buddy = mb_find_buddy(e4b, order, &max);
do {
block &= ~1UL;
if (mb_test_bit(block, buddy) ||
mb_test_bit(block + 1, buddy))
break;
/* both the buddies are free, try to coalesce them */
buddy2 = mb_find_buddy(e4b, order + 1, &max);
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(EXT4_SB(sb), block);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
"freeing already freed block "
"(bit %u)", block);
mb_regenerate_buddy(e4b);
goto done;
}
if (!buddy2)
break;
/* let's maintain fragments counter */
if (left_is_free && right_is_free)
e4b->bd_info->bb_fragments--;
else if (!left_is_free && !right_is_free)
e4b->bd_info->bb_fragments++;
if (order > 0) {
/* for special purposes, we don't set
* free bits in bitmap */
mb_set_bit(block, buddy);
mb_set_bit(block + 1, buddy);
}
e4b->bd_info->bb_counters[order]--;
e4b->bd_info->bb_counters[order]--;
/* buddy[0] == bd_bitmap is a special case, so handle
* it right away and let mb_buddy_mark_free stay free of
* zero order checks.
* Check if neighbours are to be coaleasced,
* adjust bitmap bb_counters and borders appropriately.
*/
if (first & 1) {
first += !left_is_free;
e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
}
if (!(last & 1)) {
last -= !right_is_free;
e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
}
block = block >> 1;
order++;
e4b->bd_info->bb_counters[order]++;
if (first <= last)
mb_buddy_mark_free(e4b, first >> 1, last >> 1);
mb_clear_bit(block, buddy2);
buddy = buddy2;
} while (1);
}
done:
mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
......@@ -3342,7 +3453,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
if (pa->pa_type == MB_GROUP_PA)
grp_blk--;
ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
grp = ext4_get_group_number(sb, grp_blk);
/*
* possible race:
......@@ -3807,7 +3918,7 @@ void ext4_discard_preallocations(struct inode *inode)
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
group = ext4_get_group_number(sb, pa->pa_pstart);
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
......@@ -4069,7 +4180,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
group = ext4_get_group_number(sb, pa->pa_pstart);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, "Error loading buddy information for %u",
group);
......@@ -4217,6 +4328,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
......@@ -4420,11 +4532,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
node = rb_prev(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, efd_node);
if (can_merge(entry, new_entry)) {
if (can_merge(entry, new_entry) &&
ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
new_entry->efd_start_cluster = entry->efd_start_cluster;
new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
ext4_journal_callback_del(handle, &entry->efd_jce);
kmem_cache_free(ext4_free_data_cachep, entry);
}
}
......@@ -4432,10 +4544,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
node = rb_next(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, efd_node);
if (can_merge(new_entry, entry)) {
if (can_merge(new_entry, entry) &&
ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
ext4_journal_callback_del(handle, &entry->efd_jce);
kmem_cache_free(ext4_free_data_cachep, entry);
}
}
......@@ -4470,6 +4582,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
int err = 0;
int ret;
might_sleep();
if (bh) {
if (block)
BUG_ON(block != bh->b_blocknr);
......
......@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
return retval;
}
return retval;
}
int ext4_ext_migrate(struct inode *inode)
......@@ -606,3 +605,64 @@ int ext4_ext_migrate(struct inode *inode)
return retval;
}
/*
* Migrate a simple extent-based inode to use the i_blocks[] array
*/
int ext4_ind_migrate(struct inode *inode)
{
struct ext4_extent_header *eh;
struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_extent *ex;
unsigned int i, len;
ext4_fsblk_t blk;
handle_t *handle;
int ret;
if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_INCOMPAT_EXTENTS) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC))
return -EOPNOTSUPP;
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_check_inode(inode);
if (ret)
goto errout;
eh = ext_inode_hdr(inode);
ex = EXT_FIRST_EXTENT(eh);
if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
ret = -EOPNOTSUPP;
goto errout;
}
if (eh->eh_entries == 0)
blk = len = 0;
else {
len = le16_to_cpu(ex->ee_len);
blk = ext4_ext_pblock(ex);
if (len > EXT4_NDIR_BLOCKS) {
ret = -EOPNOTSUPP;
goto errout;
}
}
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
memset(ei->i_data, 0, sizeof(ei->i_data));
for (i=0; i < len; i++)
ei->i_data[i] = cpu_to_le32(blk++);
ext4_mark_inode_dirty(handle, inode);
errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
return ret;
}
......@@ -7,7 +7,7 @@
#include "ext4.h"
/* Checksumming functions */
static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct mmp_struct, mmp_checksum);
......@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
submit_bh(WRITE_SYNC, bh);
submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
......@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
submit_bh(READ_SYNC, *bh);
submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
brelse(*bh);
......
......@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
* double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
* ext4_double_down_write_data_sem - Acquire two inodes' write lock
* of i_data_sem
*
* Acquire write lock of i_data_sem of the two inodes
*/
static void
double_down_write_data_sem(struct inode *first, struct inode *second)
void
ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
if (first < second) {
down_write(&EXT4_I(first)->i_data_sem);
......@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
}
/**
* double_up_write_data_sem - Release two inodes' write lock of i_data_sem
* ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
* Release write lock of i_data_sem of two inodes (orig and donor).
*/
static void
double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
void
ext4_double_up_write_data_sem(struct inode *orig_inode,
struct inode *donor_inode)
{
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
......@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
end_ext, eh, range_to_move);
if (depth) {
ret = ext4_handle_dirty_metadata(handle, orig_inode,
orig_path->p_bh);
if (ret)
return ret;
} else {
ret = ext4_mark_inode_dirty(handle, orig_inode);
if (ret < 0)
return ret;
}
return 0;
return ext4_ext_dirty(handle, orig_inode, orig_path);
}
/**
......@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
donor_off += dext_alen;
orig_off += dext_alen;
BUG_ON(replaced_count > count);
/* Already moved the expected blocks */
if (replaced_count >= count)
break;
......@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
page_cache_release(page[0]);
return -ENOMEM;
}
/*
* grab_cache_page_write_begin() may not wait on page's writeback if
* BDI not demand that. But it is reasonable to be very conservative
* here and explicitly wait on page's writeback
*/
wait_on_page_writeback(page[0]);
wait_on_page_writeback(page[1]);
if (inode1 > inode2) {
struct page *tmp;
tmp = page[0];
......@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
int err = 0;
err = ext4_get_block(inode, block, bh, 0);
if (err) {
SetPageError(page);
......@@ -976,7 +973,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* necessary, just swap data blocks between orig and donor.
*/
if (uninit) {
double_down_write_data_sem(orig_inode, donor_inode);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
* fallback to data copying */
uninit = mext_check_coverage(orig_inode, orig_blk_offset,
......@@ -990,7 +987,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
goto drop_data_sem;
if (!uninit) {
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
if ((page_has_private(pagep[0]) &&
......@@ -1004,7 +1001,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
donor_inode, orig_blk_offset,
block_len_in_page, err);
drop_data_sem:
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto unlock_pages;
}
data_copy:
......@@ -1033,7 +1030,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
*err = __block_write_begin(pagep[0], from, from + replaced_size,
*err = __block_write_begin(pagep[0], from, replaced_size,
ext4_get_block);
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
......@@ -1065,11 +1062,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* Extents are swapped already, but we are not able to copy data.
* Try to swap extents to it's original places
*/
double_down_write_data_sem(orig_inode, donor_inode);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
orig_blk_offset,
block_len_in_page, &err2);
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (replaced_count != block_len_in_page) {
EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
"Unable to copy data block,"
......@@ -1209,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode,
}
/**
* mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
* ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
*
* @inode1: the inode structure
* @inode2: the inode structure
*
* Lock two inodes' i_mutex
*/
static void
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
void
ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
BUG_ON(inode1 == inode2);
if (inode1 < inode2) {
......@@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
}
/**
* mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
* ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
*
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
*/
static void
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
void
ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
mutex_unlock(&inode1->i_mutex);
mutex_unlock(&inode2->i_mutex);
......@@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
return -EINVAL;
}
/* Protect orig and donor inodes against a truncate */
mext_inode_double_lock(orig_inode, donor_inode);
ext4_inode_double_lock(orig_inode, donor_inode);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(orig_inode);
......@@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
inode_dio_wait(donor_inode);
/* Protect extent tree against block allocations via delalloc */
double_down_write_data_sem(orig_inode, donor_inode);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len);
......@@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
* b. racing with ->readpage, ->write_begin, and ext4_get_block
* in move_extent_per_page
*/
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
while (orig_page_offset <= seq_end_page) {
......@@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
block_len_in_page = rest_blocks;
}
double_down_write_data_sem(orig_inode, donor_inode);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
if (ret < 0)
break;
......@@ -1538,10 +1535,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
ext4_inode_resume_unlocked_dio(orig_inode);
ext4_inode_resume_unlocked_dio(donor_inode);
mext_inode_double_unlock(orig_inode, donor_inode);
ext4_inode_double_unlock(orig_inode, donor_inode);
return ret;
}
......@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum, old_csum;
__u32 csum;
__le32 save_csum;
int size;
size = count_offset + (count * sizeof(struct dx_entry));
old_csum = t->dt_checksum;
save_csum = t->dt_checksum;
t->dt_checksum = 0;
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
t->dt_checksum = old_csum;
t->dt_checksum = save_csum;
return cpu_to_le32(csum);
}
......@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
hinfo.hash_version +=
EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
count = htree_inlinedir_to_tree(dir_file, dir, 0,
&hinfo, start_hash,
start_minor_hash,
&has_inline_data);
if (has_inline_data) {
*next_hash = ~0;
return count;
}
}
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash);
*next_hash = ~0;
......@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
}
#define S_SHIFT 12
static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
[S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
[S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
[S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
[S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
[S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
};
static inline void ext4_set_de_type(struct super_block *sb,
struct ext4_dir_entry_2 *de,
umode_t mode) {
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
/*
* Move count entries from end of map between two memory locations.
* Returns pointer to last entry moved.
......@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits);
......@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits);
......@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
&dentry->d_name,
......@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir,
* quota blocks, sb is already counted in previous macros).
*/
credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
}
retry:
inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
......
......@@ -29,25 +29,19 @@
#include "xattr.h"
#include "acl.h"
static struct kmem_cache *io_page_cachep, *io_end_cachep;
static struct kmem_cache *io_end_cachep;
int __init ext4_init_pageio(void)
{
io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
if (io_page_cachep == NULL)
return -ENOMEM;
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
if (io_end_cachep == NULL) {
kmem_cache_destroy(io_page_cachep);
if (io_end_cachep == NULL)
return -ENOMEM;
}
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
kmem_cache_destroy(io_page_cachep);
}
/*
......@@ -67,29 +61,28 @@ void ext4_ioend_shutdown(struct inode *inode)
cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
}
static void put_io_page(struct ext4_io_page *io_page)
static void ext4_release_io_end(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_page->p_count)) {
end_page_writeback(io_page->p_page);
put_page(io_page->p_page);
kmem_cache_free(io_page_cachep, io_page);
}
BUG_ON(!list_empty(&io_end->list));
BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
wake_up_all(ext4_ioend_wq(io_end->inode));
if (io_end->flag & EXT4_IO_END_DIRECT)
inode_dio_done(io_end->inode);
if (io_end->iocb)
aio_complete(io_end->iocb, io_end->result, 0);
kmem_cache_free(io_end_cachep, io_end);
}
void ext4_free_io_end(ext4_io_end_t *io)
static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
int i;
BUG_ON(!io);
BUG_ON(!list_empty(&io->list));
BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
struct inode *inode = io_end->inode;
for (i = 0; i < io->num_io_pages; i++)
put_io_page(io->pages[i]);
io->num_io_pages = 0;
if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
wake_up_all(ext4_ioend_wq(io->inode));
kmem_cache_free(io_end_cachep, io);
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
}
/* check a range of space and convert unwritten extents to written. */
......@@ -112,13 +105,8 @@ static int ext4_end_io(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
if (io->flag & EXT4_IO_END_DIRECT)
inode_dio_done(inode);
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
ext4_clear_io_unwritten_flag(io);
ext4_release_io_end(io);
return ret;
}
......@@ -149,7 +137,7 @@ static void dump_completed_IO(struct inode *inode)
}
/* Add the io_end to per-inode completed end_io list. */
void ext4_add_complete_io(ext4_io_end_t *io_end)
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
struct ext4_inode_info *ei = EXT4_I(io_end->inode);
struct workqueue_struct *wq;
......@@ -186,8 +174,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
io->flag &= ~EXT4_IO_END_UNWRITTEN;
ext4_free_io_end(io);
}
return ret;
}
......@@ -219,10 +205,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
INIT_LIST_HEAD(&io->list);
atomic_set(&io->count, 1);
}
return io;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_end->count)) {
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
ext4_release_io_end(io_end);
return;
}
ext4_add_complete_io(io_end);
}
}
int ext4_put_io_end(ext4_io_end_t *io_end)
{
int err = 0;
if (atomic_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
err = ext4_convert_unwritten_extents(io_end->inode,
io_end->offset, io_end->size);
ext4_clear_io_unwritten_flag(io_end);
}
ext4_release_io_end(io_end);
}
return err;
}
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
atomic_inc(&io_end->count);
return io_end;
}
/*
* Print an buffer I/O error compatible with the fs/buffer.c. This
* provides compatibility with dmesg scrapers that look for a specific
......@@ -243,45 +262,56 @@ static void ext4_end_bio(struct bio *bio, int error)
ext4_io_end_t *io_end = bio->bi_private;
struct inode *inode;
int i;
int blocksize;
sector_t bi_sector = bio->bi_sector;
BUG_ON(!io_end);
inode = io_end->inode;
blocksize = 1 << inode->i_blkbits;
bio->bi_private = NULL;
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
bio_put(bio);
for (i = 0; i < io_end->num_io_pages; i++) {
struct page *page = io_end->pages[i]->p_page;
for (i = 0; i < bio->bi_vcnt; i++) {
struct bio_vec *bvec = &bio->bi_io_vec[i];
struct page *page = bvec->bv_page;
struct buffer_head *bh, *head;
loff_t offset;
loff_t io_end_offset;
unsigned bio_start = bvec->bv_offset;
unsigned bio_end = bio_start + bvec->bv_len;
unsigned under_io = 0;
unsigned long flags;
if (!page)
continue;
if (error) {
SetPageError(page);
set_bit(AS_EIO, &page->mapping->flags);
head = page_buffers(page);
BUG_ON(!head);
io_end_offset = io_end->offset + io_end->size;
offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
bh = head;
do {
if ((offset >= io_end->offset) &&
(offset+bh->b_size <= io_end_offset))
buffer_io_error(bh);
offset += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
}
put_io_page(io_end->pages[i]);
bh = head = page_buffers(page);
/*
* We check all buffers in the page under BH_Uptodate_Lock
* to avoid races with other end io clearing async_write flags
*/
local_irq_save(flags);
bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
do {
if (bh_offset(bh) < bio_start ||
bh_offset(bh) + blocksize > bio_end) {
if (buffer_async_write(bh))
under_io++;
continue;
}
clear_buffer_async_write(bh);
if (error)
buffer_io_error(bh);
} while ((bh = bh->b_this_page) != head);
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
if (!under_io)
end_page_writeback(page);
}
io_end->num_io_pages = 0;
inode = io_end->inode;
bio_put(bio);
if (error) {
io_end->flag |= EXT4_IO_END_ERROR;
......@@ -294,12 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
bi_sector >> (inode->i_blkbits - 9));
}
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
ext4_free_io_end(io_end);
return;
}
ext4_add_complete_io(io_end);
ext4_put_io_end_defer(io_end);
}
void ext4_io_submit(struct ext4_io_submit *io)
......@@ -313,76 +338,59 @@ void ext4_io_submit(struct ext4_io_submit *io)
bio_put(io->io_bio);
}
io->io_bio = NULL;
io->io_op = 0;
}
void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc)
{
io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
io->io_bio = NULL;
io->io_end = NULL;
}
static int io_submit_init(struct ext4_io_submit *io,
struct inode *inode,
struct writeback_control *wbc,
struct buffer_head *bh)
static int io_submit_init_bio(struct ext4_io_submit *io,
struct buffer_head *bh)
{
ext4_io_end_t *io_end;
struct page *page = bh->b_page;
int nvecs = bio_get_nr_vecs(bh->b_bdev);
struct bio *bio;
io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_end)
return -ENOMEM;
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio;
io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
bio->bi_private = ext4_get_io_end(io->io_end);
if (!io->io_end->size)
io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
+ bh_offset(bh);
io->io_bio = bio;
io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
io->io_next_block = bh->b_blocknr;
return 0;
}
static int io_submit_add_bh(struct ext4_io_submit *io,
struct ext4_io_page *io_page,
struct inode *inode,
struct writeback_control *wbc,
struct buffer_head *bh)
{
ext4_io_end_t *io_end;
int ret;
if (buffer_new(bh)) {
clear_buffer_new(bh);
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
ret = io_submit_init(io, inode, wbc, bh);
ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
}
io_end = io->io_end;
if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
(io_end->pages[io_end->num_io_pages-1] != io_page))
goto submit_and_retry;
if (buffer_uninit(bh))
ext4_set_io_unwritten_flag(inode, io_end);
io->io_end->size += bh->b_size;
io->io_next_block++;
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
if ((io_end->num_io_pages == 0) ||
(io_end->pages[io_end->num_io_pages-1] != io_page)) {
io_end->pages[io_end->num_io_pages++] = io_page;
atomic_inc(&io_page->p_count);
}
io_end = io->io_end;
if (test_clear_buffer_uninit(bh))
ext4_set_io_unwritten_flag(inode, io_end);
io_end->size += bh->b_size;
io->io_next_block++;
return 0;
}
......@@ -392,33 +400,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
unsigned block_start, block_end, blocksize;
struct ext4_io_page *io_page;
unsigned block_start, blocksize;
struct buffer_head *bh, *head;
int ret = 0;
int nr_submitted = 0;
blocksize = 1 << inode->i_blkbits;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
if (!io_page) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return -ENOMEM;
}
io_page->p_page = page;
atomic_set(&io_page->p_count, 1);
get_page(page);
set_page_writeback(page);
ClearPageError(page);
for (bh = head = page_buffers(page), block_start = 0;
bh != head || !block_start;
block_start = block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
/*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
* end_page_writeback() cannot be called from ext4_bio_end_io() when IO
* on the first buffer finishes and we are still working on submitting
* the second buffer.
*/
bh = head = page_buffers(page);
do {
block_start = bh_offset(bh);
if (block_start >= len) {
/*
* Comments copied from block_write_full_page_endio:
......@@ -431,7 +435,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* mapped, and writes to that region are not written
* out to the file."
*/
zero_user_segment(page, block_start, block_end);
zero_user_segment(page, block_start,
block_start + blocksize);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
......@@ -445,7 +450,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ext4_io_submit(io);
continue;
}
ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
if (buffer_new(bh)) {
clear_buffer_new(bh);
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
set_buffer_async_write(bh);
} while ((bh = bh->b_this_page) != head);
/* Now submit buffers to write */
bh = head = page_buffers(page);
do {
if (!buffer_async_write(bh))
continue;
ret = io_submit_add_bh(io, inode, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
......@@ -455,17 +472,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
redirty_page_for_writepage(wbc, page);
break;
}
nr_submitted++;
clear_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
/* Error stopped previous loop? Clean up buffers... */
if (ret) {
do {
clear_buffer_async_write(bh);
bh = bh->b_this_page;
} while (bh != head);
}
unlock_page(page);
/*
* If the page was truncated before we could do the writeback,
* or we had a memory allocation error while trying to write
* the first buffer head, we won't have submitted any pages for
* I/O. In that case we need to make sure we've cleared the
* PageWriteback bit from the page to prevent the system from
* wedging later on.
*/
put_io_page(io_page);
/* Nothing submitted - we have to end page writeback */
if (!nr_submitted)
end_page_writeback(page);
return ret;
}
......@@ -272,7 +272,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
if (start_blk >= last_blk)
goto next_group;
group_data[bb_index].block_bitmap = start_blk++;
ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
if (flexbg_size > 1)
......@@ -284,7 +284,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
if (start_blk >= last_blk)
goto next_group;
group_data[ib_index].inode_bitmap = start_blk++;
ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
if (flexbg_size > 1)
......@@ -296,7 +296,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
goto next_group;
group_data[it_index].inode_table = start_blk;
ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count -=
EXT4_SB(sb)->s_itb_per_group;
......@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
ext4_group_t group;
int err;
ext4_get_group_no_and_offset(sb, block, &group, NULL);
group = ext4_get_group_number(sb, block);
start = ext4_group_first_block_no(sb, group);
group -= flex_gd->groups[0].group;
......@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb,
/* Update the global fs size fields */
sbi->s_groups_count += flex_gd->count;
sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
/* Update the reserved block counts only once the new group is
* active. */
......@@ -1879,7 +1881,11 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
/* Nothing need to do */
return 0;
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
n_group = ext4_get_group_number(sb, n_blocks_count - 1);
if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
ext4_warning(sb, "resize would cause inodes_count overflow");
return -EINVAL;
}
ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
n_desc_blocks = num_desc_blocks(sb, n_group + 1);
......
......@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext2_fs_type = {
......@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int error = is_journal_aborted(journal);
struct ext4_journal_cb_entry *jce, *tmp;
struct ext4_journal_cb_entry *jce;
BUG_ON(txn->t_state == T_FINISHED);
spin_lock(&sbi->s_md_lock);
list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
while (!list_empty(&txn->t_private_list)) {
jce = list_entry(txn->t_private_list.next,
struct ext4_journal_cb_entry, jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
jce->jce_func(sb, jce, error);
......@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
if ((sbi->s_es->s_feature_ro_compat &
cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
/* Use new metadata_csum algorithm */
__u16 old_csum;
__le16 save_csum;
__u32 csum32;
old_csum = gdp->bg_checksum;
save_csum = gdp->bg_checksum;
gdp->bg_checksum = 0;
csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
sbi->s_desc_size);
gdp->bg_checksum = old_csum;
gdp->bg_checksum = save_csum;
crc = csum32 & 0xFFFF;
goto out;
......@@ -2379,17 +2383,15 @@ struct ext4_attr {
int offset;
};
static int parse_strtoul(const char *buf,
unsigned long max, unsigned long *value)
static int parse_strtoull(const char *buf,
unsigned long long max, unsigned long long *value)
{
char *endp;
*value = simple_strtoul(skip_spaces(buf), &endp, 0);
endp = skip_spaces(endp);
if (*endp || *value > max)
return -EINVAL;
int ret;
return 0;
ret = kstrtoull(skip_spaces(buf), 0, value);
if (!ret && *value > max)
ret = -EINVAL;
return ret;
}
static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
......@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
const char *buf, size_t count)
{
unsigned long t;
int ret;
if (parse_strtoul(buf, 0x40000000, &t))
return -EINVAL;
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret)
return ret;
if (t && !is_power_of_2(t))
if (t && (!is_power_of_2(t) || t > 0x40000000))
return -EINVAL;
sbi->s_inode_readahead_blks = t;
......@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
{
unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
unsigned long t;
int ret;
if (parse_strtoul(buf, 0xffffffff, &t))
return -EINVAL;
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret)
return ret;
*ui = t;
return count;
}
static ssize_t reserved_clusters_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
}
static ssize_t reserved_clusters_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
unsigned long long val;
int ret;
if (parse_strtoull(buf, -1ULL, &val))
return -EINVAL;
ret = ext4_reserve_clusters(sbi, val);
return ret ? ret : count;
}
static ssize_t trigger_test_error(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
......@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes);
EXT4_RW_ATTR(reserved_clusters);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
......@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(reserved_clusters),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
......@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
{
ext4_fsblk_t resv_clusters;
/*
* By default we reserve 2% or 4096 clusters, whichever is smaller.
* This should cover the situations where we can not afford to run
* out of space like for example punch hole, or converting
* uninitialized extents in delalloc path. In most cases such
* allocation would require 1, or 2 blocks, higher numbers are
* very rare.
*/
resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
do_div(resv_clusters, 50);
resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
return resv_clusters;
}
static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
{
ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
sbi->s_cluster_bits;
if (count >= clusters)
return -EINVAL;
atomic64_set(&sbi->s_resv_clusters, count);
return 0;
}
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
char *orig_data = kstrdup(data, GFP_KERNEL);
......@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
/* Do we have standard group size of blocksize * 8 blocks ? */
if (sbi->s_blocks_per_group == blocksize << 3)
set_opt2(sb, STD_GROUP_SIZE);
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
......@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.function = print_daily_error_info;
sbi->s_err_report.data = (unsigned long) sb;
/* Register extent status tree shrinker */
ext4_es_register_shrinker(sb);
err = percpu_counter_init(&sbi->s_freeclusters_counter,
ext4_count_free_clusters(sb));
if (!err) {
......@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
/* Register extent status tree shrinker */
ext4_es_register_shrinker(sb);
/*
* set up enough so that it can read an inode
*/
......@@ -3911,6 +3978,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"available");
}
err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
if (err) {
ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
"reserved pool", ext4_calculate_resv_clusters(sbi));
goto failed_mount4a;
}
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
......@@ -4010,6 +4084,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_journal = NULL;
}
failed_mount3:
ext4_es_unregister_shrinker(sb);
del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
......@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
ll_rw_block(READ, 1, &journal->j_sb_buffer);
ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device");
......@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
ext4_fsblk_t overhead = 0;
ext4_fsblk_t overhead = 0, resv_blocks;
u64 fsid;
s64 bfree;
resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
if (!test_opt(sb, MINIX_DF))
overhead = sbi->s_overhead;
......@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
/* prevent underflow in case that few free space is available */
buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = buf->f_bfree -
(ext4_r_blocks_count(es) + resv_blocks);
if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
......@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
return PTR_ERR(qf_inode);
}
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
err = dquot_enable(qf_inode, type, format_id, flags);
iput(qf_inode);
......
......@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
struct ext4_xattr_header *hdr)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum, old;
__u32 csum;
__le32 save_csum;
__le64 dsk_block_nr = cpu_to_le64(block_nr);
old = hdr->h_checksum;
save_csum = hdr->h_checksum;
hdr->h_checksum = 0;
block_nr = cpu_to_le64(block_nr);
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
sizeof(block_nr));
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
sizeof(dsk_block_nr));
csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
EXT4_BLOCK_SIZE(inode->i_sb));
hdr->h_checksum = old;
hdr->h_checksum = save_csum;
return cpu_to_le32(csum);
}
......
......@@ -22,6 +22,7 @@
#define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6
#define EXT4_XATTR_INDEX_SYSTEM 7
#define EXT4_XATTR_INDEX_RICHACL 8
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
......
......@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int space_left = 0;
int first_tag = 0;
int tag_flag;
int i, to_free = 0;
int i;
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
......@@ -1134,7 +1134,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
spin_unlock(&journal->j_history_lock);
commit_transaction->t_state = T_FINISHED;
commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
......@@ -1149,38 +1149,44 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_average_commit_time*3) / 4;
else
journal->j_average_commit_time = commit_time;
write_unlock(&journal->j_state_lock);
if (commit_transaction->t_checkpoint_list == NULL &&
commit_transaction->t_checkpoint_io_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
to_free = 1;
if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction;
commit_transaction->t_cpnext = commit_transaction;
commit_transaction->t_cpprev = commit_transaction;
} else {
if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction;
commit_transaction->t_cpnext = commit_transaction;
commit_transaction->t_cpprev = commit_transaction;
} else {
commit_transaction->t_cpnext =
journal->j_checkpoint_transactions;
commit_transaction->t_cpprev =
commit_transaction->t_cpnext->t_cpprev;
commit_transaction->t_cpnext->t_cpprev =
commit_transaction;
commit_transaction->t_cpprev->t_cpnext =
commit_transaction->t_cpnext =
journal->j_checkpoint_transactions;
commit_transaction->t_cpprev =
commit_transaction->t_cpnext->t_cpprev;
commit_transaction->t_cpnext->t_cpprev =
commit_transaction;
commit_transaction->t_cpprev->t_cpnext =
commit_transaction;
}
}
spin_unlock(&journal->j_list_lock);
/* Drop all spin_locks because commit_callback may be block.
* __journal_remove_checkpoint() can not destroy transaction
* under us because it is not marked as T_FINISHED yet */
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
if (to_free)
jbd2_journal_free_transaction(commit_transaction);
write_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
commit_transaction->t_state = T_FINISHED;
/* Recheck checkpoint lists after j_list_lock was dropped */
if (commit_transaction->t_checkpoint_list == NULL &&
commit_transaction->t_checkpoint_io_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
jbd2_journal_free_transaction(commit_transaction);
}
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit);
}
......@@ -707,6 +707,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
return err;
}
/*
* When this function returns the transaction corresponding to tid
* will be completed. If the transaction has currently running, start
* committing that transaction before waiting for it to complete. If
* the transaction id is stale, it is by definition already completed,
* so just return SUCCESS.
*/
int jbd2_complete_transaction(journal_t *journal, tid_t tid)
{
int need_to_wait = 1;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction &&
journal->j_running_transaction->t_tid == tid) {
if (journal->j_commit_request != tid) {
/* transaction not yet started, so request it */
read_unlock(&journal->j_state_lock);
jbd2_log_start_commit(journal, tid);
goto wait_commit;
}
} else if (!(journal->j_committing_transaction &&
journal->j_committing_transaction->t_tid == tid))
need_to_wait = 0;
read_unlock(&journal->j_state_lock);
if (!need_to_wait)
return 0;
wait_commit:
return jbd2_log_wait_commit(journal, tid);
}
EXPORT_SYMBOL(jbd2_complete_transaction);
/*
* Log buffer allocation routines:
*/
......
......@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
memset(handle, 0, sizeof(*handle));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
......@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
int error;
char *frozen_buffer = NULL;
int need_copy = 0;
unsigned long start_lock, time_lock;
if (is_handle_aborted(handle))
return -EROFS;
......@@ -655,9 +655,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
/* @@@ Need to check for errors here at some point. */
start_lock = jiffies;
lock_buffer(bh);
jbd_lock_bh_state(bh);
/* If it takes too long to lock the buffer, trace it */
time_lock = jbd2_time_diff(start_lock, jiffies);
if (time_lock > HZ/10)
trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
jiffies_to_msecs(time_lock));
/* We now hold the buffer lock so it is safe to query the buffer
* state. Is the buffer dirty?
*
......
......@@ -34,6 +34,8 @@ enum bh_state_bits {
BH_Write_EIO, /* I/O error on write */
BH_Unwritten, /* Buffer is allocated on disk but not written */
BH_Quiet, /* Buffer Error Prinks to be quiet */
BH_Meta, /* Buffer contains metadata */
BH_Prio, /* Buffer should be submitted with REQ_PRIO */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
......@@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
......
......@@ -480,6 +480,7 @@ struct transaction_s
T_COMMIT,
T_COMMIT_DFLUSH,
T_COMMIT_JFLUSH,
T_COMMIT_CALLBACK,
T_FINISHED
} t_state;
......@@ -1144,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache;
static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
{
return kmem_cache_alloc(jbd2_handle_cache, gfp_flags);
return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);
}
static inline void jbd2_free_handle(handle_t *handle)
......@@ -1200,6 +1201,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_journal_force_commit_nested(journal_t *journal);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册