Commit 9d0be502 authored by Theodore Ts'o

ext4: Calculate metadata requirements more accurately

In the past, ext4_calc_metadata_amount(), and its sub-functions
ext4_ext_calc_metadata_amount() and ext4_indirect_calc_metadata_amount()
badly over-estimated the number of metadata blocks that might be
required for delayed allocation blocks.  This didn't matter as much
when functions which managed the reserved metadata blocks were more
aggressive about dropping reserved metadata blocks as delayed
allocation blocks were written, but unfortunately they were too
aggressive.  This was fixed in commit 0637c6f4, but as a result the
over-estimation by ext4_calc_metadata_amount() would lead to reserving
2-3 times the number of pending delayed allocation blocks as
potentially required metadata blocks.  So if there is 1 megabyte of
blocks which have not yet been allocated, up to 3 megabytes of
space would get reserved out of the user's quota and from the file
system free space pool until all of the inode's data blocks have been
allocated.
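
To make the 2-3x figure concrete: before this patch, ext4_da_reserve_space() was called for one block at a time (the old call site passed nrblocks == 1), so the old formulas were evaluated per block. The sketch below is a hedged userspace model of those old estimates, not the kernel code itself; the capacities (~340 extent/index entries per tree block, 4 root index slots, 1024 pointers per indirect block) are assumptions for a 4 KiB block size. It shows each single delayed-allocation block being charged 2 metadata blocks on extent files and 3 on indirect files, so 256 pending blocks (1 MiB) pin roughly 512-768 metadata blocks.

```c
/*
 * Hedged userspace model of the OLD worst-case estimate (not kernel code).
 * Capacities are assumptions for a 4 KiB block size.
 */
#include <stdio.h>

/* old extent-tree worst case: leaf blocks for the new extents plus an index chain */
static int old_extent_estimate(int blocks, int lcap, int icap, int rcap)
{
        int leafs = (blocks + lcap - 1) / lcap;
        int idxs = (leafs + icap - 1) / icap;
        int num = leafs;

        do {
                num += idxs;
                idxs = (idxs + icap - 1) / icap;
        } while (idxs > rcap);
        return num;
}

/* old indirect worst case: indirect + double indirect + one triple indirect block */
static int old_indirect_estimate(int blocks, int icap)
{
        int ind = (blocks + icap - 1) / icap;
        int dind = (ind + icap - 1) / icap;

        return ind + dind + 1;
}

int main(void)
{
        /* ext4_da_reserve_space() asked for one block per call */
        printf("extent file,   1 data block -> %d metadata blocks\n",
               old_extent_estimate(1, 340, 340, 4));    /* prints 2 */
        printf("indirect file, 1 data block -> %d metadata blocks\n",
               old_indirect_estimate(1, 1024));         /* prints 3 */
        return 0;
}
```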

This commit addresses this problem by much more accurately estimating
the number of metadata blocks that will be required.  It will still
somewhat over-estimate the number of blocks needed, since it must make
a worst case estimate not knowing which physical blocks will be
needed, but it is much more accurate than before.
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Parent ee5f4d9c
fs/ext4/ext4.h
@@ -699,6 +699,8 @@ struct ext4_inode_info {
 	unsigned int i_reserved_meta_blocks;
 	unsigned int i_allocated_meta_blocks;
 	unsigned short i_delalloc_reserved_flag;
+	sector_t i_da_metadata_calc_last_lblock;
+	int i_da_metadata_calc_len;
 
 	/* on-disk additional length */
 	__u16 i_extra_isize;
...
fs/ext4/ext4_extents.h
@@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
 	ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
 }
 
-extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+					 sector_t lblocks);
 extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
...
fs/ext4/extents.c
@@ -296,29 +296,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
  * to allocate @blocks
  * Worse case is one block per extent
  */
-int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-	int lcap, icap, rcap, leafs, idxs, num;
-	int newextents = blocks;
-
-	rcap = ext4_ext_space_root_idx(inode, 0);
-	lcap = ext4_ext_space_block(inode, 0);
-	icap = ext4_ext_space_block_idx(inode, 0);
-
-	/* number of new leaf blocks needed */
-	num = leafs = (newextents + lcap - 1) / lcap;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int idxs, num = 0;
 
-	/*
-	 * Worse case, we need separate index block(s)
-	 * to link all new leaf blocks
-	 */
-	idxs = (leafs + icap - 1) / icap;
-	do {
-		num += idxs;
-		idxs = (idxs + icap - 1) / icap;
-	} while (idxs > rcap);
+	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+		/ sizeof(struct ext4_extent_idx));
 
-	return num;
+	/*
+	 * If the new delayed allocation block is contiguous with the
+	 * previous da block, it can share index blocks with the
+	 * previous block, so we only need to allocate a new index
+	 * block every idxs leaf blocks.  At ldxs**2 blocks, we need
+	 * an additional index block, and at ldxs**3 blocks, yet
+	 * another index blocks.
+	 */
+	if (ei->i_da_metadata_calc_len &&
+	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+		if ((ei->i_da_metadata_calc_len % idxs) == 0)
+			num++;
+		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
+			num++;
+		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
+			num++;
+			ei->i_da_metadata_calc_len = 0;
+		} else
+			ei->i_da_metadata_calc_len++;
+		ei->i_da_metadata_calc_last_lblock++;
+		return num;
+	}
+	/*
+	 * In the worst case we need a new set of index blocks at
+	 * every level of the inode's extent tree.
+	 */
+	ei->i_da_metadata_calc_len = 1;
+	ei->i_da_metadata_calc_last_lblock = lblock;
+	return ext_depth(inode) + 1;
 }
 
 static int
...
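
For contrast with the old estimate, here is a hedged userspace model of the new per-block bookkeeping in the hunk above; da_state, meta_for_block, and the idxs/depth parameters are illustrative names and assumed values, not kernel API. The first block of a run is charged a full path of index blocks (depth + 1), and each following contiguous block is charged nothing until another leaf or index block is actually implied.

```c
#include <stdio.h>

/* per-inode state mirroring i_da_metadata_calc_{last_lblock,len} */
struct da_state {
        long last_lblock;
        int len;
};

/* idxs: entries per extent-tree block; depth: current extent tree depth */
static int meta_for_block(struct da_state *s, long lblock, int idxs, int depth)
{
        int num = 0;

        if (s->len && s->last_lblock + 1 == lblock) {
                if ((s->len % idxs) == 0)
                        num++;                          /* new leaf block */
                if ((s->len % (idxs * idxs)) == 0)
                        num++;                          /* new index block */
                if ((s->len % (idxs * idxs * idxs)) == 0) {
                        num++;                          /* next level up */
                        s->len = 0;
                } else
                        s->len++;
                s->last_lblock++;
                return num;
        }
        /* discontiguous block: assume a whole new path of index blocks */
        s->len = 1;
        s->last_lblock = lblock;
        return depth + 1;
}

int main(void)
{
        struct da_state s = { 0, 0 };
        long blk, total = 0;

        /* 256 contiguous 4 KiB blocks (1 MiB), ~340 entries per tree block */
        for (blk = 0; blk < 256; blk++)
                total += meta_for_block(&s, blk, 340, 1);
        printf("metadata reserved for 256 contiguous blocks: %ld\n", total);
        return 0;       /* prints 2, versus ~512 with the old per-block estimate */
}
```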
fs/ext4/inode.c
@@ -1009,38 +1009,44 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
 	return &EXT4_I(inode)->i_reserved_quota;
 }
 #endif
 
 /*
  * Calculate the number of metadata blocks need to reserve
- * to allocate @blocks for non extent file based file
+ * to allocate a new block at @lblocks for non extent file based file
  */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_indirect_calc_metadata_amount(struct inode *inode,
+					      sector_t lblock)
 {
-	int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
-	int ind_blks, dind_blks, tind_blks;
-
-	/* number of new indirect blocks needed */
-	ind_blks = (blocks + icap - 1) / icap;
-
-	dind_blks = (ind_blks + icap - 1) / icap;
-
-	tind_blks = 1;
-
-	return ind_blks + dind_blks + tind_blks;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+	int blk_bits;
+
+	if (lblock < EXT4_NDIR_BLOCKS)
+		return 0;
+
+	lblock -= EXT4_NDIR_BLOCKS;
+
+	if (ei->i_da_metadata_calc_len &&
+	    (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+		ei->i_da_metadata_calc_len++;
+		return 0;
+	}
+	ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+	ei->i_da_metadata_calc_len = 1;
+	blk_bits = roundup_pow_of_two(lblock + 1);
+	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
 /*
  * Calculate the number of metadata blocks need to reserve
- * to allocate given number of blocks
+ * to allocate a block located at @lblock
  */
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-	if (!blocks)
-		return 0;
-
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-		return ext4_ext_calc_metadata_amount(inode, blocks);
+		return ext4_ext_calc_metadata_amount(inode, lblock);
 
-	return ext4_indirect_calc_metadata_amount(inode, blocks);
+	return ext4_indirect_calc_metadata_amount(inode, lblock);
 }
 
 /*
@@ -1078,6 +1084,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 		 */
 		mdb_free = ei->i_reserved_meta_blocks;
 		ei->i_reserved_meta_blocks = 0;
+		ei->i_da_metadata_calc_len = 0;
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
 	}
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -1802,12 +1809,15 @@ static int ext4_journalled_write_end(struct file *file,
 	return ret ? ret : copied;
 }
 
-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+/*
+ * Reserve a single block located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 {
 	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned long md_needed, md_reserved, total = 0;
+	unsigned long md_needed, md_reserved;
 
 	/*
 	 * recalculate the amount of metadata blocks to reserve
@@ -1817,8 +1827,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	md_reserved = ei->i_reserved_meta_blocks;
-	md_needed = ext4_calc_metadata_amount(inode, nrblocks);
-	total = md_needed + nrblocks;
+	md_needed = ext4_calc_metadata_amount(inode, lblock);
 	spin_unlock(&ei->i_block_reservation_lock);
 
 	/*
@@ -1826,7 +1835,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 	 * later. Real quota accounting is done at pages writeout
 	 * time.
 	 */
-	if (vfs_dq_reserve_block(inode, total)) {
+	if (vfs_dq_reserve_block(inode, md_needed + 1)) {
 		/*
 		 * We tend to badly over-estimate the amount of
 		 * metadata blocks which are needed, so if we have
@@ -1838,8 +1847,8 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 		return -EDQUOT;
 	}
 
-	if (ext4_claim_free_blocks(sbi, total)) {
-		vfs_dq_release_reservation_block(inode, total);
+	if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+		vfs_dq_release_reservation_block(inode, md_needed + 1);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 		retry:
 			if (md_reserved)
@@ -1850,7 +1859,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 		return -ENOSPC;
 	}
 	spin_lock(&ei->i_block_reservation_lock);
-	ei->i_reserved_data_blocks += nrblocks;
+	ei->i_reserved_data_blocks++;
 	ei->i_reserved_meta_blocks += md_needed;
 	spin_unlock(&ei->i_block_reservation_lock);
@@ -1891,6 +1900,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 		 */
 		to_free += ei->i_reserved_meta_blocks;
 		ei->i_reserved_meta_blocks = 0;
+		ei->i_da_metadata_calc_len = 0;
 	}
 
 	/* update fs dirty blocks counter */
@@ -2504,7 +2514,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
 		 */
-		ret = ext4_da_reserve_space(inode, 1);
+		ret = ext4_da_reserve_space(inode, iblock);
 		if (ret)
 			/* not enough space to reserve */
 			return ret;
...
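
As a usage-level note on the inode.c hunks above: each call to ext4_da_reserve_space() now reserves exactly one data block plus the incremental metadata estimate for that block (md_needed + 1), instead of nrblocks plus a fresh worst case. A minimal sketch of that bookkeeping follows; reservation and reserve_one_block are illustrative names, not kernel identifiers.

```c
#include <stdio.h>

/* stand-ins for i_reserved_data_blocks / i_reserved_meta_blocks */
struct reservation {
        unsigned long data;
        unsigned long meta;
};

/* md_needed: whatever ext4_calc_metadata_amount() returned for this lblock */
static void reserve_one_block(struct reservation *r, unsigned long md_needed)
{
        r->data++;              /* was: += nrblocks */
        r->meta += md_needed;   /* only the incremental estimate */
}

int main(void)
{
        struct reservation r = { 0, 0 };

        /* first block of a contiguous run costs the full path, the rest cost 0 */
        reserve_one_block(&r, 2);
        reserve_one_block(&r, 0);
        reserve_one_block(&r, 0);
        printf("reserved: %lu data + %lu metadata blocks\n", r.data, r.meta);
        return 0;       /* prints: reserved: 3 data + 2 metadata blocks */
}
```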
fs/ext4/super.c
@@ -702,6 +702,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_reserved_data_blocks = 0;
 	ei->i_reserved_meta_blocks = 0;
 	ei->i_allocated_meta_blocks = 0;
+	ei->i_da_metadata_calc_len = 0;
 	ei->i_delalloc_reserved_flag = 0;
 	spin_lock_init(&(ei->i_block_reservation_lock));
 #ifdef CONFIG_QUOTA
...