提交 c9de560d 编写于 作者: A Alex Tomas 提交者: Theodore Ts'o

ext4: Add multi block allocator for ext4

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
上级 1988b51e
......@@ -86,9 +86,11 @@ Alex is working on a new set of patches right now.
When mounting an ext4 filesystem, the following options are accepted:
(*) == default
extents ext4 will use extents to address file data. The
extents (*) ext4 will use extents to address file data. The
file system will no longer be mountable by ext3.
noextents ext4 will not use extents for newly created files
journal_checksum Enable checksumming of the journal transactions.
This will allow the recovery code in e2fsck and the
kernel to detect corruption in the kernel. It is a
......@@ -206,6 +208,12 @@ nobh (a) cache disk block mapping information
"nobh" option tries to avoid associating buffer
heads (supported only for "writeback" mode).
mballoc (*) Use the multiple block allocator for block allocation
nomballoc Disable the multiple block allocator for block allocation.
stripe=n Number of filesystem blocks that mballoc will try
to use for allocation size and alignment. For RAID5/6
systems this should be the number of data
disks * RAID chunk size in file system blocks.
Data Mode
---------
......
......@@ -857,6 +857,45 @@ CPUs.
The "procs_blocked" line gives the number of processes currently blocked,
waiting for I/O to complete.
1.9 Ext4 file system parameters
------------------------------
Each ext4 file system has one directory per partition under /proc/fs/ext4/
# ls /proc/fs/ext4/hdc/
group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req
stats stream_req
mb_groups:
This file gives the details of the multiblock allocator's buddy cache of free blocks.
mb_history:
Multiblock allocation history.
stats:
This file indicates whether the multiblock allocator should start collecting
statistics. The statistics are shown during unmount.
group_prealloc:
The multiblock allocator normalizes the block allocation request to
group_prealloc filesystem blocks if we don't have a stripe value set.
The stripe value can be specified at mount time or during mke2fs.
max_to_scan:
How long multiblock allocator can look for a best extent (in found extents)
min_to_scan:
How long multiblock allocator must look for a best extent
order2_req:
The multiblock allocator uses a 2^N search using buddies only for requests
greater than or equal to order2_req. The request size is specified in file
system blocks. A value of 2 means the buddy search is used only for requests
greater than or equal to 4 blocks.
stream_req:
Files smaller than stream_req are served by the stream allocator, whose
purpose is to pack requests as close to each other as possible to
produce smooth I/O traffic. A value of 16 indicates that files smaller than
16 filesystem blocks will use group based preallocation.
------------------------------------------------------------------------------
Summary
......
......@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o
ext4_jbd2.o migrate.o mballoc.o
ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
......
......@@ -577,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode)
struct ext4_reserve_window_node *rsv;
spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
ext4_mb_discard_inode_preallocations(inode);
if (!block_i)
return;
......@@ -785,19 +787,29 @@ error_return:
* @inode: inode
* @block: start physical block to free
* @count: number of blocks to free
* @metadata: Are these metadata blocks
*/
void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count)
ext4_fsblk_t block, unsigned long count,
int metadata)
{
struct super_block * sb;
unsigned long dquot_freed_blocks;
/* this isn't the right place to decide whether block is metadata
* inode.c/extents.c knows better, but for safety ... */
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
ext4_should_journal_data(inode))
metadata = 1;
sb = inode->i_sb;
if (!sb) {
printk ("ext4_free_blocks: nonexistent device");
return;
}
ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
ext4_free_blocks_sb(handle, sb, block, count,
&dquot_freed_blocks);
else
ext4_mb_free_blocks(handle, inode, block, count,
metadata, &dquot_freed_blocks);
if (dquot_freed_blocks)
DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
return;
......@@ -1576,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
}
/**
* ext4_new_blocks() -- core block(s) allocation function
* ext4_new_blocks_old() -- core block(s) allocation function
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
......@@ -1589,7 +1601,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* any specific goal block.
*
*/
ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
......@@ -1849,13 +1861,46 @@ out:
}
ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, int *errp)
ext4_fsblk_t goal, int *errp)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
if (!test_opt(inode->i_sb, MBALLOC)) {
unsigned long count = 1;
ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
return ret;
}
memset(&ar, 0, sizeof(ar));
ar.inode = inode;
ar.goal = goal;
ar.len = 1;
ret = ext4_mb_new_blocks(handle, &ar, errp);
return ret;
}
ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
unsigned long count = 1;
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
return ext4_new_blocks(handle, inode, goal, &count, errp);
if (!test_opt(inode->i_sb, MBALLOC)) {
ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
return ret;
}
memset(&ar, 0, sizeof(ar));
ar.inode = inode;
ar.goal = goal;
ar.len = *count;
ret = ext4_mb_new_blocks(handle, &ar, errp);
*count = ar.len;
return ret;
}
/**
* ext4_count_free_blocks() -- count filesystem free blocks
* @sb: superblock
......
......@@ -853,7 +853,7 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
ext4_free_blocks(handle, inode, ablocks[i], 1);
ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
}
}
kfree(ablocks);
......@@ -1698,7 +1698,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
ext_debug("index is empty, remove it, free block %llu\n", leaf);
bh = sb_find_get_block(inode->i_sb, leaf);
ext4_forget(handle, 1, inode, bh, leaf);
ext4_free_blocks(handle, inode, leaf, 1);
ext4_free_blocks(handle, inode, leaf, 1, 1);
return err;
}
......@@ -1759,8 +1759,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
{
struct buffer_head *bh;
unsigned short ee_len = ext4_ext_get_actual_len(ex);
int i;
int i, metadata = 0;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
metadata = 1;
#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
......@@ -1789,7 +1791,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
bh = sb_find_get_block(inode->i_sb, start + i);
ext4_forget(handle, 0, inode, bh, start + i);
}
ext4_free_blocks(handle, inode, start, num);
ext4_free_blocks(handle, inode, start, num, metadata);
} else if (from == le32_to_cpu(ex->ee_block)
&& to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
......@@ -2287,6 +2289,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, newblock;
int err = 0, depth, ret;
unsigned long allocated = 0;
struct ext4_allocation_request ar;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%lu requested for inode %u\n",
......@@ -2397,8 +2400,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
ext4_init_block_alloc_info(inode);
/* allocate new block */
goal = ext4_ext_find_goal(inode, path, iblock);
/* find neighbour allocated blocks */
ar.lleft = iblock;
err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
if (err)
goto out2;
ar.lright = iblock;
err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
if (err)
goto out2;
/*
* See if request is beyond maximum number of blocks we can have in
......@@ -2421,7 +2431,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
allocated = le16_to_cpu(newex.ee_len);
else
allocated = max_blocks;
newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
/* allocate new block */
ar.inode = inode;
ar.goal = ext4_ext_find_goal(inode, path, iblock);
ar.logical = iblock;
ar.len = allocated;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
else
/* disable in-core preallocation for non-regular files */
ar.flags = 0;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
......@@ -2429,14 +2450,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(allocated);
newex.ee_len = cpu_to_le16(ar.len);
if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
ext4_ext_mark_uninitialized(&newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex);
if (err) {
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
ext4_mb_discard_inode_preallocations(inode);
ext4_free_blocks(handle, inode, ext_pblock(&newex),
le16_to_cpu(newex.ee_len));
le16_to_cpu(newex.ee_len), 0);
goto out2;
}
......@@ -2445,6 +2469,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = le16_to_cpu(newex.ee_len);
outnew:
__set_bit(BH_New, &bh_result->b_state);
......@@ -2496,6 +2521,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
ext4_mb_discard_inode_preallocations(inode);
/*
* TODO: optimization is possible here.
* Probably we need not scan at all,
......
......@@ -551,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
return ret;
failed_out:
for (i = 0; i <index; i++)
ext4_free_blocks(handle, inode, new_blocks[i], 1);
ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
return ret;
}
......@@ -650,9 +650,9 @@ failed:
ext4_journal_forget(handle, branch[i].bh);
}
for (i = 0; i <indirect_blks; i++)
ext4_free_blocks(handle, inode, new_blocks[i], 1);
ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
ext4_free_blocks(handle, inode, new_blocks[i], num);
ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
return err;
}
......@@ -749,9 +749,10 @@ err_out:
for (i = 1; i <= num; i++) {
BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
ext4_journal_forget(handle, where[i].bh);
ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
ext4_free_blocks(handle, inode,
le32_to_cpu(where[i-1].key), 1, 0);
}
ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
return err;
}
......@@ -2052,7 +2053,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
}
}
ext4_free_blocks(handle, inode, block_to_free, count);
ext4_free_blocks(handle, inode, block_to_free, count, 0);
}
/**
......@@ -2225,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
ext4_journal_test_restart(handle, inode);
}
ext4_free_blocks(handle, inode, nr, 1);
ext4_free_blocks(handle, inode, nr, 1, 1);
if (parent_bh) {
/*
......
此差异已折叠。
......@@ -236,10 +236,10 @@ static int free_dind_blocks(handle_t *handle,
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i])
ext4_free_blocks(handle, inode,
le32_to_cpu(tmp_idata[i]), 1);
le32_to_cpu(tmp_idata[i]), 1, 1);
}
put_bh(bh);
ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1);
ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
return 0;
}
......@@ -267,7 +267,7 @@ static int free_tind_blocks(handle_t *handle,
}
}
put_bh(bh);
ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1);
ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
return 0;
}
......@@ -278,7 +278,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode)
if (ei->i_data[EXT4_IND_BLOCK])
ext4_free_blocks(handle, inode,
le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1);
le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
if (ei->i_data[EXT4_DIND_BLOCK]) {
retval = free_dind_blocks(handle, inode,
......@@ -365,7 +365,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
}
}
put_bh(bh);
ext4_free_blocks(handle, inode, block, 1);
ext4_free_blocks(handle, inode, block, 1, 1);
return retval;
}
......
此差异已折叠。
......@@ -480,7 +480,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
ea_bdebug(bh, "refcount now=0; freeing");
if (ce)
mb_cache_entry_free(ce);
ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
get_bh(bh);
ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
} else {
......@@ -821,7 +821,7 @@ inserted:
new_bh = sb_getblk(sb, block);
if (!new_bh) {
getblk_failed:
ext4_free_blocks(handle, inode, block, 1);
ext4_free_blocks(handle, inode, block, 1, 1);
error = -EIO;
goto cleanup;
}
......
此差异已折叠。
......@@ -158,6 +158,10 @@ struct ext4_inode_info {
* struct timespec i_{a,c,m}time in the generic inode.
*/
struct timespec i_crtime;
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
};
#endif /* _LINUX_EXT4_FS_I */
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册