提交 3a0782d0 编写于 作者: M Mark Fasheh

ocfs2: teach extend/truncate about sparse files

For ocfs2_truncate_file(), we eliminate the "simple" truncate case which no
longer exists since i_size is not tied to i_clusters. In
ocfs2_extend_file(), we skip the allocation / page zeroing code for file
systems which understand sparse files.

The core truncate code is changed to do a bottom up tree traversal. This
gets abstracted out into it's own function. To make things more readable,
most of the special case handling for in-inode extents from
ocfs2_do_truncate() is also removed.

Though write support for sparse files comes in a later patch, we at least
update ocfs2_prepare_inode_for_write() to skip allocation for sparse files.
Signed-off-by: NMark Fasheh <mark.fasheh@oracle.com>
上级 363041a5
......@@ -2921,12 +2921,13 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
* block will be so we can update his h_next_leaf_blk field, as well
* as the dinodes i_last_eb_blk */
static int ocfs2_find_new_last_ext_blk(struct inode *inode,
u32 new_i_clusters,
unsigned int clusters_to_del,
struct ocfs2_path *path,
struct buffer_head **new_last_eb)
{
int ret = 0;
int next_free, ret = 0;
u32 cpos;
struct ocfs2_extent_rec *rec;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct buffer_head *bh = NULL;
......@@ -2939,20 +2940,48 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
/* trunc to zero special case - this makes tree_depth = 0
* regardless of what it is. */
if (!new_i_clusters)
if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
goto out;
el = path_leaf_el(path);
BUG_ON(!el->l_next_free_rec);
/* Make sure that this guy will actually be empty after we
* clear away the data. */
/*
* Make sure that this extent list will actually be empty
* after we clear away the data. We can shortcut out if
* there's more than one non-empty extent in the
* list. Otherwise, a check of the remaining extent is
* necessary.
*/
next_free = le16_to_cpu(el->l_next_free_rec);
rec = NULL;
if (ocfs2_is_empty_extent(&el->l_recs[0])) {
if (le16_to_cpu(el->l_next_free_rec) > 1 &&
le32_to_cpu(el->l_recs[1].e_cpos) < new_i_clusters)
if (next_free > 2)
goto out;
} else if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
/* We may have a valid extent in index 1, check it. */
if (next_free == 2)
rec = &el->l_recs[1];
/*
* Fall through - no more nonempty extents, so we want
* to delete this leaf.
*/
} else {
if (next_free > 1)
goto out;
rec = &el->l_recs[0];
}
if (rec) {
/*
* Check it we'll only be trimming off the end of this
* cluster.
*/
if (le16_to_cpu(rec->e_clusters) > clusters_to_del)
goto out;
}
ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
if (ret) {
......@@ -2984,6 +3013,223 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
return ret;
}
/*
* Trim some clusters off the rightmost edge of a tree. Only called
* during truncate.
*
* The caller needs to:
* - start journaling of each path component.
* - compute and fully set up any new last ext block
*/
static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
handle_t *handle, struct ocfs2_truncate_context *tc,
u32 clusters_to_del, u64 *delete_start)
{
int ret, i, index = path->p_tree_depth;
u32 new_edge = 0;
u64 deleted_eb = 0;
struct buffer_head *bh;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
*delete_start = 0;
while (index >= 0) {
bh = path->p_node[index].bh;
el = path->p_node[index].el;
mlog(0, "traveling tree (index = %d, block = %llu)\n",
index, (unsigned long long)bh->b_blocknr);
BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
if (index !=
(path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
ocfs2_error(inode->i_sb,
"Inode %lu has invalid ext. block %llu",
inode->i_ino,
(unsigned long long)bh->b_blocknr);
ret = -EROFS;
goto out;
}
find_tail_record:
i = le16_to_cpu(el->l_next_free_rec) - 1;
rec = &el->l_recs[i];
mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
"next = %u\n", i, le32_to_cpu(rec->e_cpos),
le32_to_cpu(rec->e_clusters),
(unsigned long long)le64_to_cpu(rec->e_blkno),
le16_to_cpu(el->l_next_free_rec));
BUG_ON(le32_to_cpu(rec->e_clusters) < clusters_to_del);
if (le16_to_cpu(el->l_tree_depth) == 0) {
/*
* If the leaf block contains a single empty
* extent and no records, we can just remove
* the block.
*/
if (i == 0 && ocfs2_is_empty_extent(rec)) {
memset(rec, 0,
sizeof(struct ocfs2_extent_rec));
el->l_next_free_rec = cpu_to_le16(0);
goto delete;
}
/*
* Remove any empty extents by shifting things
* left. That should make life much easier on
* the code below. This condition is rare
* enough that we shouldn't see a performance
* hit.
*/
if (ocfs2_is_empty_extent(&el->l_recs[0])) {
le16_add_cpu(&el->l_next_free_rec, -1);
for(i = 0;
i < le16_to_cpu(el->l_next_free_rec); i++)
el->l_recs[i] = el->l_recs[i + 1];
memset(&el->l_recs[i], 0,
sizeof(struct ocfs2_extent_rec));
/*
* We've modified our extent list. The
* simplest way to handle this change
* is to being the search from the
* start again.
*/
goto find_tail_record;
}
le32_add_cpu(&rec->e_clusters, -clusters_to_del);
/*
* We'll use "new_edge" on our way back up the
* tree to know what our rightmost cpos is.
*/
new_edge = le32_to_cpu(rec->e_clusters);
new_edge += le32_to_cpu(rec->e_cpos);
/*
* The caller will use this to delete data blocks.
*/
*delete_start = le64_to_cpu(rec->e_blkno)
+ ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(rec->e_clusters));
/*
* If it's now empty, remove this record.
*/
if (le32_to_cpu(rec->e_clusters) == 0) {
memset(rec, 0,
sizeof(struct ocfs2_extent_rec));
le16_add_cpu(&el->l_next_free_rec, -1);
}
} else {
if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
memset(rec, 0,
sizeof(struct ocfs2_extent_rec));
le16_add_cpu(&el->l_next_free_rec, -1);
goto delete;
}
/* Can this actually happen? */
if (le16_to_cpu(el->l_next_free_rec) == 0)
goto delete;
/*
* We never actually deleted any clusters
* because our leaf was empty. There's no
* reason to adjust the rightmost edge then.
*/
if (new_edge == 0)
goto delete;
rec->e_clusters = cpu_to_le32(new_edge);
le32_add_cpu(&rec->e_clusters,
-le32_to_cpu(rec->e_cpos));
/*
* A deleted child record should have been
* caught above.
*/
BUG_ON(le32_to_cpu(rec->e_clusters) == 0);
}
delete:
ret = ocfs2_journal_dirty(handle, bh);
if (ret) {
mlog_errno(ret);
goto out;
}
mlog(0, "extent list container %llu, after: record %d: "
"(%u, %u, %llu), next = %u.\n",
(unsigned long long)bh->b_blocknr, i,
le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
(unsigned long long)le64_to_cpu(rec->e_blkno),
le16_to_cpu(el->l_next_free_rec));
/*
* We must be careful to only attempt delete of an
* extent block (and not the root inode block).
*/
if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
struct ocfs2_extent_block *eb =
(struct ocfs2_extent_block *)bh->b_data;
/*
* Save this for use when processing the
* parent block.
*/
deleted_eb = le64_to_cpu(eb->h_blkno);
mlog(0, "deleting this extent block.\n");
ocfs2_remove_from_cache(inode, bh);
BUG_ON(le32_to_cpu(el->l_recs[0].e_clusters));
BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
/*
* This code only understands how to
* lock the suballocator in slot 0,
* which is fine because allocation is
* only ever done out of that
* suballocator too. A future version
* might change that however, so avoid
* a free if we don't know how to
* handle it. This way an fs incompat
* bit will not be necessary.
*/
ret = ocfs2_free_extent_block(handle,
tc->tc_ext_alloc_inode,
tc->tc_ext_alloc_bh,
eb);
/* An error here is not fatal. */
if (ret < 0)
mlog_errno(ret);
}
} else {
deleted_eb = 0;
}
index--;
}
ret = 0;
out:
return ret;
}
static int ocfs2_do_truncate(struct ocfs2_super *osb,
unsigned int clusters_to_del,
struct inode *inode,
......@@ -2992,20 +3238,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
struct ocfs2_truncate_context *tc,
struct ocfs2_path *path)
{
int status, i, index;
int status;
struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_block *last_eb = NULL;
struct ocfs2_extent_list *el;
struct buffer_head *eb_bh = NULL;
struct buffer_head *last_eb_bh = NULL;
u64 delete_blk = 0;
fe = (struct ocfs2_dinode *) fe_bh->b_data;
status = ocfs2_find_new_last_ext_blk(inode,
le32_to_cpu(fe->i_clusters) -
clusters_to_del,
status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
path, &last_eb_bh);
if (status < 0) {
mlog_errno(status);
......@@ -3016,15 +3258,11 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
* Each component will be touched, so we might as well journal
* here to avoid having to handle errors later.
*/
for (i = 0; i < path_num_items(path); i++) {
status = ocfs2_journal_access(handle, inode,
path->p_node[i].bh,
OCFS2_JOURNAL_ACCESS_WRITE);
status = ocfs2_journal_access_path(inode, handle, path);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
if (last_eb_bh) {
status = ocfs2_journal_access(handle, inode, last_eb_bh,
......@@ -3047,6 +3285,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
ocfs2_error(inode->i_sb,
"Inode %lu has an empty extent record, depth %u\n",
inode->i_ino, le16_to_cpu(el->l_tree_depth));
status = -EROFS;
goto bail;
}
......@@ -3056,38 +3295,11 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
spin_unlock(&OCFS2_I(inode)->ip_lock);
le32_add_cpu(&fe->i_clusters, -clusters_to_del);
i = le16_to_cpu(el->l_next_free_rec) - 1;
BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
/* tree depth zero, we can just delete the clusters, otherwise
* we need to record the offset of the next level extent block
* as we may overwrite it. */
if (!el->l_tree_depth) {
delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+ ocfs2_clusters_to_blocks(osb->sb,
le32_to_cpu(el->l_recs[i].e_clusters));
if (!el->l_recs[i].e_clusters) {
/* if we deleted the whole extent record, then clear
* out the other fields and update the extent
* list.
*/
el->l_recs[i].e_cpos = 0;
el->l_recs[i].e_blkno = 0;
BUG_ON(!el->l_next_free_rec);
le16_add_cpu(&el->l_next_free_rec, -1);
/*
* The leftmost record might be an empty extent -
* delete it here too.
*/
if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
el->l_recs[0].e_cpos = 0;
el->l_recs[0].e_blkno = 0;
el->l_next_free_rec = 0;
}
}
status = ocfs2_trim_tree(inode, path, handle, tc,
clusters_to_del, &delete_blk);
if (status) {
mlog_errno(status);
goto bail;
}
if (le32_to_cpu(fe->i_clusters) == 0) {
......@@ -3115,126 +3327,14 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
}
}
index = 1;
/* if our tree depth > 0, update all the tree blocks below us. */
while (index <= path->p_tree_depth) {
eb_bh = path->p_node[index].bh;
eb = (struct ocfs2_extent_block *)eb_bh->b_data;
el = path->p_node[index].el;
mlog(0, "traveling tree (index = %d, extent block: %llu)\n",
index, (unsigned long long)eb_bh->b_blocknr);
BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
if (index !=
(path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
ocfs2_error(inode->i_sb,
"Inode %lu has invalid ext. block %llu\n",
inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
goto bail;
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
mlog(0, "extent block %llu, before: record %d: "
"(%u, %u, %llu), next = %u\n",
(unsigned long long)le64_to_cpu(eb->h_blkno), i,
le32_to_cpu(el->l_recs[i].e_cpos),
le32_to_cpu(el->l_recs[i].e_clusters),
(unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
le16_to_cpu(el->l_next_free_rec));
BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
/* bottom-most block requires us to delete data.*/
if (!el->l_tree_depth)
delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+ ocfs2_clusters_to_blocks(osb->sb,
le32_to_cpu(el->l_recs[i].e_clusters));
if (!el->l_recs[i].e_clusters) {
el->l_recs[i].e_cpos = 0;
el->l_recs[i].e_blkno = 0;
BUG_ON(!el->l_next_free_rec);
le16_add_cpu(&el->l_next_free_rec, -1);
}
if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
el->l_recs[0].e_cpos = 0;
el->l_recs[0].e_blkno = 0;
el->l_next_free_rec = 0;
}
mlog(0, "extent block %llu, after: record %d: "
"(%u, %u, %llu), next = %u\n",
(unsigned long long)le64_to_cpu(eb->h_blkno), i,
le32_to_cpu(el->l_recs[i].e_cpos),
le32_to_cpu(el->l_recs[i].e_clusters),
(unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
le16_to_cpu(el->l_next_free_rec));
status = ocfs2_journal_dirty(handle, eb_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
if (!el->l_next_free_rec) {
mlog(0, "deleting this extent block.\n");
ocfs2_remove_from_cache(inode, eb_bh);
BUG_ON(el->l_recs[0].e_clusters);
BUG_ON(el->l_recs[0].e_cpos);
BUG_ON(el->l_recs[0].e_blkno);
/*
* We need to remove this extent block from
* the list above it.
*
* Since we've passed it already in this loop,
* no need to worry about journaling.
*/
el = path->p_node[index - 1].el;
i = le16_to_cpu(el->l_next_free_rec) - 1;
BUG_ON(i < 0);
el->l_recs[i].e_cpos = 0;
el->l_recs[i].e_clusters = 0;
el->l_recs[i].e_blkno = 0;
le16_add_cpu(&el->l_next_free_rec, -1);
if (eb->h_suballoc_slot == 0) {
/*
* This code only understands how to
* lock the suballocator in slot 0,
* which is fine because allocation is
* only ever done out of that
* suballocator too. A future version
* might change that however, so avoid
* a free if we don't know how to
* handle it. This way an fs incompat
* bit will not be necessary.
*/
status = ocfs2_free_extent_block(handle,
tc->tc_ext_alloc_inode,
tc->tc_ext_alloc_bh,
eb);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
}
index++;
}
BUG_ON(!delete_blk);
if (delete_blk) {
status = ocfs2_truncate_log_append(osb, handle, delete_blk,
clusters_to_del);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
status = 0;
bail:
......@@ -3274,6 +3374,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
goto bail;
}
start:
/*
* Check that we still have allocation to delete.
*/
if (OCFS2_I(inode)->ip_clusters == 0) {
status = 0;
goto bail;
}
/*
* Truncate always works against the rightmost tree branch.
*/
......@@ -3298,6 +3406,15 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
* - no record needs to be removed (truncate has completed)
*/
el = path_leaf_el(path);
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(inode->i_sb,
"Inode %llu has empty extent block at %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)path_leaf_bh(path)->b_blocknr);
status = -EROFS;
goto bail;
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
range = le32_to_cpu(el->l_recs[i].e_cpos) +
le32_to_cpu(el->l_recs[i].e_clusters);
......@@ -3359,10 +3476,11 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
ocfs2_reinit_path(path, 1);
/*
* Only loop if we still have allocation.
* The check above will catch the case where we've truncated
* away all allocation.
*/
if (OCFS2_I(inode)->ip_clusters)
goto start;
bail:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
......@@ -3414,22 +3532,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
"%llu\n", fe->i_clusters, new_i_clusters,
(unsigned long long)fe->i_size);
if (!ocfs2_sparse_alloc(osb) &&
le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
"%u and size %llu whereas struct inode has "
"cluster count %u and size %llu which caused an "
"invalid truncate to %u clusters.",
(unsigned long long)le64_to_cpu(fe->i_blkno),
le32_to_cpu(fe->i_clusters),
(unsigned long long)le64_to_cpu(fe->i_size),
OCFS2_I(inode)->ip_clusters, i_size_read(inode),
new_i_clusters);
mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
status = -EIO;
goto bail;
}
*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
if (!(*tc)) {
status = -ENOMEM;
......
......@@ -344,18 +344,6 @@ static int ocfs2_truncate_file(struct inode *inode,
}
ocfs2_data_unlock(inode, 1);
if (le32_to_cpu(fe->i_clusters) ==
ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
fe->i_clusters);
/* No allocation change is required, so lets fast path
* this truncate. */
status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
if (status < 0)
mlog_errno(status);
goto bail;
}
/* alright, we're going to need to do a full blown alloc size
* change. Orphan the inode so that recovery can complete the
* truncate if necessary. This does the task of marking
......@@ -785,7 +773,7 @@ static int ocfs2_extend_file(struct inode *inode,
size_t tail_to_skip)
{
int ret = 0;
u32 clusters_to_add;
u32 clusters_to_add = 0;
BUG_ON(!tail_to_skip && !di_bh);
......@@ -797,6 +785,11 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
BUG_ON(new_i_size < i_size_read(inode));
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
BUG_ON(tail_to_skip != 0);
goto out_update_size;
}
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
OCFS2_I(inode)->ip_clusters;
......@@ -832,6 +825,7 @@ static int ocfs2_extend_file(struct inode *inode,
goto out_unlock;
}
out_update_size:
if (!tail_to_skip) {
/* We're being called from ocfs2_setattr() which wants
* us to update i_size */
......@@ -841,6 +835,7 @@ static int ocfs2_extend_file(struct inode *inode,
}
out_unlock:
if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
ocfs2_data_unlock(inode, 1);
out:
......@@ -1097,6 +1092,14 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
} else {
saved_pos = *ppos;
}
/*
* The rest of this loop is concerned with legacy file
* systems which don't support sparse files.
*/
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
break;
newsize = count + saved_pos;
mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
......
......@@ -487,7 +487,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct buffer_head *fe_bh)
{
int status = 0;
handle_t *handle = NULL;
struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe;
......@@ -495,41 +494,20 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
fe = (struct ocfs2_dinode *) fe_bh->b_data;
/* zero allocation, zero truncate :) */
if (!fe->i_clusters)
goto bail;
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
mlog_errno(status);
goto bail;
}
status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
if (status < 0) {
mlog_errno(status);
goto bail;
}
ocfs2_commit_trans(osb, handle);
handle = NULL;
if (fe->i_clusters) {
status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
if (status < 0) {
mlog_errno(status);
goto bail;
goto out;
}
status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
if (status < 0) {
mlog_errno(status);
goto bail;
goto out;
}
bail:
if (handle)
ocfs2_commit_trans(osb, handle);
}
out:
mlog_exit(status);
return status;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册