From 363041a5f74b953ab6b705ac9c88e5eda218a24b Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 17 Jan 2007 12:31:35 -0800 Subject: [PATCH] ocfs2: temporarily remove extent map caching The code in extent_map.c is not prepared to deal with a subtree being rotated between lookups. This can happen when filling holes in sparse files. Instead of a lengthy patch to update the code (which would likely lose the benefit of caching subtree roots), we remove most of the algorithms and implement a simple path based lookup. A less ambitious extent caching scheme will be added in a later patch. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 5 +- fs/ocfs2/alloc.h | 3 + fs/ocfs2/aops.c | 8 +- fs/ocfs2/dir.c | 2 +- fs/ocfs2/dlmglue.c | 4 - fs/ocfs2/extent_map.c | 1024 ++++------------------------------------- fs/ocfs2/extent_map.h | 19 +- fs/ocfs2/inode.c | 6 +- fs/ocfs2/inode.h | 1 - fs/ocfs2/journal.c | 3 +- fs/ocfs2/namei.c | 3 +- fs/ocfs2/ocfs2.h | 5 - fs/ocfs2/slot_map.c | 2 +- fs/ocfs2/super.c | 7 - 14 files changed, 96 insertions(+), 996 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a96696867576..85a05f120249 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1146,9 +1146,8 @@ static void find_leaf_ins(void *data, struct buffer_head *bh) * * This function doesn't handle non btree extent lists. */ -static int ocfs2_find_leaf(struct inode *inode, - struct ocfs2_extent_list *root_el, u32 cpos, - struct buffer_head **leaf_bh) +int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, + u32 cpos, struct buffer_head **leaf_bh) { int ret; struct buffer_head *bh = NULL; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index b0880fdb3108..bff2a162b030 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -80,4 +80,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct buffer_head *fe_bh, struct ocfs2_truncate_context *tc); +int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, + u32 cpos, struct buffer_head **leaf_bh); + #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 875c11443817..f3b0cc5cba1a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -158,8 +158,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, if (err) goto bail; - err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, - NULL); + err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL); if (err) { mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " "%llu, NULL)\n", err, inode, (unsigned long long)iblock, @@ -499,8 +498,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) down_read(&OCFS2_I(inode)->ip_alloc_sem); } - err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, - NULL); + err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL); if (!INODE_JOURNAL(inode)) { up_read(&OCFS2_I(inode)->ip_alloc_sem); @@ -574,7 +572,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, /* This figures out the size of the next contiguous block, and * our logical offset */ - ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks); if (ret) { mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 5d211c53a8d8..c91490670ffa 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -379,7 +379,7 @@ int ocfs2_do_extend_dir(struct super_block *sb, status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> (sb->s_blocksize_bits - 9)), - 1, &p_blkno, NULL); + &p_blkno, NULL); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index ca4f0e0e7587..8de6678a340a 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1614,10 +1614,6 @@ static int ocfs2_meta_lock_update(struct inode *inode, * for the inode metadata. */ ocfs2_metadata_cache_purge(inode); - /* will do nothing for inode types that don't use the extent - * map (bitmap files, etc) */ - ocfs2_extent_map_trunc(inode, 0); - if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { mlog(0, "Trusting LVB on inode %llu\n", (unsigned long long)oi->ip_blkno); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 80ac69f11d9f..3b4322fd369a 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -3,8 +3,7 @@ * * extent_map.c * - * In-memory extent map for OCFS2. Man, this code was prettier in - * the library. + * Block/Cluster mapping functions * * Copyright (C) 2004 Oracle. All rights reserved. * @@ -26,1016 +25,155 @@ #include #include #include -#include -#include #define MLOG_MASK_PREFIX ML_EXTENT_MAP #include #include "ocfs2.h" +#include "alloc.h" #include "extent_map.h" #include "inode.h" #include "super.h" #include "buffer_head_io.h" - -/* - * SUCK SUCK SUCK - * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h - */ - -struct ocfs2_extent_map_entry { - struct rb_node e_node; - int e_tree_depth; - struct ocfs2_extent_rec e_rec; -}; - -struct ocfs2_em_insert_context { - int need_left; - int need_right; - struct ocfs2_extent_map_entry *new_ent; - struct ocfs2_extent_map_entry *old_ent; - struct ocfs2_extent_map_entry *left_ent; - struct ocfs2_extent_map_entry *right_ent; -}; - -static struct kmem_cache *ocfs2_em_ent_cachep = NULL; - - -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent); -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth); -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent); -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el); -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_map_entry **ret_ent); -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt); - -/* returns 1 only if the rec contains all the given clusters -- that is that - * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + - * clusters) is >= the argument's endpoint */ -static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, - u32 cpos, u32 clusters) -{ - if (le32_to_cpu(rec->e_cpos) > cpos) - return 0; - if (cpos + clusters > le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) - return 0; - return 1; -} - - /* - * Find an entry in the tree that intersects the region passed in. - * Note that this will find straddled intervals, it is up to the - * callers to enforce any boundary conditions. - * - * Callers must hold ip_lock. This lookup is not guaranteed to return - * a tree_depth 0 match, and as such can race inserts if the lock - * were not held. + * Return the index of the extent record which contains cluster #v_cluster. + * -1 is returned if it was not found. * - * The rb_node garbage lets insertion share the search. Trivial - * callers pass NULL. + * Should work fine on interior and exterior nodes. */ -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent) +static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, + u32 v_cluster) { - struct rb_node **p = &em->em_extents.rb_node; - struct rb_node *parent = NULL; - struct ocfs2_extent_map_entry *ent = NULL; - - while (*p) - { - parent = *p; - ent = rb_entry(parent, struct ocfs2_extent_map_entry, - e_node); - if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { - p = &(*p)->rb_left; - ent = NULL; - } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters))) { - p = &(*p)->rb_right; - ent = NULL; - } else - break; - } - - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; - return ent; -} - -/* - * Find the leaf containing the interval we want. While we're on our - * way down the tree, fill in every record we see at any depth, because - * we might want it later. - * - * Note that this code is run without ip_lock. That's because it - * sleeps while reading. If someone is also filling the extent list at - * the same time we are, we might have to restart. - */ -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el) -{ - int i, ret; - struct buffer_head *eb_bh = NULL; - u64 blkno; - u32 rec_end; - struct ocfs2_extent_block *eb; + int ret = -1; + int i; struct ocfs2_extent_rec *rec; + u32 rec_end, rec_start; - /* - * The bh data containing the el cannot change here, because - * we hold alloc_sem. So we can do this without other - * locks. - */ - while (el->l_tree_depth) - { - blkno = 0; - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - rec_end = (le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)); - - ret = -EBADR; - if (rec_end > OCFS2_I(inode)->ip_clusters) { - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", - i, - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_clusters); - goto out_free; - } - - if (rec_end <= cpos) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } - if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } - - /* - * We've found a record that matches our - * interval. We don't insert it because we're - * about to traverse it. - */ - - /* Check to see if we're stradling */ - ret = -ESRCH; - if (!ocfs2_extent_rec_contains_clusters(rec, - cpos, - clusters)) { - mlog_errno(ret); - goto out_free; - } - - /* - * If we've already found a record, the el has - * two records covering the same interval. - * EEEK! - */ - ret = -EBADR; - if (blkno) { - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n", - cpos, clusters, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)blkno, i, - (unsigned long long)le64_to_cpu(rec->e_blkno)); - goto out_free; - } - - blkno = le64_to_cpu(rec->e_blkno); - } - - /* - * We don't support holes, and we're still up - * in the branches, so we'd better have found someone - */ - ret = -EBADR; - if (!blkno) { - ocfs2_error(inode->i_sb, - "No record found for (cpos = %u, clusters = %u) on inode %llu\n", - cpos, clusters, - (unsigned long long)OCFS2_I(inode)->ip_blkno); - mlog_errno(ret); - goto out_free; - } - - if (eb_bh) { - brelse(eb_bh); - eb_bh = NULL; - } - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - blkno, &eb_bh, OCFS2_BH_CACHED, - inode); - if (ret) { - mlog_errno(ret); - goto out_free; - } - eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EIO; - goto out_free; - } - el = &eb->h_list; - } - - BUG_ON(el->l_tree_depth); - - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { rec = &el->l_recs[i]; - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > - OCFS2_I(inode)->ip_clusters) { - ret = -EBADR; - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", - i, - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_clusters); - return ret; - } + rec_start = le32_to_cpu(rec->e_cpos); + rec_end = rec_start + le32_to_cpu(rec->e_clusters); - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; + if (v_cluster >= rec_start && v_cluster < rec_end) { + ret = i; + break; } } - ret = 0; - -out_free: - if (eb_bh) - brelse(eb_bh); - return ret; } -/* - * This lookup actually will read from disk. It has one invariant: - * It will never re-traverse blocks. This means that all inserts should - * be new regions or more granular regions (both allowed by insert). - */ -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, - u32 clusters, - struct ocfs2_extent_map_entry **ret_ent) +static int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters) { - int ret; - u64 blkno; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct buffer_head *bh = NULL; - struct ocfs2_extent_block *eb; + int ret, i; + struct buffer_head *di_bh = NULL; + struct buffer_head *eb_bh = NULL; struct ocfs2_dinode *di; + struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + u32 coff; - spin_lock(&OCFS2_I(inode)->ip_lock); - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (ent) { - if (!ent->e_tree_depth) { - spin_unlock(&OCFS2_I(inode)->ip_lock); - *ret_ent = ent; - return 0; - } - blkno = le64_to_cpu(ent->e_rec.e_blkno); - spin_unlock(&OCFS2_I(inode)->ip_lock); - - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; - } - eb = (struct ocfs2_extent_block *)bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - brelse(bh); - return -EIO; - } - el = &eb->h_list; - } else { - spin_unlock(&OCFS2_I(inode)->ip_lock); - - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; - } - di = (struct ocfs2_dinode *)bh->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - brelse(bh); - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); - return -EIO; - } - el = &di->id2.i_list; - } - - ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); - brelse(bh); + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, + &di_bh, OCFS2_BH_CACHED, inode); if (ret) { mlog_errno(ret); - return ret; + goto out; } - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (!ent) { - ret = -ESRCH; - mlog_errno(ret); - return ret; - } - - /* FIXME: Make sure this isn't a corruption */ - BUG_ON(ent->e_tree_depth); + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; - *ret_ent = ent; - - return 0; -} - -/* - * Callers must hold ip_lock. This can insert pieces of the tree, - * thus racing lookup if the lock weren't held. - */ -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent) -{ - struct rb_node **p, *parent; - struct ocfs2_extent_map_entry *old_ent; - - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), - le32_to_cpu(ent->e_rec.e_clusters), - &p, &parent); - if (old_ent) - return -EEXIST; - - rb_link_node(&ent->e_node, parent, p); - rb_insert_color(&ent->e_node, &em->em_extents); - - return 0; -} - - -/* - * Simple rule: on any return code other than -EAGAIN, anything left - * in the insert_context will be freed. - * - * Simple rule #2: A return code of -EEXIST from this function or - * its calls to ocfs2_extent_map_insert_entry() signifies that another - * thread beat us to the insert. It is not an actual error, but it - * tells the caller we have no more work to do. - */ -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt) -{ - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *old_ent; - - ctxt->need_left = 0; - ctxt->need_right = 0; - ctxt->old_ent = NULL; - - spin_lock(&OCFS2_I(inode)->ip_lock); - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); - if (!ret) { - ctxt->new_ent = NULL; - goto out_unlock; - } - - /* Since insert_entry failed, the map MUST have old_ent */ - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), - le32_to_cpu(rec->e_clusters), - NULL, NULL); - - BUG_ON(!old_ent); + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } - if (old_ent->e_tree_depth < tree_depth) { - /* Another thread beat us to the lower tree_depth */ - ret = -EEXIST; - goto out_unlock; + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; } - if (old_ent->e_tree_depth == tree_depth) { + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { /* - * Another thread beat us to this tree_depth. - * Let's make sure we agree with that thread (the - * extent_rec should be identical). + * A hole was found. Return some canned values that + * callers can key on. */ - if (!memcmp(rec, &old_ent->e_rec, - sizeof(struct ocfs2_extent_rec))) - ret = 0; - else - /* FIXME: Should this be ESRCH/EBADR??? */ - ret = -EEXIST; + *p_cluster = 0; + if (num_clusters) + *num_clusters = 1; + } else { + rec = &el->l_recs[i]; - goto out_unlock; - } + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); - /* - * We do it in this order specifically so that no actual tree - * changes occur until we have all the pieces we need. We - * don't want malloc failures to leave an inconsistent tree. - * Whenever we drop the lock, another process could be - * inserting. Also note that, if another process just beat us - * to an insert, we might not need the same pieces we needed - * the first go round. In the end, the pieces we need will - * be used, and the pieces we don't will be freed. - */ - ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > - le32_to_cpu(old_ent->e_rec.e_cpos)); - ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) > - (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); - ret = -EAGAIN; - if (ctxt->need_left) { - if (!ctxt->left_ent) - goto out_unlock; - *(ctxt->left_ent) = *old_ent; - ctxt->left_ent->e_rec.e_clusters = - cpu_to_le32(le32_to_cpu(rec->e_cpos) - - le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); - } - if (ctxt->need_right) { - if (!ctxt->right_ent) - goto out_unlock; - *(ctxt->right_ent) = *old_ent; - ctxt->right_ent->e_rec.e_cpos = - cpu_to_le32(le32_to_cpu(rec->e_cpos) + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters)); - ctxt->right_ent->e_rec.e_clusters = - cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) - - le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); - } - - rb_erase(&old_ent->e_node, &em->em_extents); - /* Now that he's erased, set him up for deletion */ - ctxt->old_ent = old_ent; - - if (ctxt->need_left) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->left_ent); - if (ret) - goto out_unlock; - ctxt->left_ent = NULL; - } - - if (ctxt->need_right) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->right_ent); - if (ret) - goto out_unlock; - ctxt->right_ent = NULL; - } - - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); - - if (!ret) - ctxt->new_ent = NULL; - -out_unlock: - spin_unlock(&OCFS2_I(inode)->ip_lock); - - return ret; -} - - -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth) -{ - int ret; - struct ocfs2_em_insert_context ctxt = {0, }; - - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; - mlog_errno(ret); - return ret; - } - - /* Zero e_clusters means a truncated tail record. It better be EOF */ - if (!rec->e_clusters) { - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n", - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno); - return ret; - } - - /* Ignore the truncated tail */ - return 0; - } - - ret = -ENOMEM; - ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.new_ent) { - mlog_errno(ret); - return ret; - } - - ctxt.new_ent->e_rec = *rec; - ctxt.new_ent->e_tree_depth = tree_depth; - - do { - ret = -ENOMEM; - if (ctxt.need_left && !ctxt.left_ent) { - ctxt.left_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.left_ent) - break; - } - if (ctxt.need_right && !ctxt.right_ent) { - ctxt.right_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.right_ent) - break; + ret = -EROFS; + goto out; } - ret = ocfs2_extent_map_try_insert(inode, rec, - tree_depth, &ctxt); - } while (ret == -EAGAIN); + coff = v_cluster - le32_to_cpu(rec->e_cpos); - if ((ret < 0) && (ret != -EEXIST)) - mlog_errno(ret); - - if (ctxt.left_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); - if (ctxt.right_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); - if (ctxt.old_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); - if (ctxt.new_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); - - return ret; -} + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; -/* - * Append this record to the tail of the extent map. It must be - * tree_depth 0. The record might be an extension of an existing - * record, and as such that needs to be handled. eg: - * - * Existing record in the extent map: - * - * cpos = 10, len = 10 - * |---------| - * - * New Record: - * - * cpos = 10, len = 20 - * |------------------| - * - * The passed record is the new on-disk record. The new_clusters value - * is how many clusters were added to the file. If the append is a - * contiguous append, the new_clusters has been added to - * rec->e_clusters. If the append is an entirely new extent, then - * rec->e_clusters is == new_clusters. - */ -int ocfs2_extent_map_append(struct inode *inode, - struct ocfs2_extent_rec *rec, - u32 new_clusters) -{ - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct ocfs2_extent_rec *old; - - BUG_ON(!new_clusters); - BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); - - if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; + if (num_clusters) + *num_clusters = le32_to_cpu(rec->e_clusters) - coff; } - mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (em->em_clusters + new_clusters), - "Inode %llu:\n" - "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" - "em->em_clusters = %u + new_clusters = %u = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), - le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), - em->em_clusters, new_clusters, - em->em_clusters + new_clusters); - - em->em_clusters += new_clusters; - - ret = -ENOENT; - if (le32_to_cpu(rec->e_clusters) > new_clusters) { - /* This is a contiguous append */ - ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, - NULL, NULL); - if (ent) { - old = &ent->e_rec; - BUG_ON((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (le32_to_cpu(old->e_cpos) + - le32_to_cpu(old->e_clusters) + - new_clusters)); - if (ent->e_tree_depth == 0) { - BUG_ON(le32_to_cpu(old->e_cpos) != - le32_to_cpu(rec->e_cpos)); - BUG_ON(le64_to_cpu(old->e_blkno) != - le64_to_cpu(rec->e_blkno)); - ret = 0; - } - /* - * Let non-leafs fall through as -ENOENT to - * force insertion of the new leaf. - */ - le32_add_cpu(&old->e_clusters, new_clusters); - } - } - - if (ret == -ENOENT) - ret = ocfs2_extent_map_insert(inode, rec, 0); - if (ret < 0) - mlog_errno(ret); +out: + brelse(di_bh); + brelse(eb_bh); return ret; } -#if 0 -/* Code here is included but defined out as it completes the extent - * map api and may be used in the future. */ - /* - * Look up the record containing this cluster offset. This record is - * part of the extent map. Do not free it. Any changes you make to - * it will reflect in the extent map. So, if your last extent - * is (cpos = 10, clusters = 10) and you truncate the file by 5 - * clusters, you can do: - * - * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); - * rec->e_clusters -= 5; - * - * The lookup does not read from disk. If the map isn't filled in for - * an entry, you won't find it. - * - * Also note that the returned record is valid until alloc_sem is - * dropped. After that, truncate and extend can happen. Caveat Emptor. + * This expects alloc_sem to be held. The allocation cannot change at + * all while the map is in the process of being updated. */ -int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, - struct ocfs2_extent_rec **rec, - int *tree_depth) -{ - int ret = -ENOENT; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - - *rec = NULL; - - if (cpos >= OCFS2_I(inode)->ip_clusters) - return -EINVAL; - - if (cpos >= em->em_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters ; - } - - ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, - NULL, NULL); - - if (ent) { - *rec = &ent->e_rec; - if (tree_depth) - *tree_depth = ent->e_tree_depth; - ret = 0; - } - - return ret; -} - -int ocfs2_extent_map_get_clusters(struct inode *inode, - u32 v_cpos, int count, - u32 *p_cpos, int *ret_count) -{ - int ret; - u32 coff, ccount; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent = NULL; - - *p_cpos = ccount = 0; - - if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) - return -EINVAL; - - if ((v_cpos + count) > em->em_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } - - - ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); - if (ret) - return ret; - - if (ent) { - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, - v_cpos, - count)) - return -ESRCH; - - coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); - *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, - le64_to_cpu(ent->e_rec.e_blkno)) + - coff; - - if (ret_count) - *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; - - return 0; - } - - - return -ENOENT; -} - -#endif /* 0 */ - -int ocfs2_extent_map_get_blocks(struct inode *inode, - u64 v_blkno, int count, - u64 *p_blkno, int *ret_count) +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + int *ret_count) { int ret; - u64 boff; - u32 cpos, clusters; int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - struct ocfs2_extent_map_entry *ent = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_rec *rec; - - *p_blkno = 0; + u32 cpos, num_clusters, p_cluster; + u64 boff = 0; cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); - clusters = ocfs2_blocks_to_clusters(inode->i_sb, - (u64)count + bpc - 1); - if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { - ret = -EINVAL; - mlog_errno(ret); - return ret; - } - if ((cpos + clusters) > em->em_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } - - ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters); if (ret) { mlog_errno(ret); - return ret; + goto out; } - if (ent) - { - rec = &ent->e_rec; - - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { - ret = -ESRCH; - mlog_errno(ret); - return ret; - } - - boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - - le32_to_cpu(rec->e_cpos)); + /* + * p_cluster == 0 indicates a hole. + */ + if (p_cluster) { + boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); boff += (v_blkno & (u64)(bpc - 1)); - *p_blkno = le64_to_cpu(rec->e_blkno) + boff; - - if (ret_count) { - *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(rec->e_clusters)) - boff; - } - - return 0; } - return -ENOENT; -} - -int ocfs2_extent_map_init(struct inode *inode) -{ - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + *p_blkno = boff; - em->em_extents = RB_ROOT; - em->em_clusters = 0; - - return 0; -} - -/* Needs the lock */ -static void __ocfs2_extent_map_drop(struct inode *inode, - u32 new_clusters, - struct rb_node **free_head, - struct ocfs2_extent_map_entry **tail_ent) -{ - struct rb_node *node, *next; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - - *free_head = NULL; - - ent = NULL; - node = rb_last(&em->em_extents); - while (node) - { - next = rb_prev(node); - - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) - break; - - rb_erase(&ent->e_node, &em->em_extents); - - node->rb_right = *free_head; - *free_head = node; - - ent = NULL; - node = next; - } - - /* Do we have an entry straddling new_clusters? */ - if (tail_ent) { - if (ent && - ((le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) - *tail_ent = ent; - else - *tail_ent = NULL; + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + *ret_count -= v_blkno & (u64)(bpc - 1); } -} - -static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) -{ - struct rb_node *node; - struct ocfs2_extent_map_entry *ent; - - while (free_head) { - node = free_head; - free_head = node->rb_right; - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - kmem_cache_free(ocfs2_em_ent_cachep, ent); - } -} - -/* - * Remove all entries past new_clusters, inclusive of an entry that - * contains new_clusters. This is effectively a cache forget. - * - * If you want to also clip the last extent by some number of clusters, - * you need to call ocfs2_extent_map_trunc(). - * This code does not check or modify ip_clusters. - */ -int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) -{ - struct rb_node *free_head = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - - spin_lock(&OCFS2_I(inode)->ip_lock); - - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); - - if (ent) { - rb_erase(&ent->e_node, &em->em_extents); - ent->e_node.rb_right = free_head; - free_head = &ent->e_node; - } - - spin_unlock(&OCFS2_I(inode)->ip_lock); - - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); - - return 0; -} - -/* - * Remove all entries past new_clusters and also clip any extent - * straddling new_clusters, if there is one. This does not check - * or modify ip_clusters - */ -int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) -{ - struct rb_node *free_head = NULL; - struct ocfs2_extent_map_entry *ent = NULL; - - spin_lock(&OCFS2_I(inode)->ip_lock); - - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); - - if (ent) - ent->e_rec.e_clusters = cpu_to_le32(new_clusters - - le32_to_cpu(ent->e_rec.e_cpos)); - - OCFS2_I(inode)->ip_map.em_clusters = new_clusters; - - spin_unlock(&OCFS2_I(inode)->ip_lock); - - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); - - return 0; -} - -int __init init_ocfs2_extent_maps(void) -{ - ocfs2_em_ent_cachep = - kmem_cache_create("ocfs2_em_ent", - sizeof(struct ocfs2_extent_map_entry), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!ocfs2_em_ent_cachep) - return -ENOMEM; - - return 0; -} - -void exit_ocfs2_extent_maps(void) -{ - kmem_cache_destroy(ocfs2_em_ent_cachep); +out: + return ret; } diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index fa3745efa886..036e23251448 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -25,22 +25,7 @@ #ifndef _EXTENT_MAP_H #define _EXTENT_MAP_H -int init_ocfs2_extent_maps(void); -void exit_ocfs2_extent_maps(void); - -/* - * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem - * to be held. The allocation cannot change at all while the map is - * in the process of being updated. - */ -int ocfs2_extent_map_init(struct inode *inode); -int ocfs2_extent_map_append(struct inode *inode, - struct ocfs2_extent_rec *rec, - u32 new_clusters); -int ocfs2_extent_map_get_blocks(struct inode *inode, - u64 v_blkno, int count, - u64 *p_blkno, int *ret_count); -int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); -int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + int *ret_count); #endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 08d57a3d4e83..5ff8549eb1a3 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1003,9 +1003,6 @@ void ocfs2_clear_inode(struct inode *inode) "Clear inode of %llu, inode has io markers\n", (unsigned long long)oi->ip_blkno); - ocfs2_extent_map_drop(inode, 0); - ocfs2_extent_map_init(inode); - status = ocfs2_drop_inode_locks(inode); if (status < 0) mlog_errno(status); @@ -1102,8 +1099,7 @@ struct buffer_head *ocfs2_bread(struct inode *inode, return NULL; } - tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, - &p_blkno, NULL); + tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL); if (tmperr < 0) { mlog_errno(tmperr); goto fail; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 042ae20a7132..a9ced009cb9c 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -43,7 +43,6 @@ struct ocfs2_inode_info spinlock_t ip_lock; u32 ip_open_count; u32 ip_clusters; - struct ocfs2_extent_map ip_map; struct list_head ip_io_markers; struct mutex ip_io_mutex; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 12445a31f733..2e2e04fe9738 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -670,8 +670,7 @@ static int ocfs2_force_read_journal(struct inode *inode) (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { status = ocfs2_extent_map_get_blocks(inode, v_blkno, - 1, &p_blkno, - &p_blocks); + &p_blkno, &p_blocks); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d65fef4a8bd8..5755e0748256 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1511,8 +1511,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, goto bail; } - status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, - &p_blocks); + status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fe7e1ecafca5..faeb53f2eecf 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -46,11 +46,6 @@ #include "endian.h" #include "ocfs2_lockid.h" -struct ocfs2_extent_map { - u32 em_clusters; - struct rb_root em_extents; -}; - /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 2d3ac32cb74e..f4416e7330e1 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) goto bail; } - status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); + status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 16564ea6c141..6ab52351943a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -806,9 +806,6 @@ static int __init ocfs2_init(void) ocfs2_print_version(); - if (init_ocfs2_extent_maps()) - return -ENOMEM; - status = init_ocfs2_uptodate_cache(); if (status < 0) { mlog_errno(status); @@ -837,7 +834,6 @@ static int __init ocfs2_init(void) if (status < 0) { ocfs2_free_mem_caches(); exit_ocfs2_uptodate_cache(); - exit_ocfs2_extent_maps(); } mlog_exit(status); @@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void) unregister_filesystem(&ocfs2_fs_type); - exit_ocfs2_extent_maps(); - exit_ocfs2_uptodate_cache(); mlog_exit_void(); @@ -948,7 +942,6 @@ static void ocfs2_inode_init_once(void *data, oi->ip_flags = 0; oi->ip_open_count = 0; spin_lock_init(&oi->ip_lock); - ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); oi->ip_created_trans = 0; oi->ip_last_trans = 0; -- GitLab