提交 8f616cd5 编写于 作者: L Linus Torvalds

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: remove write-only variables from ext4_ordered_write_end
  ext4: unexport jbd2_journal_update_superblock
  ext4: Cleanup whitespace and other miscellaneous style issues
  ext4: improve ext4_fill_flex_info() a bit
  ext4: Cleanup the block reservation code path
  ext4: don't assume extents can't cross block groups when truncating
  ext4: Fix lack of credits BUG() when deleting a badly fragmented inode
  ext4: Fix ext4_ext_journal_restart()
  ext4: fix ext4_da_write_begin error path
  jbd2: don't abort if flushing file data failed
  ext4: don't read inode block if the buffer has a write error
  ext4: Don't allow lg prealloc list to be grow large.
  ext4: Convert the usage of NR_CPUS to nr_cpu_ids.
  ext4: Improve error handling in mballoc
  ext4: lock block groups when initializing
  ext4: sync up block and inode bitmap reading functions
  ext4: Allow read/only mounts with corrupted block group checksums
  ext4: Fix data corruption when writing to prealloc area
...@@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size) ...@@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size)
acl = posix_acl_alloc(count, GFP_NOFS); acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl) if (!acl)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
for (n=0; n < count; n++) { for (n = 0; n < count; n++) {
ext4_acl_entry *entry = ext4_acl_entry *entry =
(ext4_acl_entry *)value; (ext4_acl_entry *)value;
if ((char *)value + sizeof(ext4_acl_entry_short) > end) if ((char *)value + sizeof(ext4_acl_entry_short) > end)
goto fail; goto fail;
acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
switch(acl->a_entries[n].e_tag) {
case ACL_USER_OBJ: switch (acl->a_entries[n].e_tag) {
case ACL_GROUP_OBJ: case ACL_USER_OBJ:
case ACL_MASK: case ACL_GROUP_OBJ:
case ACL_OTHER: case ACL_MASK:
value = (char *)value + case ACL_OTHER:
sizeof(ext4_acl_entry_short); value = (char *)value +
acl->a_entries[n].e_id = ACL_UNDEFINED_ID; sizeof(ext4_acl_entry_short);
break; acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
break;
case ACL_USER:
case ACL_GROUP: case ACL_USER:
value = (char *)value + sizeof(ext4_acl_entry); case ACL_GROUP:
if ((char *)value > end) value = (char *)value + sizeof(ext4_acl_entry);
goto fail; if ((char *)value > end)
acl->a_entries[n].e_id =
le32_to_cpu(entry->e_id);
break;
default:
goto fail; goto fail;
acl->a_entries[n].e_id =
le32_to_cpu(entry->e_id);
break;
default:
goto fail;
} }
} }
if (value != end) if (value != end)
...@@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) ...@@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
e = (char *)ext_acl + sizeof(ext4_acl_header); e = (char *)ext_acl + sizeof(ext4_acl_header);
for (n=0; n < acl->a_count; n++) { for (n = 0; n < acl->a_count; n++) {
ext4_acl_entry *entry = (ext4_acl_entry *)e; ext4_acl_entry *entry = (ext4_acl_entry *)e;
entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
switch(acl->a_entries[n].e_tag) { switch (acl->a_entries[n].e_tag) {
case ACL_USER: case ACL_USER:
case ACL_GROUP: case ACL_GROUP:
entry->e_id = entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
cpu_to_le32(acl->a_entries[n].e_id); e += sizeof(ext4_acl_entry);
e += sizeof(ext4_acl_entry); break;
break;
case ACL_USER_OBJ:
case ACL_USER_OBJ: case ACL_GROUP_OBJ:
case ACL_GROUP_OBJ: case ACL_MASK:
case ACL_MASK: case ACL_OTHER:
case ACL_OTHER: e += sizeof(ext4_acl_entry_short);
e += sizeof(ext4_acl_entry_short); break;
break;
default:
default: goto fail;
goto fail;
} }
} }
return (char *)ext_acl; return (char *)ext_acl;
...@@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type) ...@@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type)
if (!test_opt(inode->i_sb, POSIX_ACL)) if (!test_opt(inode->i_sb, POSIX_ACL))
return NULL; return NULL;
switch(type) { switch (type) {
case ACL_TYPE_ACCESS: case ACL_TYPE_ACCESS:
acl = ext4_iget_acl(inode, &ei->i_acl); acl = ext4_iget_acl(inode, &ei->i_acl);
if (acl != EXT4_ACL_NOT_CACHED) if (acl != EXT4_ACL_NOT_CACHED)
return acl; return acl;
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
break; break;
case ACL_TYPE_DEFAULT: case ACL_TYPE_DEFAULT:
acl = ext4_iget_acl(inode, &ei->i_default_acl); acl = ext4_iget_acl(inode, &ei->i_default_acl);
if (acl != EXT4_ACL_NOT_CACHED) if (acl != EXT4_ACL_NOT_CACHED)
return acl; return acl;
name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
break; break;
default: default:
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
retval = ext4_xattr_get(inode, name_index, "", NULL, 0); retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
if (retval > 0) { if (retval > 0) {
...@@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type) ...@@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type)
kfree(value); kfree(value);
if (!IS_ERR(acl)) { if (!IS_ERR(acl)) {
switch(type) { switch (type) {
case ACL_TYPE_ACCESS: case ACL_TYPE_ACCESS:
ext4_iset_acl(inode, &ei->i_acl, acl); ext4_iset_acl(inode, &ei->i_acl, acl);
break; break;
case ACL_TYPE_DEFAULT: case ACL_TYPE_DEFAULT:
ext4_iset_acl(inode, &ei->i_default_acl, acl); ext4_iset_acl(inode, &ei->i_default_acl, acl);
break; break;
} }
} }
return acl; return acl;
...@@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, ...@@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
if (S_ISLNK(inode->i_mode)) if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP; return -EOPNOTSUPP;
switch(type) { switch (type) {
case ACL_TYPE_ACCESS: case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) { if (acl) {
mode_t mode = inode->i_mode; mode_t mode = inode->i_mode;
error = posix_acl_equiv_mode(acl, &mode); error = posix_acl_equiv_mode(acl, &mode);
if (error < 0) if (error < 0)
return error; return error;
else { else {
inode->i_mode = mode; inode->i_mode = mode;
ext4_mark_inode_dirty(handle, inode); ext4_mark_inode_dirty(handle, inode);
if (error == 0) if (error == 0)
acl = NULL; acl = NULL;
}
} }
break; }
break;
case ACL_TYPE_DEFAULT: case ACL_TYPE_DEFAULT:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode)) if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0; return acl ? -EACCES : 0;
break; break;
default: default:
return -EINVAL; return -EINVAL;
} }
if (acl) { if (acl) {
value = ext4_acl_to_disk(acl, &size); value = ext4_acl_to_disk(acl, &size);
...@@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, ...@@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
kfree(value); kfree(value);
if (!error) { if (!error) {
switch(type) { switch (type) {
case ACL_TYPE_ACCESS: case ACL_TYPE_ACCESS:
ext4_iset_acl(inode, &ei->i_acl, acl); ext4_iset_acl(inode, &ei->i_acl, acl);
break; break;
case ACL_TYPE_DEFAULT: case ACL_TYPE_DEFAULT:
ext4_iset_acl(inode, &ei->i_default_acl, acl); ext4_iset_acl(inode, &ei->i_default_acl, acl);
break; break;
} }
} }
return error; return error;
......
...@@ -314,25 +314,28 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) ...@@ -314,25 +314,28 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
if (unlikely(!bh)) { if (unlikely(!bh)) {
ext4_error(sb, __func__, ext4_error(sb, __func__,
"Cannot read block bitmap - " "Cannot read block bitmap - "
"block_group = %d, block_bitmap = %llu", "block_group = %lu, block_bitmap = %llu",
(int)block_group, (unsigned long long)bitmap_blk); block_group, bitmap_blk);
return NULL; return NULL;
} }
if (bh_uptodate_or_lock(bh)) if (bh_uptodate_or_lock(bh))
return bh; return bh;
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc); ext4_init_block_bitmap(sb, bh, block_group, desc);
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
unlock_buffer(bh); unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
return bh; return bh;
} }
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (bh_submit_read(bh) < 0) { if (bh_submit_read(bh) < 0) {
put_bh(bh); put_bh(bh);
ext4_error(sb, __func__, ext4_error(sb, __func__,
"Cannot read block bitmap - " "Cannot read block bitmap - "
"block_group = %d, block_bitmap = %llu", "block_group = %lu, block_bitmap = %llu",
(int)block_group, (unsigned long long)bitmap_blk); block_group, bitmap_blk);
return NULL; return NULL;
} }
ext4_valid_block_bitmap(sb, desc, block_group, bh); ext4_valid_block_bitmap(sb, desc, block_group, bh);
......
...@@ -1044,7 +1044,6 @@ extern void ext4_mb_update_group_info(struct ext4_group_info *grp, ...@@ -1044,7 +1044,6 @@ extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
/* inode.c */ /* inode.c */
void ext4_da_release_space(struct inode *inode, int used, int to_free);
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *bh, ext4_fsblk_t blocknr);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, struct buffer_head *ext4_getblk(handle_t *, struct inode *,
......
...@@ -99,7 +99,7 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) ...@@ -99,7 +99,7 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
if (handle->h_buffer_credits > needed) if (handle->h_buffer_credits > needed)
return 0; return 0;
err = ext4_journal_extend(handle, needed); err = ext4_journal_extend(handle, needed);
if (err) if (err <= 0)
return err; return err;
return ext4_journal_restart(handle, needed); return ext4_journal_restart(handle, needed);
} }
...@@ -1441,7 +1441,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, ...@@ -1441,7 +1441,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
/* /*
* get the next allocated block if the extent in the path * get the next allocated block if the extent in the path
* is before the requested block(s) * is before the requested block(s)
*/ */
if (b2 < b1) { if (b2 < b1) {
b2 = ext4_ext_next_allocated_block(path); b2 = ext4_ext_next_allocated_block(path);
...@@ -1910,9 +1910,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ...@@ -1910,9 +1910,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
BUG_ON(b != ex_ee_block + ex_ee_len - 1); BUG_ON(b != ex_ee_block + ex_ee_len - 1);
} }
/* at present, extent can't cross block group: */ /*
/* leaf + bitmap + group desc + sb + inode */ * 3 for leaf, sb, and inode plus 2 (bmap and group
credits = 5; * descriptor) for each block group; assume two block
* groups plus ex_ee_len/blocks_per_block_group for
* the worst case
*/
credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
if (ex == EXT_FIRST_EXTENT(eh)) { if (ex == EXT_FIRST_EXTENT(eh)) {
correct_index = 1; correct_index = 1;
credits += (ext_depth(inode)) + 1; credits += (ext_depth(inode)) + 1;
...@@ -2323,7 +2327,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ...@@ -2323,7 +2327,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
unsigned int newdepth; unsigned int newdepth;
/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
if (allocated <= EXT4_EXT_ZERO_LEN) { if (allocated <= EXT4_EXT_ZERO_LEN) {
/* Mark first half uninitialized. /*
* iblock == ee_block is handled by the zerouout
* at the beginning.
* Mark first half uninitialized.
* Mark second half initialized and zero out the * Mark second half initialized and zero out the
* initialized extent * initialized extent
*/ */
...@@ -2346,7 +2353,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ...@@ -2346,7 +2353,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex->ee_len = orig_ex.ee_len; ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth); ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */ /* blocks available from iblock */
return allocated; return allocated;
} else if (err) } else if (err)
...@@ -2374,6 +2381,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ...@@ -2374,6 +2381,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
err = PTR_ERR(path); err = PTR_ERR(path);
return err; return err;
} }
/* get the second half extent details */
ex = path[depth].p_ext; ex = path[depth].p_ext;
err = ext4_ext_get_access(handle, inode, err = ext4_ext_get_access(handle, inode,
path + depth); path + depth);
...@@ -2403,6 +2411,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ...@@ -2403,6 +2411,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth); ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */ /* zeroed the full extent */
/* blocks available from iblock */
return allocated; return allocated;
} else if (err) } else if (err)
...@@ -2418,23 +2427,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ...@@ -2418,23 +2427,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/ */
orig_ex.ee_len = cpu_to_le16(ee_len - orig_ex.ee_len = cpu_to_le16(ee_len -
ext4_ext_get_actual_len(ex3)); ext4_ext_get_actual_len(ex3));
if (newdepth != depth) { depth = newdepth;
depth = newdepth; ext4_ext_drop_refs(path);
ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, iblock, path);
path = ext4_ext_find_extent(inode, iblock, path); if (IS_ERR(path)) {
if (IS_ERR(path)) { err = PTR_ERR(path);
err = PTR_ERR(path); goto out;
goto out;
}
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
if (ex2 != &newex)
ex2 = ex;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
} }
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
if (ex2 != &newex)
ex2 = ex;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
allocated = max_blocks; allocated = max_blocks;
/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
...@@ -2452,6 +2460,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ...@@ -2452,6 +2460,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth); ext4_ext_dirty(handle, inode, path + depth);
/* zero out the first half */ /* zero out the first half */
/* blocks available from iblock */
return allocated; return allocated;
} }
} }
......
...@@ -97,34 +97,44 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, ...@@ -97,34 +97,44 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
* Return buffer_head of bitmap on success or NULL. * Return buffer_head of bitmap on success or NULL.
*/ */
static struct buffer_head * static struct buffer_head *
read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{ {
struct ext4_group_desc *desc; struct ext4_group_desc *desc;
struct buffer_head *bh = NULL; struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
desc = ext4_get_group_desc(sb, block_group, NULL); desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc) if (!desc)
goto error_out; return NULL;
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { bitmap_blk = ext4_inode_bitmap(sb, desc);
bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc)); bh = sb_getblk(sb, bitmap_blk);
if (!buffer_uptodate(bh)) { if (unlikely(!bh)) {
lock_buffer(bh); ext4_error(sb, __func__,
if (!buffer_uptodate(bh)) { "Cannot read inode bitmap - "
ext4_init_inode_bitmap(sb, bh, block_group, "block_group = %lu, inode_bitmap = %llu",
desc); block_group, bitmap_blk);
set_buffer_uptodate(bh); return NULL;
}
unlock_buffer(bh);
}
} else {
bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
} }
if (!bh) if (bh_uptodate_or_lock(bh))
ext4_error(sb, "read_inode_bitmap", return bh;
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
ext4_init_inode_bitmap(sb, bh, block_group, desc);
set_buffer_uptodate(bh);
unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (bh_submit_read(bh) < 0) {
put_bh(bh);
ext4_error(sb, __func__,
"Cannot read inode bitmap - " "Cannot read inode bitmap - "
"block_group = %lu, inode_bitmap = %llu", "block_group = %lu, inode_bitmap = %llu",
block_group, ext4_inode_bitmap(sb, desc)); block_group, bitmap_blk);
error_out: return NULL;
}
return bh; return bh;
} }
...@@ -200,7 +210,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) ...@@ -200,7 +210,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
} }
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = read_inode_bitmap(sb, block_group); bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (!bitmap_bh) if (!bitmap_bh)
goto error_return; goto error_return;
...@@ -623,7 +633,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) ...@@ -623,7 +633,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
goto fail; goto fail;
brelse(bitmap_bh); brelse(bitmap_bh);
bitmap_bh = read_inode_bitmap(sb, group); bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (!bitmap_bh) if (!bitmap_bh)
goto fail; goto fail;
...@@ -728,7 +738,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) ...@@ -728,7 +738,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
/* When marking the block group with /* When marking the block group with
* ~EXT4_BG_INODE_UNINIT we don't want to depend * ~EXT4_BG_INODE_UNINIT we don't want to depend
* on the value of bg_itable_unsed even though * on the value of bg_itable_unused even though
* mke2fs could have initialized the same for us. * mke2fs could have initialized the same for us.
* Instead we calculated the value below * Instead we calculated the value below
*/ */
...@@ -891,7 +901,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) ...@@ -891,7 +901,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = read_inode_bitmap(sb, block_group); bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (!bitmap_bh) { if (!bitmap_bh) {
ext4_warning(sb, __func__, ext4_warning(sb, __func__,
"inode bitmap error for orphan %lu", ino); "inode bitmap error for orphan %lu", ino);
...@@ -969,7 +979,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) ...@@ -969,7 +979,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
continue; continue;
desc_count += le16_to_cpu(gdp->bg_free_inodes_count); desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
brelse(bitmap_bh); brelse(bitmap_bh);
bitmap_bh = read_inode_bitmap(sb, i); bitmap_bh = ext4_read_inode_bitmap(sb, i);
if (!bitmap_bh) if (!bitmap_bh)
continue; continue;
......
...@@ -191,6 +191,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) ...@@ -191,6 +191,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
void ext4_delete_inode (struct inode * inode) void ext4_delete_inode (struct inode * inode)
{ {
handle_t *handle; handle_t *handle;
int err;
if (ext4_should_order_data(inode)) if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0); ext4_begin_ordered_truncate(inode, 0);
...@@ -199,8 +200,9 @@ void ext4_delete_inode (struct inode * inode) ...@@ -199,8 +200,9 @@ void ext4_delete_inode (struct inode * inode)
if (is_bad_inode(inode)) if (is_bad_inode(inode))
goto no_delete; goto no_delete;
handle = start_transaction(inode); handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) { if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/* /*
* If we're going to skip the normal cleanup, we still need to * If we're going to skip the normal cleanup, we still need to
* make sure that the in-core orphan linked list is properly * make sure that the in-core orphan linked list is properly
...@@ -213,8 +215,34 @@ void ext4_delete_inode (struct inode * inode) ...@@ -213,8 +215,34 @@ void ext4_delete_inode (struct inode * inode)
if (IS_SYNC(inode)) if (IS_SYNC(inode))
handle->h_sync = 1; handle->h_sync = 1;
inode->i_size = 0; inode->i_size = 0;
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_warning(inode->i_sb, __func__,
"couldn't mark inode dirty (err %d)", err);
goto stop_handle;
}
if (inode->i_blocks) if (inode->i_blocks)
ext4_truncate(inode); ext4_truncate(inode);
/*
* ext4_ext_truncate() doesn't reserve any slop when it
* restarts journal transactions; therefore there may not be
* enough credits left in the handle to remove the inode from
* the orphan list and set the dtime field.
*/
if (handle->h_buffer_credits < 3) {
err = ext4_journal_extend(handle, 3);
if (err > 0)
err = ext4_journal_restart(handle, 3);
if (err != 0) {
ext4_warning(inode->i_sb, __func__,
"couldn't extend journal (err %d)", err);
stop_handle:
ext4_journal_stop(handle);
goto no_delete;
}
}
/* /*
* Kill off the orphan record which ext4_truncate created. * Kill off the orphan record which ext4_truncate created.
* AKPM: I think this can be inside the above `if'. * AKPM: I think this can be inside the above `if'.
...@@ -952,6 +980,67 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, ...@@ -952,6 +980,67 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
return err; return err;
} }
/*
* Calculate the number of metadata blocks need to reserve
* to allocate @blocks for non extent file based file
*/
static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
{
int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
int ind_blks, dind_blks, tind_blks;
/* number of new indirect blocks needed */
ind_blks = (blocks + icap - 1) / icap;
dind_blks = (ind_blks + icap - 1) / icap;
tind_blks = 1;
return ind_blks + dind_blks + tind_blks;
}
/*
* Calculate the number of metadata blocks need to reserve
* to allocate given number of blocks
*/
static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
{
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
return ext4_ext_calc_metadata_amount(inode, blocks);
return ext4_indirect_calc_metadata_amount(inode, blocks);
}
static void ext4_da_update_reserve_space(struct inode *inode, int used)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int total, mdb, mdb_free;
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
/* recalculate the number of metablocks still need to be reserved */
total = EXT4_I(inode)->i_reserved_data_blocks - used;
mdb = ext4_calc_metadata_amount(inode, total);
/* figure out how many metablocks to release */
BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
/* Account for allocated meta_blocks */
mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
/* update fs free blocks counter for truncate case */
percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
/* update per-inode reservations */
BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
EXT4_I(inode)->i_reserved_data_blocks -= used;
BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
EXT4_I(inode)->i_reserved_meta_blocks = mdb;
EXT4_I(inode)->i_allocated_meta_blocks = 0;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
/* Maximum number of blocks we map for direct IO at once. */ /* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096 #define DIO_MAX_BLOCKS 4096
/* /*
...@@ -965,10 +1054,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, ...@@ -965,10 +1054,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
/* /*
* The ext4_get_blocks_wrap() function try to look up the requested blocks,
* and returns if the blocks are already mapped.
* *
*
* ext4_ext4 get_block() wrapper function
* It will do a look up first, and returns if the blocks already mapped.
* Otherwise it takes the write lock of the i_data_sem and allocate blocks * Otherwise it takes the write lock of the i_data_sem and allocate blocks
* and store the allocated blocks in the result buffer head and mark it * and store the allocated blocks in the result buffer head and mark it
* mapped. * mapped.
...@@ -1069,7 +1157,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, ...@@ -1069,7 +1157,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
* which were deferred till now * which were deferred till now
*/ */
if ((retval > 0) && buffer_delay(bh)) if ((retval > 0) && buffer_delay(bh))
ext4_da_release_space(inode, retval, 0); ext4_da_update_reserve_space(inode, retval);
} }
up_write((&EXT4_I(inode)->i_data_sem)); up_write((&EXT4_I(inode)->i_data_sem));
...@@ -1336,12 +1424,8 @@ static int ext4_ordered_write_end(struct file *file, ...@@ -1336,12 +1424,8 @@ static int ext4_ordered_write_end(struct file *file,
{ {
handle_t *handle = ext4_journal_current_handle(); handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
unsigned from, to;
int ret = 0, ret2; int ret = 0, ret2;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
ret = ext4_jbd2_file_inode(handle, inode); ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) { if (ret == 0) {
...@@ -1437,36 +1521,6 @@ static int ext4_journalled_write_end(struct file *file, ...@@ -1437,36 +1521,6 @@ static int ext4_journalled_write_end(struct file *file,
return ret ? ret : copied; return ret ? ret : copied;
} }
/*
* Calculate the number of metadata blocks need to reserve
* to allocate @blocks for non extent file based file
*/
static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
{
int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
int ind_blks, dind_blks, tind_blks;
/* number of new indirect blocks needed */
ind_blks = (blocks + icap - 1) / icap;
dind_blks = (ind_blks + icap - 1) / icap;
tind_blks = 1;
return ind_blks + dind_blks + tind_blks;
}
/*
* Calculate the number of metadata blocks need to reserve
* to allocate given number of blocks
*/
static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
{
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
return ext4_ext_calc_metadata_amount(inode, blocks);
return ext4_indirect_calc_metadata_amount(inode, blocks);
}
static int ext4_da_reserve_space(struct inode *inode, int nrblocks) static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
{ {
...@@ -1490,7 +1544,6 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) ...@@ -1490,7 +1544,6 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return -ENOSPC; return -ENOSPC;
} }
/* reduce fs free blocks counter */ /* reduce fs free blocks counter */
percpu_counter_sub(&sbi->s_freeblocks_counter, total); percpu_counter_sub(&sbi->s_freeblocks_counter, total);
...@@ -1501,35 +1554,31 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) ...@@ -1501,35 +1554,31 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
return 0; /* success */ return 0; /* success */
} }
void ext4_da_release_space(struct inode *inode, int used, int to_free) static void ext4_da_release_space(struct inode *inode, int to_free)
{ {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int total, mdb, mdb_free, release; int total, mdb, mdb_free, release;
spin_lock(&EXT4_I(inode)->i_block_reservation_lock); spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
/* recalculate the number of metablocks still need to be reserved */ /* recalculate the number of metablocks still need to be reserved */
total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
mdb = ext4_calc_metadata_amount(inode, total); mdb = ext4_calc_metadata_amount(inode, total);
/* figure out how many metablocks to release */ /* figure out how many metablocks to release */
BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
/* Account for allocated meta_blocks */
mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
release = to_free + mdb_free; release = to_free + mdb_free;
/* update fs free blocks counter for truncate case */ /* update fs free blocks counter for truncate case */
percpu_counter_add(&sbi->s_freeblocks_counter, release); percpu_counter_add(&sbi->s_freeblocks_counter, release);
/* update per-inode reservations */ /* update per-inode reservations */
BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); EXT4_I(inode)->i_reserved_data_blocks -= to_free;
BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
EXT4_I(inode)->i_reserved_meta_blocks = mdb; EXT4_I(inode)->i_reserved_meta_blocks = mdb;
EXT4_I(inode)->i_allocated_meta_blocks = 0;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
} }
...@@ -1551,7 +1600,7 @@ static void ext4_da_page_release_reservation(struct page *page, ...@@ -1551,7 +1600,7 @@ static void ext4_da_page_release_reservation(struct page *page,
} }
curr_off = next_off; curr_off = next_off;
} while ((bh = bh->b_this_page) != head); } while ((bh = bh->b_this_page) != head);
ext4_da_release_space(page->mapping->host, 0, to_release); ext4_da_release_space(page->mapping->host, to_release);
} }
/* /*
...@@ -2280,8 +2329,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, ...@@ -2280,8 +2329,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
} }
page = __grab_cache_page(mapping, index); page = __grab_cache_page(mapping, index);
if (!page) if (!page) {
return -ENOMEM; ext4_journal_stop(handle);
ret = -ENOMEM;
goto out;
}
*pagep = page; *pagep = page;
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
...@@ -3590,6 +3642,16 @@ static int __ext4_get_inode_loc(struct inode *inode, ...@@ -3590,6 +3642,16 @@ static int __ext4_get_inode_loc(struct inode *inode,
} }
if (!buffer_uptodate(bh)) { if (!buffer_uptodate(bh)) {
lock_buffer(bh); lock_buffer(bh);
/*
* If the buffer has the write error flag, we have failed
* to write out another inode in the same block. In this
* case, we don't have to read the block because we may
* read the old inode data successfully.
*/
if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
set_buffer_uptodate(bh);
if (buffer_uptodate(bh)) { if (buffer_uptodate(bh)) {
/* someone brought it uptodate while we waited */ /* someone brought it uptodate while we waited */
unlock_buffer(bh); unlock_buffer(bh);
......
...@@ -787,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore) ...@@ -787,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (bh_uptodate_or_lock(bh[i])) if (bh_uptodate_or_lock(bh[i]))
continue; continue;
spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh[i], ext4_init_block_bitmap(sb, bh[i],
first_group + i, desc); first_group + i, desc);
set_buffer_uptodate(bh[i]); set_buffer_uptodate(bh[i]);
unlock_buffer(bh[i]); unlock_buffer(bh[i]);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
continue; continue;
} }
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
get_bh(bh[i]); get_bh(bh[i]);
bh[i]->b_end_io = end_buffer_read_sync; bh[i]->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh[i]); submit_bh(READ, bh[i]);
...@@ -2477,7 +2480,7 @@ static int ext4_mb_init_backend(struct super_block *sb) ...@@ -2477,7 +2480,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
int ext4_mb_init(struct super_block *sb, int needs_recovery) int ext4_mb_init(struct super_block *sb, int needs_recovery)
{ {
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i; unsigned i, j;
unsigned offset; unsigned offset;
unsigned max; unsigned max;
int ret; int ret;
...@@ -2537,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ...@@ -2537,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
i = sizeof(struct ext4_locality_group) * NR_CPUS; i = sizeof(struct ext4_locality_group) * nr_cpu_ids;
sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
if (sbi->s_locality_groups == NULL) { if (sbi->s_locality_groups == NULL) {
clear_opt(sbi->s_mount_opt, MBALLOC); clear_opt(sbi->s_mount_opt, MBALLOC);
...@@ -2545,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ...@@ -2545,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
kfree(sbi->s_mb_maxs); kfree(sbi->s_mb_maxs);
return -ENOMEM; return -ENOMEM;
} }
for (i = 0; i < NR_CPUS; i++) { for (i = 0; i < nr_cpu_ids; i++) {
struct ext4_locality_group *lg; struct ext4_locality_group *lg;
lg = &sbi->s_locality_groups[i]; lg = &sbi->s_locality_groups[i];
mutex_init(&lg->lg_mutex); mutex_init(&lg->lg_mutex);
INIT_LIST_HEAD(&lg->lg_prealloc_list); for (j = 0; j < PREALLOC_TB_SIZE; j++)
INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
spin_lock_init(&lg->lg_prealloc_lock); spin_lock_init(&lg->lg_prealloc_lock);
} }
...@@ -3260,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, ...@@ -3260,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa) struct ext4_prealloc_space *pa)
{ {
unsigned int len = ac->ac_o_ex.fe_len; unsigned int len = ac->ac_o_ex.fe_len;
ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
&ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start); &ac->ac_b_ex.fe_start);
...@@ -3282,6 +3287,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, ...@@ -3282,6 +3287,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
static noinline_for_stack int static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac) ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{ {
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg; struct ext4_locality_group *lg;
struct ext4_prealloc_space *pa; struct ext4_prealloc_space *pa;
...@@ -3322,22 +3328,29 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) ...@@ -3322,22 +3328,29 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
lg = ac->ac_lg; lg = ac->ac_lg;
if (lg == NULL) if (lg == NULL)
return 0; return 0;
order = fls(ac->ac_o_ex.fe_len) - 1;
rcu_read_lock(); if (order > PREALLOC_TB_SIZE - 1)
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) { /* The max size of hash table is PREALLOC_TB_SIZE */
spin_lock(&pa->pa_lock); order = PREALLOC_TB_SIZE - 1;
if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
atomic_inc(&pa->pa_count); for (i = order; i < PREALLOC_TB_SIZE; i++) {
ext4_mb_use_group_pa(ac, pa); rcu_read_lock();
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
pa_inode_list) {
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 &&
pa->pa_free >= ac->ac_o_ex.fe_len) {
atomic_inc(&pa->pa_count);
ext4_mb_use_group_pa(ac, pa);
spin_unlock(&pa->pa_lock);
ac->ac_criteria = 20;
rcu_read_unlock();
return 1;
}
spin_unlock(&pa->pa_lock); spin_unlock(&pa->pa_lock);
ac->ac_criteria = 20;
rcu_read_unlock();
return 1;
} }
spin_unlock(&pa->pa_lock); rcu_read_unlock();
} }
rcu_read_unlock();
return 0; return 0;
} }
...@@ -3560,6 +3573,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) ...@@ -3560,6 +3573,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_free = pa->pa_len; pa->pa_free = pa->pa_len;
atomic_set(&pa->pa_count, 1); atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock); spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
pa->pa_deleted = 0; pa->pa_deleted = 0;
pa->pa_linear = 1; pa->pa_linear = 1;
...@@ -3580,10 +3594,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) ...@@ -3580,10 +3594,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
list_add(&pa->pa_group_list, &grp->bb_prealloc_list); list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group); ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
spin_lock(pa->pa_obj_lock); /*
list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); * We will later add the new pa to the right bucket
spin_unlock(pa->pa_obj_lock); * after updating the pa_free in ext4_mb_release_context
*/
return 0; return 0;
} }
...@@ -3733,20 +3747,23 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, ...@@ -3733,20 +3747,23 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
bitmap_bh = ext4_read_block_bitmap(sb, group); bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) { if (bitmap_bh == NULL) {
/* error handling here */ ext4_error(sb, __func__, "Error in reading block "
ext4_mb_release_desc(&e4b); "bitmap for %lu\n", group);
BUG_ON(bitmap_bh == NULL); return 0;
} }
err = ext4_mb_load_buddy(sb, group, &e4b); err = ext4_mb_load_buddy(sb, group, &e4b);
BUG_ON(err != 0); /* error handling here */ if (err) {
ext4_error(sb, __func__, "Error in loading buddy "
"information for %lu\n", group);
put_bh(bitmap_bh);
return 0;
}
if (needed == 0) if (needed == 0)
needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
grp = ext4_get_group_info(sb, group);
INIT_LIST_HEAD(&list); INIT_LIST_HEAD(&list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
repeat: repeat:
ext4_lock_group(sb, group); ext4_lock_group(sb, group);
...@@ -3903,13 +3920,18 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode) ...@@ -3903,13 +3920,18 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
err = ext4_mb_load_buddy(sb, group, &e4b); err = ext4_mb_load_buddy(sb, group, &e4b);
BUG_ON(err != 0); /* error handling here */ if (err) {
ext4_error(sb, __func__, "Error in loading buddy "
"information for %lu\n", group);
continue;
}
bitmap_bh = ext4_read_block_bitmap(sb, group); bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) { if (bitmap_bh == NULL) {
/* error handling here */ ext4_error(sb, __func__, "Error in reading block "
"bitmap for %lu\n", group);
ext4_mb_release_desc(&e4b); ext4_mb_release_desc(&e4b);
BUG_ON(bitmap_bh == NULL); continue;
} }
ext4_lock_group(sb, group); ext4_lock_group(sb, group);
...@@ -4112,22 +4134,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ...@@ -4112,22 +4134,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
} }
static noinline_for_stack void
ext4_mb_discard_lg_preallocations(struct super_block *sb,
struct ext4_locality_group *lg,
int order, int total_entries)
{
ext4_group_t group = 0;
struct ext4_buddy e4b;
struct list_head discard_list;
struct ext4_prealloc_space *pa, *tmp;
struct ext4_allocation_context *ac;
mb_debug("discard locality group preallocation\n");
INIT_LIST_HEAD(&discard_list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/*
* This is the pa that we just used
* for block allocation. So don't
* free that
*/
spin_unlock(&pa->pa_lock);
continue;
}
if (pa->pa_deleted) {
spin_unlock(&pa->pa_lock);
continue;
}
/* only lg prealloc space */
BUG_ON(!pa->pa_linear);
/* seems this one can be freed ... */
pa->pa_deleted = 1;
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &discard_list);
total_entries--;
if (total_entries <= 5) {
/*
* we want to keep only 5 entries
* allowing it to grow to 8. This
* mak sure we don't call discard
* soon for this list.
*/
break;
}
}
spin_unlock(&lg->lg_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, __func__, "Error in loading buddy "
"information for %lu\n", group);
continue;
}
ext4_lock_group(sb, group);
list_del(&pa->pa_group_list);
ext4_mb_release_group_pa(&e4b, pa, ac);
ext4_unlock_group(sb, group);
ext4_mb_release_desc(&e4b);
list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
}
/*
* We have incremented pa_count. So it cannot be freed at this
* point. Also we hold lg_mutex. So no parallel allocation is
* possible from this lg. That means pa_free cannot be updated.
*
* A parallel ext4_mb_discard_group_preallocations is possible.
* which can cause the lg_prealloc_list to be updated.
*/
static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
{
int order, added = 0, lg_prealloc_count = 1;
struct super_block *sb = ac->ac_sb;
struct ext4_locality_group *lg = ac->ac_lg;
struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
order = fls(pa->pa_free) - 1;
if (order > PREALLOC_TB_SIZE - 1)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
/* Add the prealloc space to lg */
rcu_read_lock();
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
spin_unlock(&pa->pa_lock);
continue;
}
if (!added && pa->pa_free < tmp_pa->pa_free) {
/* Add to the tail of the previous entry */
list_add_tail_rcu(&pa->pa_inode_list,
&tmp_pa->pa_inode_list);
added = 1;
/*
* we want to count the total
* number of entries in the list
*/
}
spin_unlock(&tmp_pa->pa_lock);
lg_prealloc_count++;
}
if (!added)
list_add_tail_rcu(&pa->pa_inode_list,
&lg->lg_prealloc_list[order]);
rcu_read_unlock();
/* Now trim the list to be not more than 8 elements */
if (lg_prealloc_count > 8) {
ext4_mb_discard_lg_preallocations(sb, lg,
order, lg_prealloc_count);
return;
}
return ;
}
/* /*
* release all resource we used in allocation * release all resource we used in allocation
*/ */
static int ext4_mb_release_context(struct ext4_allocation_context *ac) static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{ {
if (ac->ac_pa) { struct ext4_prealloc_space *pa = ac->ac_pa;
if (ac->ac_pa->pa_linear) { if (pa) {
if (pa->pa_linear) {
/* see comment in ext4_mb_use_group_pa() */ /* see comment in ext4_mb_use_group_pa() */
spin_lock(&ac->ac_pa->pa_lock); spin_lock(&pa->pa_lock);
ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; pa->pa_pstart += ac->ac_b_ex.fe_len;
ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; pa->pa_lstart += ac->ac_b_ex.fe_len;
ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_free -= ac->ac_b_ex.fe_len;
ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&ac->ac_pa->pa_lock); spin_unlock(&pa->pa_lock);
/*
* We want to add the pa to the right bucket.
* Remove it from the list and while adding
* make sure the list to which we are adding
* doesn't grow big.
*/
if (likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock);
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);
ext4_mb_add_n_trim(ac);
}
} }
ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); ext4_mb_put_pa(ac, ac->ac_sb, pa);
} }
if (ac->ac_bitmap_page) if (ac->ac_bitmap_page)
page_cache_release(ac->ac_bitmap_page); page_cache_release(ac->ac_bitmap_page);
...@@ -4420,11 +4588,15 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, ...@@ -4420,11 +4588,15 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
count -= overflow; count -= overflow;
} }
bitmap_bh = ext4_read_block_bitmap(sb, block_group); bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh) if (!bitmap_bh) {
err = -EIO;
goto error_return; goto error_return;
}
gdp = ext4_get_group_desc(sb, block_group, &gd_bh); gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
if (!gdp) if (!gdp) {
err = -EIO;
goto error_return; goto error_return;
}
if (in_range(ext4_block_bitmap(sb, gdp), block, count) || if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
......
...@@ -164,11 +164,17 @@ struct ext4_free_extent { ...@@ -164,11 +164,17 @@ struct ext4_free_extent {
* Locality group: * Locality group:
* we try to group all related changes together * we try to group all related changes together
* so that writeback can flush/allocate them together as well * so that writeback can flush/allocate them together as well
* Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
* (512). We store prealloc space into the hash based on the pa_free blocks
* order value.ie, fls(pa_free)-1;
*/ */
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group { struct ext4_locality_group {
/* for allocator */ /* for allocator */
struct mutex lg_mutex; /* to serialize allocates */ /* to serialize allocates */
struct list_head lg_prealloc_list;/* list of preallocations */ struct mutex lg_mutex;
/* list of preallocations */
struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
spinlock_t lg_prealloc_lock; spinlock_t lg_prealloc_lock;
}; };
......
...@@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb, ...@@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb,
"Inode bitmap not in group (block %llu)", "Inode bitmap not in group (block %llu)",
(unsigned long long)input->inode_bitmap); (unsigned long long)input->inode_bitmap);
else if (outside(input->inode_table, start, end) || else if (outside(input->inode_table, start, end) ||
outside(itend - 1, start, end)) outside(itend - 1, start, end))
ext4_warning(sb, __func__, ext4_warning(sb, __func__,
"Inode table not in group (blocks %llu-%llu)", "Inode table not in group (blocks %llu-%llu)",
(unsigned long long)input->inode_table, itend - 1); (unsigned long long)input->inode_table, itend - 1);
...@@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb, ...@@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb,
(unsigned long long)input->inode_bitmap, (unsigned long long)input->inode_bitmap,
start, metaend - 1); start, metaend - 1);
else if (inside(input->inode_table, start, metaend) || else if (inside(input->inode_table, start, metaend) ||
inside(itend - 1, start, metaend)) inside(itend - 1, start, metaend))
ext4_warning(sb, __func__, ext4_warning(sb, __func__,
"Inode table (%llu-%llu) overlaps" "Inode table (%llu-%llu) overlaps"
"GDT table (%llu-%llu)", "GDT table (%llu-%llu)",
...@@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, ...@@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
if (err) { if (err) {
if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
return err; return err;
if ((err = ext4_journal_get_write_access(handle, bh))) if ((err = ext4_journal_get_write_access(handle, bh)))
return err; return err;
} }
return 0; return 0;
} }
...@@ -416,11 +416,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ...@@ -416,11 +416,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
"EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
gdb_num); gdb_num);
/* /*
* If we are not using the primary superblock/GDT copy don't resize, * If we are not using the primary superblock/GDT copy don't resize,
* because the user tools have no way of handling this. Probably a * because the user tools have no way of handling this. Probably a
* bad time to do it anyways. * bad time to do it anyways.
*/ */
if (EXT4_SB(sb)->s_sbh->b_blocknr != if (EXT4_SB(sb)->s_sbh->b_blocknr !=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
ext4_warning(sb, __func__, ext4_warning(sb, __func__,
...@@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ...@@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
return 0; return 0;
exit_inode: exit_inode:
//ext4_journal_release_buffer(handle, iloc.bh); /* ext4_journal_release_buffer(handle, iloc.bh); */
brelse(iloc.bh); brelse(iloc.bh);
exit_dindj: exit_dindj:
//ext4_journal_release_buffer(handle, dind); /* ext4_journal_release_buffer(handle, dind); */
exit_primary: exit_primary:
//ext4_journal_release_buffer(handle, *primary); /* ext4_journal_release_buffer(handle, *primary); */
exit_sbh: exit_sbh:
//ext4_journal_release_buffer(handle, *primary); /* ext4_journal_release_buffer(handle, *primary); */
exit_dind: exit_dind:
brelse(dind); brelse(dind);
exit_bh: exit_bh:
...@@ -818,12 +818,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ...@@ -818,12 +818,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
goto exit_journal; goto exit_journal;
/* /*
* We will only either add reserved group blocks to a backup group * We will only either add reserved group blocks to a backup group
* or remove reserved blocks for the first group in a new group block. * or remove reserved blocks for the first group in a new group block.
* Doing both would be mean more complex code, and sane people don't * Doing both would be mean more complex code, and sane people don't
* use non-sparse filesystems anymore. This is already checked above. * use non-sparse filesystems anymore. This is already checked above.
*/ */
if (gdb_off) { if (gdb_off) {
primary = sbi->s_group_desc[gdb_num]; primary = sbi->s_group_desc[gdb_num];
if ((err = ext4_journal_get_write_access(handle, primary))) if ((err = ext4_journal_get_write_access(handle, primary)))
...@@ -835,24 +835,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ...@@ -835,24 +835,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
} else if ((err = add_new_gdb(handle, inode, input, &primary))) } else if ((err = add_new_gdb(handle, inode, input, &primary)))
goto exit_journal; goto exit_journal;
/* /*
* OK, now we've set up the new group. Time to make it active. * OK, now we've set up the new group. Time to make it active.
* *
* Current kernels don't lock all allocations via lock_super(), * Current kernels don't lock all allocations via lock_super(),
* so we have to be safe wrt. concurrent accesses the group * so we have to be safe wrt. concurrent accesses the group
* data. So we need to be careful to set all of the relevant * data. So we need to be careful to set all of the relevant
* group descriptor data etc. *before* we enable the group. * group descriptor data etc. *before* we enable the group.
* *
* The key field here is sbi->s_groups_count: as long as * The key field here is sbi->s_groups_count: as long as
* that retains its old value, nobody is going to access the new * that retains its old value, nobody is going to access the new
* group. * group.
* *
* So first we update all the descriptor metadata for the new * So first we update all the descriptor metadata for the new
* group; then we update the total disk blocks count; then we * group; then we update the total disk blocks count; then we
* update the groups count to enable the group; then finally we * update the groups count to enable the group; then finally we
* update the free space counts so that the system can start * update the free space counts so that the system can start
* using the new disk blocks. * using the new disk blocks.
*/ */
/* Update group descriptor block for new group */ /* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)((char *)primary->b_data + gdp = (struct ext4_group_desc *)((char *)primary->b_data +
...@@ -946,7 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ...@@ -946,7 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
return err; return err;
} /* ext4_group_add */ } /* ext4_group_add */
/* Extend the filesystem to the new number of blocks specified. This entry /*
* Extend the filesystem to the new number of blocks specified. This entry
* point is only used to extend the current filesystem to the end of the last * point is only used to extend the current filesystem to the end of the last
* existing group. It can be accessed via ioctl, or by "remount,resize=<size>" * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
* for emergencies (because it has no dependencies on reserved blocks). * for emergencies (because it has no dependencies on reserved blocks).
...@@ -1024,7 +1025,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ...@@ -1024,7 +1025,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add, add); o_blocks_count + add, add);
/* See if the device is actually as big as what was requested */ /* See if the device is actually as big as what was requested */
bh = sb_bread(sb, o_blocks_count + add -1); bh = sb_bread(sb, o_blocks_count + add - 1);
if (!bh) { if (!bh) {
ext4_warning(sb, __func__, ext4_warning(sb, __func__,
"can't read last block, resize aborted"); "can't read last block, resize aborted");
......
此差异已折叠。
...@@ -1512,7 +1512,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, ...@@ -1512,7 +1512,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
char *name = entry->e_name; char *name = entry->e_name;
int n; int n;
for (n=0; n < entry->e_name_len; n++) { for (n = 0; n < entry->e_name_len; n++) {
hash = (hash << NAME_HASH_SHIFT) ^ hash = (hash << NAME_HASH_SHIFT) ^
(hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
*name++; *name++;
......
...@@ -262,8 +262,18 @@ static int journal_finish_inode_data_buffers(journal_t *journal, ...@@ -262,8 +262,18 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
jinode->i_flags |= JI_COMMIT_RUNNING; jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_list_lock);
err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
if (!ret) if (err) {
ret = err; /*
* Because AS_EIO is cleared by
* wait_on_page_writeback_range(), set it again so
* that user process can get -EIO from fsync().
*/
set_bit(AS_EIO,
&jinode->i_vfs_inode->i_mapping->flags);
if (!ret)
ret = err;
}
spin_lock(&journal->j_list_lock); spin_lock(&journal->j_list_lock);
jinode->i_flags &= ~JI_COMMIT_RUNNING; jinode->i_flags &= ~JI_COMMIT_RUNNING;
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
...@@ -670,8 +680,14 @@ void jbd2_journal_commit_transaction(journal_t *journal) ...@@ -670,8 +680,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* commit block, which happens below in such setting. * commit block, which happens below in such setting.
*/ */
err = journal_finish_inode_data_buffers(journal, commit_transaction); err = journal_finish_inode_data_buffers(journal, commit_transaction);
if (err) if (err) {
jbd2_journal_abort(journal, err); char b[BDEVNAME_SIZE];
printk(KERN_WARNING
"JBD2: Detected IO errors while flushing file data "
"on %s\n", bdevname(journal->j_fs_dev, b));
err = 0;
}
/* Lo and behold: we have just managed to send a transaction to /* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to the log. Before we can commit it, wait for the IO so far to
......
...@@ -68,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features); ...@@ -68,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features);
EXPORT_SYMBOL(jbd2_journal_create); EXPORT_SYMBOL(jbd2_journal_create);
EXPORT_SYMBOL(jbd2_journal_load); EXPORT_SYMBOL(jbd2_journal_load);
EXPORT_SYMBOL(jbd2_journal_destroy); EXPORT_SYMBOL(jbd2_journal_destroy);
EXPORT_SYMBOL(jbd2_journal_update_superblock);
EXPORT_SYMBOL(jbd2_journal_abort); EXPORT_SYMBOL(jbd2_journal_abort);
EXPORT_SYMBOL(jbd2_journal_errno); EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err); EXPORT_SYMBOL(jbd2_journal_ack_err);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册