/*
 *  linux/fs/ext4/balloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>

#include "ext4.h"
#include "ext4_jbd2.h"
#include "group.h"

24 25 26 27
/*
 * balloc.c contains the block allocation and deallocation routines
 */

28 29 30 31
/*
 * Calculate the block group number and offset, given a block number
 */
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
32
		ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
33
{
D
Dave Kleikamp 已提交
34
	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
35 36
	ext4_grpblk_t offset;

D
Dave Kleikamp 已提交
37
	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
A
Andrew Morton 已提交
38
	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
39 40 41
	if (offsetp)
		*offsetp = offset;
	if (blockgrpp)
D
Dave Kleikamp 已提交
42
		*blockgrpp = blocknr;
43 44 45

}

46 47 48 49
/* Return 1 if @block belongs to @block_group, 0 otherwise. */
static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
			ext4_group_t block_group)
{
	ext4_group_t grp;

	ext4_get_group_no_and_offset(sb, block, &grp, NULL);
	return grp == block_group;
}

/*
 * Count the metadata blocks (block bitmap, inode bitmap, inode table)
 * that actually live inside @block_group.  With FLEX_BG those blocks
 * may be placed in a different group and must not be counted here.
 */
static int ext4_group_used_meta_blocks(struct super_block *sb,
				ext4_group_t block_group)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	/* block bitmap, inode bitmap, and inode table blocks */
	int used_blocks = sbi->s_itb_per_group + 2;

	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
		struct buffer_head *bh;
		struct ext4_group_desc *gdp;
		ext4_fsblk_t blk;

		gdp = ext4_get_group_desc(sb, block_group, &bh);

		if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
					 block_group))
			used_blocks--;

		if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp),
					 block_group))
			used_blocks--;

		for (blk = ext4_inode_table(sb, gdp);
		     blk < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group;
		     blk++)
			if (!ext4_block_in_group(sb, blk, block_group))
				used_blocks--;
	}
	return used_blocks;
}
86

A
Andreas Dilger 已提交
87 88 89
/* Initializes an uninitialized block bitmap if given, and returns the
 * number of blocks free in the group. */
unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
		 ext4_group_t block_group, struct ext4_group_desc *gdp)
{
	int bit, bit_max;
	unsigned free_blocks, group_blocks;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (bh) {
		J_ASSERT_BH(bh, buffer_locked(bh));

		/* If checksum is bad mark all blocks used to prevent allocation
		 * essentially implementing a per-group read-only flag. */
		if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
			ext4_error(sb, __func__,
				  "Checksum bad for group %lu\n", block_group);
			gdp->bg_free_blocks_count = 0;
			gdp->bg_free_inodes_count = 0;
			gdp->bg_itable_unused = 0;
			memset(bh->b_data, 0xff, sb->s_blocksize);
			return 0;
		}
		/* Start from an all-free bitmap; metadata bits set below. */
		memset(bh->b_data, 0, sb->s_blocksize);
	}

	/* Check for superblock and gdt backups in this group */
	bit_max = ext4_bg_has_super(sb, block_group);

	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
	    block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
			  sbi->s_desc_per_block) {
		/* Old-style layout: a group holding a superblock backup also
		 * holds the GDT backup plus the reserved-GDT blocks. */
		if (bit_max) {
			bit_max += ext4_bg_num_gdb(sb, block_group);
			bit_max +=
				le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
		}
	} else { /* For META_BG_BLOCK_GROUPS */
		bit_max += ext4_bg_num_gdb(sb, block_group);
	}

	if (block_group == sbi->s_groups_count - 1) {
		/*
		 * Even though mke2fs always initialize first and last group
		 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
		 * to make sure we calculate the right free blocks
		 */
		group_blocks = ext4_blocks_count(sbi->s_es) -
			le32_to_cpu(sbi->s_es->s_first_data_block) -
			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
	} else {
		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
	}

	free_blocks = group_blocks - bit_max;

	if (bh) {
		ext4_fsblk_t start, tmp;
		int flex_bg = 0;

		/* Mark the superblock/GDT backup blocks as in use. */
		for (bit = 0; bit < bit_max; bit++)
			ext4_set_bit(bit, bh->b_data);

		start = ext4_group_first_block_no(sb, block_group);

		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
					      EXT4_FEATURE_INCOMPAT_FLEX_BG))
			flex_bg = 1;

		/* Set bits for block and inode bitmaps, and inode table;
		 * with FLEX_BG, only if those blocks live in this group. */
		tmp = ext4_block_bitmap(sb, gdp);
		if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
			ext4_set_bit(tmp - start, bh->b_data);

		tmp = ext4_inode_bitmap(sb, gdp);
		if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
			ext4_set_bit(tmp - start, bh->b_data);

		tmp = ext4_inode_table(sb, gdp);
		for (; tmp < ext4_inode_table(sb, gdp) +
				sbi->s_itb_per_group; tmp++) {
			if (!flex_bg ||
				ext4_block_in_group(sb, tmp, block_group))
				ext4_set_bit(tmp - start, bh->b_data);
		}
		/*
		 * Also if the number of blocks within the group is
		 * less than the blocksize * 8 ( which is the size
		 * of bitmap ), set rest of the block bitmap to 1
		 */
		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
	}
	return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
}


183 184 185 186 187 188 189 190
/*
 * The free blocks are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.  The descriptors are loaded in memory
 * when a file system is mounted (see ext4_fill_super).
 */


/* Is block b within the inclusive run [first, first + len - 1]? */
#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)

/**
198
 * ext4_get_group_desc() -- load group descriptor from disk
199 200 201 202 203
 * @sb:			super block
 * @block_group:	given block group
 * @bh:			pointer to the buffer head to store the block
 *			group descriptor
 */
204
struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
205
					     ext4_group_t block_group,
206
					     struct buffer_head **bh)
207 208 209
{
	unsigned long group_desc;
	unsigned long offset;
210
	struct ext4_group_desc *desc;
211
	struct ext4_sb_info *sbi = EXT4_SB(sb);
212 213

	if (block_group >= sbi->s_groups_count) {
214 215 216 217
		ext4_error(sb, "ext4_get_group_desc",
			   "block_group >= groups_count - "
			   "block_group = %lu, groups_count = %lu",
			   block_group, sbi->s_groups_count);
218 219 220 221 222

		return NULL;
	}
	smp_rmb();

223 224
	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
225
	if (!sbi->s_group_desc[group_desc]) {
226 227 228 229
		ext4_error(sb, "ext4_get_group_desc",
			   "Group descriptor not loaded - "
			   "block_group = %lu, group_desc = %lu, desc = %lu",
			   block_group, group_desc, offset);
230 231 232
		return NULL;
	}

233 234 235
	desc = (struct ext4_group_desc *)(
		(__u8 *)sbi->s_group_desc[group_desc]->b_data +
		offset * EXT4_DESC_SIZE(sb));
236 237
	if (bh)
		*bh = sbi->s_group_desc[group_desc];
238
	return desc;
239 240
}

241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
/*
 * Sanity-check a just-read block bitmap: the bits for the group's own
 * block bitmap, inode bitmap and inode table must all be set.
 * Returns 1 if the bitmap looks valid, 0 (after ext4_error) otherwise.
 */
static int ext4_valid_block_bitmap(struct super_block *sb,
					struct ext4_group_desc *desc,
					unsigned int block_group,
					struct buffer_head *bh)
{
	ext4_grpblk_t offset;
	ext4_grpblk_t next_zero_bit;
	ext4_fsblk_t bitmap_blk;
	ext4_fsblk_t group_first_block;

	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
		/* with FLEX_BG, the inode/block bitmaps and itable
		 * blocks may not be in the group at all
		 * so the bitmap validation will be skipped for those groups
		 * or it has to also read the block group where the bitmaps
		 * are located to verify they are set.
		 */
		return 1;
	}
	group_first_block = ext4_group_first_block_no(sb, block_group);

	/* the block bitmap's own bit must be set */
	bitmap_blk = ext4_block_bitmap(sb, desc);
	offset = bitmap_blk - group_first_block;
	if (ext4_test_bit(offset, bh->b_data)) {
		/* the inode bitmap's bit must be set */
		bitmap_blk = ext4_inode_bitmap(sb, desc);
		offset = bitmap_blk - group_first_block;
		if (ext4_test_bit(offset, bh->b_data)) {
			/* every inode table bit must be set */
			bitmap_blk = ext4_inode_table(sb, desc);
			offset = bitmap_blk - group_first_block;
			next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
					offset + EXT4_SB(sb)->s_itb_per_group,
					offset);
			if (next_zero_bit >=
			    offset + EXT4_SB(sb)->s_itb_per_group)
				/* good bitmap for inode tables */
				return 1;
		}
	}

	/* bitmap_blk still names the block whose bit check failed */
	ext4_error(sb, __func__,
			"Invalid block bitmap - "
			"block_group = %d, block = %llu",
			block_group, bitmap_blk);
	return 0;
}
293
/**
 * ext4_read_block_bitmap()
 * @sb:			super block
 * @block_group:	given block group
 *
 * Read the bitmap for a given block_group, and validate the
 * bits for block/inode/inode tables are set in the bitmaps
 *
 * Return buffer_head on success or NULL in case of failure.
 */
struct buffer_head *
ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
	struct ext4_group_desc *desc;
	struct buffer_head *bh = NULL;
	ext4_fsblk_t bitmap_blk;

	desc = ext4_get_group_desc(sb, block_group, NULL);
	if (!desc)
		return NULL;
	bitmap_blk = ext4_block_bitmap(sb, desc);
	bh = sb_getblk(sb, bitmap_blk);
	if (unlikely(!bh)) {
		ext4_error(sb, __func__,
			    "Cannot read block bitmap - "
			    "block_group = %lu, block_bitmap = %llu",
			    block_group, bitmap_blk);
		return NULL;
	}
	/* Fast path: buffer already up to date and the on-disk bitmap
	 * is initialized (no BLOCK_UNINIT flag) -- nothing more to do. */
	if (buffer_uptodate(bh) &&
	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
		return bh;

	lock_buffer(bh);
	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
		/* Uninitialized group: construct the bitmap in memory
		 * instead of reading it from disk. */
		ext4_init_block_bitmap(sb, bh, block_group, desc);
		set_buffer_uptodate(bh);
		unlock_buffer(bh);
		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
		return bh;
	}
	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
	/* bh is still locked here; bh_submit_read() consumes the lock. */
	if (bh_submit_read(bh) < 0) {
		put_bh(bh);
		ext4_error(sb, __func__,
			    "Cannot read block bitmap - "
			    "block_group = %lu, block_bitmap = %llu",
			    block_group, bitmap_blk);
		return NULL;
	}
	ext4_valid_block_bitmap(sb, desc, block_group, bh);
	/*
	 * file system mounted not to panic on error,
	 * continue with corrupt bitmap
	 */
	return bh;
}

/**
 * ext4_free_blocks_sb() -- Free given blocks and update quota
 * @handle:			handle to this transaction
 * @sb:				super block
 * @block:			start physical block to free
 * @count:			number of blocks to free
 * @pdquot_freed_blocks:	pointer to quota
 *
 * XXX This function is only used by the on-line resizing code, which
 * should probably be fixed up to call the mballoc variant.  Also,
 * this needs to be cleaned up later; in fact, I'm not convinced this
 * is 100% correct in the face of the mballoc code.  The online resizing
 * code needs to be fixed up to more tightly (and correctly) interlock
 * with the mballoc code.
 */
void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
			 ext4_fsblk_t block, unsigned long count,
			 unsigned long *pdquot_freed_blocks)
{
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *gd_bh;
	ext4_group_t block_group;
	ext4_grpblk_t bit;
	unsigned long i;
	unsigned long overflow;
	struct ext4_group_desc *desc;
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi;
	int err = 0, ret;
	ext4_grpblk_t group_freed;

	*pdquot_freed_blocks = 0;
	sbi = EXT4_SB(sb);
	es = sbi->s_es;
	/* Reject ranges outside the data zone (also catches wraparound
	 * via block + count < block). */
	if (block < le32_to_cpu(es->s_first_data_block) ||
	    block + count < block ||
	    block + count > ext4_blocks_count(es)) {
		ext4_error(sb, "ext4_free_blocks",
			   "Freeing blocks not in datazone - "
			   "block = %llu, count = %lu", block, count);
		goto error_return;
	}

	ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);

	/* One pass of this loop frees the portion of the range that falls
	 * inside a single group; 'overflow' carries the rest to the next. */
do_more:
	overflow = 0;
	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
	/*
	 * Check to see if we are freeing blocks across a group
	 * boundary.
	 */
	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
		overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
		count -= overflow;
	}
	brelse(bitmap_bh);
	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
	if (!bitmap_bh)
		goto error_return;
	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
	if (!desc)
		goto error_return;

	/* Never free filesystem metadata (bitmaps, inode table). */
	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
	    in_range(block + count - 1, ext4_inode_table(sb, desc),
		     sbi->s_itb_per_group)) {
		ext4_error(sb, "ext4_free_blocks",
			   "Freeing blocks in system zones - "
			   "Block = %llu, count = %lu",
			   block, count);
		goto error_return;
	}

	/*
	 * We are about to start releasing blocks in the bitmap,
	 * so we need undo access.
	 */
	/* @@@ check errors */
	BUFFER_TRACE(bitmap_bh, "getting undo access");
	err = ext4_journal_get_undo_access(handle, bitmap_bh);
	if (err)
		goto error_return;

	/*
	 * We are about to modify some metadata.  Call the journal APIs
	 * to unshare ->b_data if a currently-committing transaction is
	 * using it
	 */
	BUFFER_TRACE(gd_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, gd_bh);
	if (err)
		goto error_return;

	jbd_lock_bh_state(bitmap_bh);

	for (i = 0, group_freed = 0; i < count; i++) {
		/*
		 * An HJ special.  This is expensive...
		 */
#ifdef CONFIG_JBD2_DEBUG
		jbd_unlock_bh_state(bitmap_bh);
		{
			struct buffer_head *debug_bh;
			debug_bh = sb_find_get_block(sb, block + i);
			if (debug_bh) {
				BUFFER_TRACE(debug_bh, "Deleted!");
				if (!bh2jh(bitmap_bh)->b_committed_data)
					BUFFER_TRACE(debug_bh,
						"No commited data in bitmap");
				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
				__brelse(debug_bh);
			}
		}
		jbd_lock_bh_state(bitmap_bh);
#endif
		/* Drop the state lock periodically so we don't hold it
		 * across a potentially long range. */
		if (need_resched()) {
			jbd_unlock_bh_state(bitmap_bh);
			cond_resched();
			jbd_lock_bh_state(bitmap_bh);
		}
		/* @@@ This prevents newly-allocated data from being
		 * freed and then reallocated within the same
		 * transaction.
		 *
		 * Ideally we would want to allow that to happen, but to
		 * do so requires making jbd2_journal_forget() capable of
		 * revoking the queued write of a data block, which
		 * implies blocking on the journal lock.  *forget()
		 * cannot block due to truncate races.
		 *
		 * Eventually we can fix this by making jbd2_journal_forget()
		 * return a status indicating whether or not it was able
		 * to revoke the buffer.  On successful revoke, it is
		 * safe not to set the allocation bit in the committed
		 * bitmap, because we know that there is no outstanding
		 * activity on the buffer any more and so it is safe to
		 * reallocate it.
		 */
		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
		J_ASSERT_BH(bitmap_bh,
				bh2jh(bitmap_bh)->b_committed_data != NULL);
		ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
				bh2jh(bitmap_bh)->b_committed_data);

		/*
		 * We clear the bit in the bitmap after setting the committed
		 * data bit, because this is the reverse order to that which
		 * the allocator uses.
		 */
		BUFFER_TRACE(bitmap_bh, "clear bit");
		if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
						bit + i, bitmap_bh->b_data)) {
			jbd_unlock_bh_state(bitmap_bh);
			ext4_error(sb, __func__,
				   "bit already cleared for block %llu",
				   (ext4_fsblk_t)(block + i));
			jbd_lock_bh_state(bitmap_bh);
			BUFFER_TRACE(bitmap_bh, "bit already cleared");
		} else {
			group_freed++;
		}
	}
	jbd_unlock_bh_state(bitmap_bh);

	/* Update the group descriptor's free count and checksum under
	 * the per-group lock. */
	spin_lock(sb_bgl_lock(sbi, block_group));
	le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
	spin_unlock(sb_bgl_lock(sbi, block_group));
	percpu_counter_add(&sbi->s_freeblocks_counter, count);

	if (sbi->s_log_groups_per_flex) {
		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
		spin_lock(sb_bgl_lock(sbi, flex_group));
		sbi->s_flex_groups[flex_group].free_blocks += count;
		spin_unlock(sb_bgl_lock(sbi, flex_group));
	}

	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	err = ext4_journal_dirty_metadata(handle, bitmap_bh);

	/* And the group descriptor block */
	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
	ret = ext4_journal_dirty_metadata(handle, gd_bh);
	if (!err) err = ret;
	*pdquot_freed_blocks += group_freed;

	if (overflow && !err) {
		block += count;
		count = overflow;
		goto do_more;
	}
	sb->s_dirt = 1;
error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, err);
	return;
}

/**
555
 * ext4_free_blocks() -- Free given blocks and update quota
556 557 558 559
 * @handle:		handle for this transaction
 * @inode:		inode
 * @block:		start physical block to free
 * @count:		number of blocks to count
560
 * @metadata: 		Are these metadata blocks
561
 */
562
void ext4_free_blocks(handle_t *handle, struct inode *inode,
563 564
			ext4_fsblk_t block, unsigned long count,
			int metadata)
565
{
566
	struct super_block *sb;
567 568
	unsigned long dquot_freed_blocks;

569 570
	/* this isn't the right place to decide whether block is metadata
	 * inode.c/extents.c knows better, but for safety ... */
571 572 573 574 575 576 577 578 579 580
	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
		metadata = 1;

	/* We need to make sure we don't reuse
	 * block released untill the transaction commit.
	 * writeback mode have weak data consistency so
	 * don't force data as metadata when freeing block
	 * for writeback mode.
	 */
	if (metadata == 0 && !ext4_should_writeback_data(inode))
581 582
		metadata = 1;

583
	sb = inode->i_sb;
584

585 586
	ext4_mb_free_blocks(handle, inode, block, count,
			    metadata, &dquot_freed_blocks);
587 588 589 590 591
	if (dquot_freed_blocks)
		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
	return;
}

592
int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
A
Aneesh Kumar K.V 已提交
593
						s64 nblocks)
594
{
595
	s64 free_blocks, dirty_blocks;
A
Aneesh Kumar K.V 已提交
596
	s64 root_blocks = 0;
597
	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
598
	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
599

600 601
	free_blocks  = percpu_counter_read_positive(fbc);
	dirty_blocks = percpu_counter_read_positive(dbc);
602 603 604 605 606 607

	if (!capable(CAP_SYS_RESOURCE) &&
		sbi->s_resuid != current->fsuid &&
		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
		root_blocks = ext4_r_blocks_count(sbi->s_es);

608 609 610 611 612 613 614 615 616 617 618 619 620
	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
						EXT4_FREEBLOCKS_WATERMARK) {
		free_blocks  = percpu_counter_sum(fbc);
		dirty_blocks = percpu_counter_sum(dbc);
		if (dirty_blocks < 0) {
			printk(KERN_CRIT "Dirty block accounting "
					"went wrong %lld\n",
					dirty_blocks);
		}
	}
	/* Check whether we have space after
	 * accounting for current dirty blocks
	 */
A
Aneesh Kumar K.V 已提交
621
	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
622 623 624
		/* we don't have free space */
		return -ENOSPC;

625 626
	/* Add the blocks to nblocks */
	percpu_counter_add(dbc, nblocks);
627 628 629
	return 0;
}

630
/**
631
 * ext4_has_free_blocks()
632 633
 * @sbi:	in-core super block structure.
 * @nblocks:	number of neeed blocks
634
 *
635 636 637
 * Check if filesystem has free blocks available for allocation.
 * Return the number of blocks avaible for allocation for this request
 * On success, return nblocks
638
 */
639
ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
A
Aneesh Kumar K.V 已提交
640
						s64 nblocks)
641
{
A
Aneesh Kumar K.V 已提交
642 643
	s64 free_blocks, dirty_blocks;
	s64 root_blocks = 0;
644 645
	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
646

647 648
	free_blocks  = percpu_counter_read_positive(fbc);
	dirty_blocks = percpu_counter_read_positive(dbc);
649 650

	if (!capable(CAP_SYS_RESOURCE) &&
651
		sbi->s_resuid != current->fsuid &&
652 653
		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
		root_blocks = ext4_r_blocks_count(sbi->s_es);
654

655 656
	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
						EXT4_FREEBLOCKS_WATERMARK) {
A
Aneesh Kumar K.V 已提交
657 658
		free_blocks  = percpu_counter_sum(fbc);
		dirty_blocks = percpu_counter_sum(dbc);
659 660
	}
	if (free_blocks <= (root_blocks + dirty_blocks))
661 662
		/* we don't have free space */
		return 0;
A
Aneesh Kumar K.V 已提交
663

664
	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
A
Aneesh Kumar K.V 已提交
665
		return free_blocks - (root_blocks + dirty_blocks);
666
	return nblocks;
667
}
668

669 670

/**
671
 * ext4_should_retry_alloc()
672 673 674
 * @sb:			super block
 * @retries		number of attemps has been made
 *
675
 * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
676 677 678 679 680 681
 * it is profitable to retry the operation, this function will wait
 * for the current or commiting transaction to complete, and then
 * return TRUE.
 *
 * if the total number of retries exceed three times, return FALSE.
 */
682
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
683
{
684
	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
685 686 687 688
		return 0;

	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);

689
	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
690 691
}

A
Aneesh Kumar K.V 已提交
692
#define EXT4_META_BLOCK 0x1
693

A
Aneesh Kumar K.V 已提交
694
static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
695
				ext4_lblk_t iblock, ext4_fsblk_t goal,
A
Aneesh Kumar K.V 已提交
696
				unsigned long *count, int *errp, int flags)
697
{
698 699
	struct ext4_allocation_request ar;
	ext4_fsblk_t ret;
700

701
	memset(&ar, 0, sizeof(ar));
702 703
	/* Fill with neighbour allocated blocks */

704 705 706
	ar.inode = inode;
	ar.goal = goal;
	ar.len = *count;
707
	ar.logical = iblock;
A
Aneesh Kumar K.V 已提交
708 709 710

	if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
		/* enable in-core preallocation for data block allocation */
711 712 713 714
		ar.flags = EXT4_MB_HINT_DATA;
	else
		/* disable in-core preallocation for non-regular files */
		ar.flags = 0;
A
Aneesh Kumar K.V 已提交
715

716 717 718
	ret = ext4_mb_new_blocks(handle, &ar, errp);
	*count = ar.len;
	return ret;
719 720
}

A
Aneesh Kumar K.V 已提交
721
/*
722
 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
A
Aneesh Kumar K.V 已提交
723 724 725 726
 *
 * @handle:             handle to this transaction
 * @inode:              file inode
 * @goal:               given target block(filesystem wide)
727
 * @count:		total number of blocks need
A
Aneesh Kumar K.V 已提交
728 729
 * @errp:               error code
 *
730 731
 * Return 1st allocated block numberon success, *count stores total account
 * error stores in errp pointer
A
Aneesh Kumar K.V 已提交
732
 */
733 734
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
		ext4_fsblk_t goal, unsigned long *count, int *errp)
A
Aneesh Kumar K.V 已提交
735
{
736 737 738 739 740 741
	ext4_fsblk_t ret;
	ret = do_blk_alloc(handle, inode, 0, goal,
				count, errp, EXT4_META_BLOCK);
	/*
	 * Account for the allocated meta blocks
	 */
742
	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
743 744 745 746 747
		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
		EXT4_I(inode)->i_allocated_meta_blocks += *count;
		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
	}
	return ret;
A
Aneesh Kumar K.V 已提交
748 749 750
}

/*
751
 * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
A
Aneesh Kumar K.V 已提交
752 753 754 755 756 757
 *
 * @handle:             handle to this transaction
 * @inode:              file inode
 * @goal:               given target block(filesystem wide)
 * @errp:               error code
 *
758
 * Return allocated block number on success
A
Aneesh Kumar K.V 已提交
759
 */
760 761
ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
		ext4_fsblk_t goal, int *errp)
A
Aneesh Kumar K.V 已提交
762
{
763 764
	unsigned long count = 1;
	return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
A
Aneesh Kumar K.V 已提交
765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785
}

/*
 * ext4_new_blocks() -- allocate data blocks
 *
 * @handle:             handle to this transaction
 * @inode:              file inode
 * @goal:               given target block(filesystem wide)
 * @count:		total number of blocks needed
 * @errp:               error code
 *
 * Return the first allocated block number on success; *count stores the
 * total number of blocks allocated, and any error is stored in *errp.
 */
ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
				ext4_lblk_t iblock, ext4_fsblk_t goal,
				unsigned long *count, int *errp)
{
	/* Data allocation: no EXT4_META_BLOCK flag. */
	return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
}
786

787
/**
 * ext4_count_free_blocks() -- count filesystem free blocks
 * @sb:		superblock
 *
 * Adds up the number of free blocks from each block group.
 * With EXT4FS_DEBUG, also recounts from the on-disk bitmaps and logs
 * any discrepancy against the descriptors.
 */
ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
{
	ext4_fsblk_t desc_count;
	struct ext4_group_desc *gdp;
	ext4_group_t i;
	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
#ifdef EXT4FS_DEBUG
	struct ext4_super_block *es;
	ext4_fsblk_t bitmap_count;
	unsigned long x;
	struct buffer_head *bitmap_bh = NULL;

	es = EXT4_SB(sb)->s_es;
	desc_count = 0;
	bitmap_count = 0;
	gdp = NULL;

	smp_rmb();
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
		brelse(bitmap_bh);
		bitmap_bh = ext4_read_block_bitmap(sb, i);
		if (bitmap_bh == NULL)
			continue;

		/* Count the zero bits in this group's bitmap. */
		x = ext4_count_free(bitmap_bh, sb->s_blocksize);
		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
		bitmap_count += x;
	}
	brelse(bitmap_bh);
	printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
		", computed = %llu, %llu\n", ext4_free_blocks_count(es),
	       desc_count, bitmap_count);
	return bitmap_count;
#else
	/* Non-debug path: trust the group descriptors. */
	desc_count = 0;
	smp_rmb();
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
	}

	return desc_count;
#endif
}

845
/*
 * Return true iff @a is an integer power of @b.
 *
 * The previous implementation multiplied @b up toward @a
 * ("num *= b"), which can overflow the signed int accumulator for
 * group numbers near the top of the range -- undefined behavior.
 * Dividing @a down avoids the overflow entirely.
 *
 * Note: a == 1 is treated as b^0 and returns true; the only caller
 * (ext4_group_sparse) filters out groups <= 1 before calling.
 */
static inline int test_root(ext4_group_t a, int b)
{
	while (1) {
		if (a < b)
			return a == 1;
		if (a % b)
			return 0;
		a /= b;
	}
}

854
static int ext4_group_sparse(ext4_group_t group)
855 856 857 858 859 860 861 862 863 864
{
	if (group <= 1)
		return 1;
	if (!(group & 1))
		return 0;
	return (test_root(group, 7) || test_root(group, 5) ||
		test_root(group, 3));
}

/**
865
 *	ext4_bg_has_super - number of blocks used by the superblock in group
866 867 868 869 870 871
 *	@sb: superblock for filesystem
 *	@group: group number to check
 *
 *	Return the number of blocks used by the superblock (primary or backup)
 *	in this group.  Currently this will be only 0 or 1.
 */
872
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
873
{
874 875 876
	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
				EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
			!ext4_group_sparse(group))
877 878 879 880
		return 0;
	return 1;
}

881 882
/*
 * META_BG layout: each metagroup keeps its single descriptor block in
 * its first, second and last groups.  Return 1 for those groups, else 0.
 */
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
					ext4_group_t group)
{
	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
	ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
	ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;

	return (group == first || group == first + 1 || group == last) ? 1 : 0;
}

893 894
/*
 * Old-style layout: a group that holds a superblock backup also holds a
 * full copy of the group descriptor table (s_gdb_count blocks).
 */
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
					ext4_group_t group)
{
	if (!ext4_bg_has_super(sb, group))
		return 0;
	return EXT4_SB(sb)->s_gdb_count;
}

/**
900
 *	ext4_bg_num_gdb - number of blocks used by the group table in group
901 902 903 904 905 906 907
 *	@sb: superblock for filesystem
 *	@group: group number to check
 *
 *	Return the number of blocks used by the group descriptor table
 *	(primary or backup) in this group.  In the future there may be a
 *	different number of descriptor blocks in each group.
 */
908
unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
909 910
{
	unsigned long first_meta_bg =
911 912
			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
913

914
	if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
915
			metagroup < first_meta_bg)
916
		return ext4_bg_num_gdb_nometa(sb, group);
917

918
	return ext4_bg_num_gdb_meta(sb,group);
919 920

}
921