/*
 *  linux/fs/ext4/ialloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  BSD ufs-inspired inode and directory allocation by
 *  Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <asm/byteorder.h>

#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>

/*
 * ialloc.c contains the inode allocation and deallocation routines
 */

/*
 * The free inodes are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.
 */
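
/*
 * Inode numbers are 1-based: for inode number ino, the owning group is
 * (ino - 1) / EXT4_INODES_PER_GROUP(sb) and the bit offset within that
 * group's inode bitmap is (ino - 1) % EXT4_INODES_PER_GROUP(sb).  This
 * mapping is used throughout this file.
 */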

/*
 * To avoid calling the atomic setbit hundreds or thousands of times, we only
 * need to use it within a single byte (to ensure we get endianness right).
 * We can use memset for the rest of the bitmap as there are no other users.
 */
void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
	int i;

	if (start_bit >= end_bit)
		return;

	ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
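	/*
	 * Worked example: with start_bit = 12, (start_bit + 7) & ~7UL is 16,
	 * so the loop sets bits 12..15 individually; memset() then fills the
	 * remaining (end_bit - 16) / 8 whole bytes with 0xff.
	 */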
	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
		ext4_set_bit(i, bitmap);
	if (i < end_bit)
		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}

/* Initializes an uninitialized inode bitmap */
static unsigned ext4_init_inode_bitmap(struct super_block *sb,
				       struct buffer_head *bh,
				       ext4_group_t block_group,
				       struct ext4_group_desc *gdp)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	J_ASSERT_BH(bh, buffer_locked(bh));

	/* If checksum is bad mark all blocks and inodes used to prevent
	 * allocation, essentially implementing a per-group read-only flag. */
	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
		ext4_error(sb, "Checksum bad for group %u", block_group);
		ext4_free_group_clusters_set(sb, gdp, 0);
		ext4_free_inodes_set(sb, gdp, 0);
		ext4_itable_unused_set(sb, gdp, 0);
		memset(bh->b_data, 0xff, sb->s_blocksize);
		return 0;
	}

	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
			bh->b_data);

	return EXT4_INODES_PER_GROUP(sb);
}

/*
 * Read the inode allocation bitmap for a given block_group, reading
 * into the specified slot in the superblock's bitmap cache.
 *
 * Return buffer_head of bitmap on success or NULL.
 */
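/*
 * Note that bitmap_uptodate() is checked twice below - once without the
 * buffer lock and once under it - so the common case returns without
 * blocking, while a reader that raced with an initializer does not
 * re-read a bitmap that became valid while it waited for the lock.
 */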
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
	struct ext4_group_desc *desc;
	struct buffer_head *bh = NULL;
	ext4_fsblk_t bitmap_blk;

	desc = ext4_get_group_desc(sb, block_group, NULL);
	if (!desc)
		return NULL;

	bitmap_blk = ext4_inode_bitmap(sb, desc);
	bh = sb_getblk(sb, bitmap_blk);
	if (unlikely(!bh)) {
		ext4_error(sb, "Cannot read inode bitmap - "
			    "block_group = %u, inode_bitmap = %llu",
			    block_group, bitmap_blk);
		return NULL;
	}
	if (bitmap_uptodate(bh))
		return bh;

	lock_buffer(bh);
	if (bitmap_uptodate(bh)) {
		unlock_buffer(bh);
		return bh;
	}

	ext4_lock_group(sb, block_group);
	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
		ext4_init_inode_bitmap(sb, bh, block_group, desc);
		set_bitmap_uptodate(bh);
		set_buffer_uptodate(bh);
		ext4_unlock_group(sb, block_group);
		unlock_buffer(bh);
		return bh;
	}
	ext4_unlock_group(sb, block_group);

	if (buffer_uptodate(bh)) {
		/*
		 * If not uninit, and bh is uptodate, then the on-disk
		 * bitmap is also uptodate.
		 */
		set_bitmap_uptodate(bh);
		unlock_buffer(bh);
		return bh;
	}
	/*
	 * submit the buffer_head for read. We can
	 * safely mark the bitmap as uptodate now.
	 * We do it here so the bitmap uptodate bit
	 * gets set with the buffer lock held.
	 */
	trace_ext4_load_inode_bitmap(sb, block_group);
	set_bitmap_uptodate(bh);
	if (bh_submit_read(bh) < 0) {
		put_bh(bh);
		ext4_error(sb, "Cannot read inode bitmap - "
			    "block_group = %u, inode_bitmap = %llu",
			    block_group, bitmap_blk);
		return NULL;
	}
	return bh;
}

/*
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
 * race conditions we have to worry about. The inode
 * is not on the hash-lists, and it cannot be reached
 * through the filesystem because the directory entry
 * has been deleted earlier.
 *
 * HOWEVER: we must make sure that we get no aliases,
 * which means that we have to call "clear_inode()"
 * _before_ we mark the inode not in use in the inode
 * bitmaps. Otherwise a newly created file might use
 * the same inode number (not actually the same pointer
 * though), and then we'd have two inodes sharing the
 * same inode number and space on the hard disk.
 */
void ext4_free_inode(handle_t *handle, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	int is_directory;
	unsigned long ino;
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *bh2;
	ext4_group_t block_group;
	unsigned long bit;
	struct ext4_group_desc *gdp;
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi;
	int fatal = 0, err, count, cleared;

	if (atomic_read(&inode->i_count) > 1) {
		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
		       atomic_read(&inode->i_count));
		return;
	}
	if (inode->i_nlink) {
		printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
		       inode->i_nlink);
		return;
	}
	if (!sb) {
		printk(KERN_ERR "ext4_free_inode: inode on "
		       "nonexistent device\n");
		return;
	}
	sbi = EXT4_SB(sb);

	ino = inode->i_ino;
	ext4_debug("freeing inode %lu\n", ino);
	trace_ext4_free_inode(inode);

	/*
	 * Note: we must free any quota before locking the superblock,
	 * as writing the quota to disk may need the lock as well.
	 */
	dquot_initialize(inode);
	ext4_xattr_delete_inode(handle, inode);
	dquot_free_inode(inode);
	dquot_drop(inode);

	is_directory = S_ISDIR(inode->i_mode);

	/* Do this BEFORE marking the inode not in use or returning an error */
	ext4_clear_inode(inode);

	es = EXT4_SB(sb)->s_es;
	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
		goto error_return;
	}
	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	if (!bitmap_bh)
		goto error_return;

	BUFFER_TRACE(bitmap_bh, "get_write_access");
	fatal = ext4_journal_get_write_access(handle, bitmap_bh);
	if (fatal)
		goto error_return;

	fatal = -ESRCH;
	gdp = ext4_get_group_desc(sb, block_group, &bh2);
	if (gdp) {
		BUFFER_TRACE(bh2, "get_write_access");
		fatal = ext4_journal_get_write_access(handle, bh2);
	}
	ext4_lock_group(sb, block_group);
	cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
	if (fatal || !cleared) {
		ext4_unlock_group(sb, block_group);
		goto out;
	}

	count = ext4_free_inodes_count(sb, gdp) + 1;
	ext4_free_inodes_set(sb, gdp, count);
	if (is_directory) {
		count = ext4_used_dirs_count(sb, gdp) - 1;
		ext4_used_dirs_set(sb, gdp, count);
		percpu_counter_dec(&sbi->s_dirs_counter);
	}
	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
	ext4_unlock_group(sb, block_group);

	percpu_counter_inc(&sbi->s_freeinodes_counter);
	if (sbi->s_log_groups_per_flex) {
		ext4_group_t f = ext4_flex_group(sbi, block_group);

		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
		if (is_directory)
			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
	}
	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
	if (cleared) {
		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
		if (!fatal)
			fatal = err;
		ext4_mark_super_dirty(sb);
	} else
		ext4_error(sb, "bit already cleared for inode %lu", ino);

error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, fatal);
}

struct orlov_stats {
	__u32 free_inodes;
	__u32 free_clusters;
	__u32 used_dirs;
};

/*
 * Helper function for Orlov's allocator; returns critical information
 * for a particular block group or flex_bg.  If flex_size is 1, then g
 * is a block group number; otherwise it is flex_bg number.
 */
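/*
 * For example, with flex_size = 16 the stats for g summarize block groups
 * 16*g through 16*g + 15; with flex_size = 1 they describe block group g
 * alone.
 */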
static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
			    int flex_size, struct orlov_stats *stats)
{
	struct ext4_group_desc *desc;
	struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;

	if (flex_size > 1) {
		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
		stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
		return;
	}

	desc = ext4_get_group_desc(sb, g, NULL);
	if (desc) {
		stats->free_inodes = ext4_free_inodes_count(sb, desc);
		stats->free_clusters = ext4_free_group_clusters(sb, desc);
		stats->used_dirs = ext4_used_dirs_count(sb, desc);
	} else {
		stats->free_inodes = 0;
		stats->free_clusters = 0;
		stats->used_dirs = 0;
	}
}

/*
 * Orlov's allocator for directories.
 *
 * We always try to spread first-level directories.
 *
 * If there are blockgroups with both free inodes and free blocks counts
 * not worse than average we return one with smallest directory count.
 * Otherwise we simply return a random group.
 *
 * For the remaining cases, the rules are:
 *
 * It's OK to put a directory into a group unless
 * it has too many directories already (max_dirs) or
 * it has too few free inodes left (min_inodes) or
 * it has too few free blocks left (min_blocks).
 * Parent's group is preferred; if it doesn't satisfy these
 * conditions we search cyclically through the rest. If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
 */
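
/*
 * Illustration of the thresholds computed below, assuming flex_size = 1,
 * 100 groups, 8192 inodes per group, 200 directories and 400000 free
 * inodes: avefreei = 4000, max_dirs = 200/100 + 8192/16 = 514 and
 * min_inodes = 4000 - 8192/4 = 1952.
 */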

static int find_group_orlov(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode,
			    const struct qstr *qstr)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
	unsigned int freei, avefreei, grp_free;
	ext4_fsblk_t freeb, avefreec;
	unsigned int ndirs;
	int max_dirs, min_inodes;
	ext4_grpblk_t min_clusters;
	ext4_group_t i, grp, g, ngroups;
	struct ext4_group_desc *desc;
	struct orlov_stats stats;
	int flex_size = ext4_flex_bg_size(sbi);
	struct dx_hash_info hinfo;

	ngroups = real_ngroups;
	if (flex_size > 1) {
		ngroups = (real_ngroups + flex_size - 1) >>
			sbi->s_log_groups_per_flex;
		parent_group >>= sbi->s_log_groups_per_flex;
	}

	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
	avefreei = freei / ngroups;
	freeb = EXT4_C2B(sbi,
		percpu_counter_read_positive(&sbi->s_freeclusters_counter));
	avefreec = freeb;
	do_div(avefreec, ngroups);
	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

	if (S_ISDIR(mode) &&
	    ((parent == sb->s_root->d_inode) ||
	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
		int best_ndir = inodes_per_group;
		int ret = -1;

		if (qstr) {
			hinfo.hash_version = DX_HASH_HALF_MD4;
			hinfo.seed = sbi->s_hash_seed;
			ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
			grp = hinfo.hash;
		} else
			get_random_bytes(&grp, sizeof(grp));
		parent_group = (unsigned)grp % ngroups;
		for (i = 0; i < ngroups; i++) {
			g = (parent_group + i) % ngroups;
			get_orlov_stats(sb, g, flex_size, &stats);
			if (!stats.free_inodes)
				continue;
			if (stats.used_dirs >= best_ndir)
				continue;
			if (stats.free_inodes < avefreei)
				continue;
			if (stats.free_clusters < avefreec)
				continue;
			grp = g;
			ret = 0;
			best_ndir = stats.used_dirs;
		}
		if (ret)
			goto fallback;
	found_flex_bg:
		if (flex_size == 1) {
			*group = grp;
			return 0;
		}

		/*
		 * We pack inodes at the beginning of the flexgroup's
		 * inode tables.  Block allocation decisions will do
		 * something similar, although regular files will
		 * start at 2nd block group of the flexgroup.  See
		 * ext4_ext_find_goal() and ext4_find_near().
		 */
		grp *= flex_size;
		for (i = 0; i < flex_size; i++) {
			if (grp+i >= real_ngroups)
				break;
			desc = ext4_get_group_desc(sb, grp+i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = grp+i;
				return 0;
			}
		}
		goto fallback;
	}

	max_dirs = ndirs / ngroups + inodes_per_group / 16;
	min_inodes = avefreei - inodes_per_group*flex_size / 4;
	if (min_inodes < 1)
		min_inodes = 1;
	min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;

	/*
	 * Start looking in the flex group where we last allocated an
	 * inode for this parent directory
	 */
	if (EXT4_I(parent)->i_last_alloc_group != ~0) {
		parent_group = EXT4_I(parent)->i_last_alloc_group;
		if (flex_size > 1)
			parent_group >>= sbi->s_log_groups_per_flex;
	}

	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		get_orlov_stats(sb, grp, flex_size, &stats);
		if (stats.used_dirs >= max_dirs)
			continue;
		if (stats.free_inodes < min_inodes)
			continue;
		if (stats.free_clusters < min_clusters)
			continue;
		goto found_flex_bg;
	}

fallback:
	ngroups = real_ngroups;
	avefreei = freei / ngroups;
fallback_retry:
	parent_group = EXT4_I(parent)->i_block_group;
	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		desc = ext4_get_group_desc(sb, grp, NULL);
		/* check desc before reading it - it may be NULL */
		if (desc) {
			grp_free = ext4_free_inodes_count(sb, desc);
			if (grp_free && grp_free >= avefreei) {
				*group = grp;
				return 0;
			}
		}
	}

	if (avefreei) {
		/*
		 * The free-inodes counter is approximate, and for really small
		 * filesystems the above test can fail to find any blockgroups
		 */
		avefreei = 0;
		goto fallback_retry;
	}

	return -1;
}

static int find_group_other(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
	struct ext4_group_desc *desc;
	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

	/*
	 * Try to place the inode in the same flex group as its
	 * parent.  If we can't find space, use the Orlov algorithm to
	 * find another flex group, and store that information in the
	 * parent directory's inode information so that future
	 * allocations use that flex group.
	 */
	if (flex_size > 1) {
		int retry = 0;

	try_again:
		parent_group &= ~(flex_size-1);
		last = parent_group + flex_size;
		if (last > ngroups)
			last = ngroups;
		for (i = parent_group; i < last; i++) {
			desc = ext4_get_group_desc(sb, i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = i;
				return 0;
			}
		}
		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
			retry = 1;
			parent_group = EXT4_I(parent)->i_last_alloc_group;
			goto try_again;
		}
		/*
		 * If this didn't work, use the Orlov search algorithm
		 * to find a new flex group; we pass in the mode to
		 * avoid the topdir algorithms.
		 */
		*group = parent_group + flex_size;
		if (*group > ngroups)
			*group = 0;
		return find_group_orlov(sb, parent, group, mode, NULL);
	}

	/*
	 * Try to place the inode in its parent directory
	 */
	*group = parent_group;
	desc = ext4_get_group_desc(sb, *group, NULL);
	if (desc && ext4_free_inodes_count(sb, desc) &&
	    ext4_free_group_clusters(sb, desc))
		return 0;

	/*
	 * We're going to place this inode in a different blockgroup from its
	 * parent.  We want to cause files in a common directory to all land in
	 * the same blockgroup.  But we want files which are in a different
	 * directory which shares a blockgroup with our parent to land in a
	 * different blockgroup.
	 *
	 * So add our directory's i_ino into the starting point for the hash.
	 */
	*group = (*group + parent->i_ino) % ngroups;

	/*
	 * Use a quadratic hash to find a group with a free inode and some free
	 * blocks.
	 */
	for (i = 1; i < ngroups; i <<= 1) {
		*group += i;
		if (*group >= ngroups)
			*group -= ngroups;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc) &&
		    ext4_free_group_clusters(sb, desc))
			return 0;
	}

	/*
	 * That failed: try linear search for a free inode, even if that group
	 * has no free blocks.
	 */
	*group = parent_group;
	for (i = 0; i < ngroups; i++) {
		if (++*group >= ngroups)
			*group = 0;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc))
			return 0;
	}

	return -1;
}

/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
 * the groups with above-average free space, that group with the fewest
 * directories already is chosen.
 *
 * For other inodes, search forward from the parent directory's block
 * group to find a free inode.
 */
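
/*
 * Note on the quadratic probe in find_group_other(): the offsets tried
 * from the hashed starting group grow as 1, 2, 4, 8, ... (i <<= 1), so
 * for ngroups = 32 at most five extra groups are probed before falling
 * back to the linear scan.
 */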
struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
			     const struct qstr *qstr, __u32 goal, uid_t *owner)
{
	struct super_block *sb;
	struct buffer_head *inode_bitmap_bh = NULL;
	struct buffer_head *group_desc_bh;
	ext4_group_t ngroups, group = 0;
	unsigned long ino = 0;
	struct inode *inode;
	struct ext4_group_desc *gdp = NULL;
	struct ext4_inode_info *ei;
	struct ext4_sb_info *sbi;
	int ret2, err = 0;
	struct inode *ret;
	ext4_group_t i;
	ext4_group_t flex_group;

	/* Cannot create files in a deleted directory */
	if (!dir || !dir->i_nlink)
		return ERR_PTR(-EPERM);

	sb = dir->i_sb;
	ngroups = ext4_get_groups_count(sb);
	trace_ext4_request_inode(dir, mode);
	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	ei = EXT4_I(inode);
	sbi = EXT4_SB(sb);

	if (!goal)
		goal = sbi->s_inode_goal;

	if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
		group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
		ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
		ret2 = 0;
		goto got_group;
	}

	if (S_ISDIR(mode))
		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
	else
		ret2 = find_group_other(sb, dir, &group, mode);

got_group:
	EXT4_I(dir)->i_last_alloc_group = group;
	err = -ENOSPC;
	if (ret2 == -1)
		goto out;

	/*
	 * Normally we will only go through one pass of this loop,
	 * unless we get unlucky and it turns out the group we selected
	 * had its last inode grabbed by someone else.
	 */
	for (i = 0; i < ngroups; i++, ino = 0) {
		err = -EIO;

		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
		if (!gdp)
			goto fail;

		brelse(inode_bitmap_bh);
		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
		if (!inode_bitmap_bh)
			goto fail;

repeat_in_this_group:
		ino = ext4_find_next_zero_bit((unsigned long *)
					      inode_bitmap_bh->b_data,
					      EXT4_INODES_PER_GROUP(sb), ino);
		if (ino >= EXT4_INODES_PER_GROUP(sb)) {
			if (++group == ngroups)
				group = 0;
			continue;
		}
		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
			ext4_error(sb, "reserved inode found cleared - "
				   "inode=%lu", ino + 1);
			continue;
		}
		ext4_lock_group(sb, group);
		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
		ext4_unlock_group(sb, group);
		ino++;		/* the inode bitmap is zero-based */
		if (!ret2)
			goto got; /* we grabbed the inode! */
		if (ino < EXT4_INODES_PER_GROUP(sb))
			goto repeat_in_this_group;
	}
	err = -ENOSPC;
	goto out;
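
	/*
	 * The "got" label below is reached only after ext4_test_and_set_bit()
	 * has claimed the bitmap bit under the group lock, so no other CPU
	 * can hand out the same inode; everything past this point only
	 * updates accounting and initializes the in-core inode.
	 */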

got:
	/* We may have to initialize the block bitmap if it isn't already */
	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
		struct buffer_head *block_bitmap_bh;

		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
		if (err) {
			brelse(block_bitmap_bh);
			goto fail;
		}

		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
		err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
		brelse(block_bitmap_bh);

		/* recheck and clear flag under lock if we still need to */
		ext4_lock_group(sb, group);
		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
			ext4_free_group_clusters_set(sb, gdp,
				ext4_free_clusters_after_init(sb, group, gdp));
			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
								gdp);
		}
		ext4_unlock_group(sb, group);

		if (err)
			goto fail;
	}

	BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
	if (err)
		goto fail;

	BUFFER_TRACE(group_desc_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, group_desc_bh);
	if (err)
		goto fail;

	/* Update the relevant bg descriptor fields */
	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
		int free;
		struct ext4_group_info *grp = ext4_get_group_info(sb, group);

		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
		ext4_lock_group(sb, group); /* while we modify the bg desc */
		free = EXT4_INODES_PER_GROUP(sb) -
			ext4_itable_unused_count(sb, gdp);
		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
			free = 0;
		}
		/*
		 * Check the relative inode number against the last used
		 * relative inode number in this group. If it is greater,
		 * we need to update the bg_itable_unused count.
		 */
		if (ino > free)
			ext4_itable_unused_set(sb, gdp,
					(EXT4_INODES_PER_GROUP(sb) - ino));
		up_read(&grp->alloc_sem);
	}
	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
	if (S_ISDIR(mode)) {
		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
		if (sbi->s_log_groups_per_flex) {
			ext4_group_t f = ext4_flex_group(sbi, group);

			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
		}
	}
	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
		ext4_unlock_group(sb, group);
	}

	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
	if (err)
		goto fail;

	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
	if (err)
		goto fail;

	percpu_counter_dec(&sbi->s_freeinodes_counter);
	if (S_ISDIR(mode))
		percpu_counter_inc(&sbi->s_dirs_counter);
	ext4_mark_super_dirty(sb);

	if (sbi->s_log_groups_per_flex) {
		flex_group = ext4_flex_group(sbi, group);
		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
	}
	if (owner) {
		inode->i_mode = mode;
		inode->i_uid = owner[0];
		inode->i_gid = owner[1];
	} else if (test_opt(sb, GRPID)) {
		inode->i_mode = mode;
		inode->i_uid = current_fsuid();
		inode->i_gid = dir->i_gid;
	} else
		inode_init_owner(inode, dir, mode);

	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
	/* This is the optimal IO size (for stat), not the fs block size */
	inode->i_blocks = 0;
	inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
						       ext4_current_time(inode);

	memset(ei->i_data, 0, sizeof(ei->i_data));
	ei->i_dir_start_lookup = 0;
	ei->i_disksize = 0;

	/* Don't inherit extent flag from directory, amongst others. */
	ei->i_flags =
		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
	ei->i_file_acl = 0;
	ei->i_dtime = 0;
	ei->i_block_group = group;
	ei->i_last_alloc_group = ~0;

	ext4_set_inode_flags(inode);
	if (IS_DIRSYNC(inode))
		ext4_handle_sync(handle);
	if (insert_inode_locked(inode) < 0) {
		/*
		 * Likely a bitmap corruption causing inode to be allocated
		 * twice.
		 */
		err = -EIO;
		goto fail;
	}
	spin_lock(&sbi->s_next_gen_lock);
	inode->i_generation = sbi->s_next_generation++;
	spin_unlock(&sbi->s_next_gen_lock);

	ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
	ext4_set_inode_state(inode, EXT4_STATE_NEW);

	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;

	ret = inode;
	dquot_initialize(inode);
	err = dquot_alloc_inode(inode);
	if (err)
		goto fail_drop;

	err = ext4_init_acl(handle, inode, dir);
	if (err)
		goto fail_free_drop;

	err = ext4_init_security(handle, inode, dir, qstr);
	if (err)
		goto fail_free_drop;

	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
		/* set extent flag only for directory, file and normal symlink*/
		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
			ext4_ext_tree_init(handle, inode);
		}
	}

	if (ext4_handle_valid(handle)) {
		ei->i_sync_tid = handle->h_transaction->t_tid;
		ei->i_datasync_tid = handle->h_transaction->t_tid;
	}

	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_std_error(sb, err);
		goto fail_free_drop;
	}

	ext4_debug("allocating inode %lu\n", inode->i_ino);
	trace_ext4_allocate_inode(inode, dir, mode);
	goto really_out;
fail:
	ext4_std_error(sb, err);
out:
	iput(inode);
	ret = ERR_PTR(err);
really_out:
	brelse(inode_bitmap_bh);
	return ret;

fail_free_drop:
	dquot_free_inode(inode);

fail_drop:
	dquot_drop(inode);
	inode->i_flags |= S_NOQUOTA;
	clear_nlink(inode);
	unlock_new_inode(inode);
	iput(inode);
	brelse(inode_bitmap_bh);
	return ERR_PTR(err);
}

/* Verify that we are loading a valid orphan from disk */
struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
{
	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
	ext4_group_t block_group;
	int bit;
	struct buffer_head *bitmap_bh;
	struct inode *inode = NULL;
	long err = -EIO;

	/* Error cases - e2fsck has already cleaned up for us */
	if (ino > max_ino) {
		ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
		goto error;
	}

	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	if (!bitmap_bh) {
		ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
		goto error;
	}

	/* Having the inode bit set should be a 100% indicator that this
	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
	 * inodes that were being truncated, so we can't check i_nlink==0.
	 */
	if (!ext4_test_bit(bit, bitmap_bh->b_data))
		goto bad_orphan;

	inode = ext4_iget(sb, ino);
	if (IS_ERR(inode))
		goto iget_failed;

	/*
	 * If the orphan has i_nlink > 0 then it should be able to be
	 * truncated, otherwise it won't be removed from the orphan list
	 * during processing and an infinite loop will result.
	 */
	if (inode->i_nlink && !ext4_can_truncate(inode))
		goto bad_orphan;

	if (NEXT_ORPHAN(inode) > max_ino)
		goto bad_orphan;
	brelse(bitmap_bh);
	return inode;

iget_failed:
	err = PTR_ERR(inode);
	inode = NULL;
bad_orphan:
	ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
	printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
	       bit, (unsigned long long)bitmap_bh->b_blocknr,
	       ext4_test_bit(bit, bitmap_bh->b_data));
	printk(KERN_NOTICE "inode=%p\n", inode);
	if (inode) {
		printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
		       is_bad_inode(inode));
		printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
		       NEXT_ORPHAN(inode));
		printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
		printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
		/* Avoid freeing blocks if we got a bad deleted inode */
		if (inode->i_nlink == 0)
			inode->i_blocks = 0;
		iput(inode);
	}
	brelse(bitmap_bh);
error:
	return ERR_PTR(err);
}

unsigned long ext4_count_free_inodes(struct super_block *sb)
{
	unsigned long desc_count;
	struct ext4_group_desc *gdp;
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
#ifdef EXT4FS_DEBUG
	struct ext4_super_block *es;
	unsigned long bitmap_count, x;
	struct buffer_head *bitmap_bh = NULL;

	es = EXT4_SB(sb)->s_es;
	desc_count = 0;
	bitmap_count = 0;
	gdp = NULL;
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += ext4_free_inodes_count(sb, gdp);
		brelse(bitmap_bh);
		bitmap_bh = ext4_read_inode_bitmap(sb, i);
		if (!bitmap_bh)
			continue;

		x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
			(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
		bitmap_count += x;
	}
	brelse(bitmap_bh);
	printk(KERN_DEBUG "ext4_count_free_inodes: "
	       "stored = %u, computed = %lu, %lu\n",
	       le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
	return desc_count;
#else
	desc_count = 0;
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += ext4_free_inodes_count(sb, gdp);
		cond_resched();
	}
	return desc_count;
#endif
}

/* Called at mount-time, super-block is locked */
unsigned long ext4_count_dirs(struct super_block * sb)
{
	unsigned long count = 0;
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);

	for (i = 0; i < ngroups; i++) {
		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		count += ext4_used_dirs_count(sb, gdp);
	}
	return count;
}

/*
 * Zeroes the not yet zeroed inode table - just writes zeroes through the
 * whole inode table. Must be called without any spinlock held. The only
 * place where it is called from on an active part of the filesystem is the
 * ext4lazyinit thread, so we do not need any special locks; however, we
 * have to prevent inode allocation from the current group, so we take the
 * alloc_sem lock to block ext4_new_inode() until we are finished.
 */
int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
				 int barrier)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_group_desc *gdp = NULL;
	struct buffer_head *group_desc_bh;
	handle_t *handle;
	ext4_fsblk_t blk;
	int num, ret = 0, used_blks = 0;

	/* This should not happen, but just to be sure check this */
	if (sb->s_flags & MS_RDONLY) {
		ret = 1;
		goto out;
	}

	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
	if (!gdp)
		goto out;

	/*
	 * We do not need to lock this, because we are the only one
	 * handling this flag.
	 */
	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
		goto out;

	handle = ext4_journal_start_sb(sb, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	down_write(&grp->alloc_sem);
	/*
	 * If inode bitmap was already initialized there may be some
	 * used inodes so we need to skip blocks with used inodes in
	 * inode table.
	 */
	if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
		used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
			    ext4_itable_unused_count(sb, gdp)),
			    sbi->s_inodes_per_block);
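	/*
	 * Example, assuming 8192 inodes per group and 16 inodes per block:
	 * with bg_itable_unused = 8000, used_blks = DIV_ROUND_UP(192, 16)
	 * = 12, so zeroing starts at the 13th block of this group's inode
	 * table.
	 */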

	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
		ext4_error(sb, "Something is wrong with group %u: "
			   "used itable blocks: %d; "
			   "itable unused count: %u",
			   group, used_blks,
			   ext4_itable_unused_count(sb, gdp));
		ret = 1;
		goto err_out;
	}

	blk = ext4_inode_table(sb, gdp) + used_blks;
	num = sbi->s_itb_per_group - used_blks;

	BUFFER_TRACE(group_desc_bh, "get_write_access");
	ret = ext4_journal_get_write_access(handle,
					    group_desc_bh);
	if (ret)
		goto err_out;

	/*
	 * Skip zeroout if the inode table is full. But we set the ZEROED
	 * flag anyway, because obviously, when it is full it does not need
	 * further zeroing.
	 */
	if (unlikely(num == 0))
		goto skip_zeroout;

	ext4_debug("going to zero out inode table in group %d\n",
		   group);
	ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
	if (ret < 0)
		goto err_out;
	if (barrier)
		blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);

skip_zeroout:
	ext4_lock_group(sb, group);
	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
	ext4_unlock_group(sb, group);

	BUFFER_TRACE(group_desc_bh,
		     "call ext4_handle_dirty_metadata");
	ret = ext4_handle_dirty_metadata(handle, NULL,
					 group_desc_bh);

err_out:
	up_write(&grp->alloc_sem);
	ext4_journal_stop(handle);
out:
	return ret;
}