/*
 *  linux/fs/ext4/namei.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *  Directory entry file type support and forward compatibility hooks
 *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
 *  Hash Tree Directory indexing (c)
 *	Daniel Phillips, 2001
 *  Hash Tree Directory indexing porting
 *	Christopher Li, 2002
 *  Hash Tree Directory indexing cleanup
 *	Theodore Ts'o, 2002
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/jbd2.h>
#include <linux/time.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include "ext4.h"
#include "ext4_jbd2.h"

#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>
/*
 * define how far ahead to read directories while searching them.
 */
#define NAMEI_RA_CHUNKS  2
#define NAMEI_RA_BLOCKS  4
#define NAMEI_RA_SIZE	     (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))

static struct buffer_head *ext4_append(handle_t *handle,
					struct inode *inode,
					ext4_lblk_t *block, int *err)
{
	struct buffer_head *bh;

	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;

	bh = ext4_bread(handle, inode, *block, 1, err);
	if (bh) {
		inode->i_size += inode->i_sb->s_blocksize;
		EXT4_I(inode)->i_disksize = inode->i_size;
		*err = ext4_journal_get_write_access(handle, bh);
		if (*err) {
			brelse(bh);
			bh = NULL;
		}
	}
	return bh;
}

#ifndef assert
#define assert(test) J_ASSERT(test)
#endif

#ifdef DX_DEBUG
#define dxtrace(command) command
#else
#define dxtrace(command)
#endif

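/*
 * On-disk layout of the htree directory index.  A fake_dirent mimics a
 * regular directory entry so that the index root still looks like an
 * ordinary directory block containing only "." and ".." to code that
 * does not understand htree.
 */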
struct fake_dirent
{
	__le32 inode;
	__le16 rec_len;
	u8 name_len;
	u8 file_type;
};

struct dx_countlimit
{
	__le16 limit;
	__le16 count;
};

struct dx_entry
{
	__le32 hash;
	__le32 block;
};

/*
 * dx_root_info is laid out so that if it should somehow get overlaid by a
 * dirent the two low bits of the hash version will be zero.  Therefore, the
 * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
 */

struct dx_root
{
	struct fake_dirent dot;
	char dot_name[4];
	struct fake_dirent dotdot;
	char dotdot_name[4];
	struct dx_root_info
	{
		__le32 reserved_zero;
		u8 hash_version;
		u8 info_length; /* 8 */
		u8 indirect_levels;
		u8 unused_flags;
	}
	info;
	struct dx_entry	entries[0];
};

struct dx_node
{
	struct fake_dirent fake;
	struct dx_entry	entries[0];
};


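/*
 * One level of an index walk: the buffer holding the index block, its
 * entry array, and the entry currently being examined.
 */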
struct dx_frame
{
	struct buffer_head *bh;
	struct dx_entry *entries;
	struct dx_entry *at;
};

struct dx_map_entry
{
	u32 hash;
	u16 offs;
	u16 size;
};

/*
 * This goes at the end of each htree block.
 */
struct dx_tail {
	u32 dt_reserved;
	__le32 dt_checksum;	/* crc32c(uuid+inum+dirblock) */
};

static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
static void dx_set_hash(struct dx_entry *entry, unsigned value);
static unsigned dx_get_count(struct dx_entry *entries);
static unsigned dx_get_limit(struct dx_entry *entries);
static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
static struct dx_frame *dx_probe(const struct qstr *d_name,
				 struct inode *dir,
				 struct dx_hash_info *hinfo,
				 struct dx_frame *frame,
				 int *err);
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
		struct dx_map_entry *offsets, int count, unsigned blocksize);
static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
static void dx_insert_block(struct dx_frame *frame,
					u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
				 struct dx_frame *frame,
				 struct dx_frame *frames,
				 __u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
		const struct qstr *d_name,
		struct ext4_dir_entry_2 **res_dir,
		int *err);
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
			     struct inode *inode);

/* checksumming functions */
#define EXT4_DIRENT_TAIL(block, blocksize) \
	((struct ext4_dir_entry_tail *)(((void *)(block)) + \
					((blocksize) - \
					 sizeof(struct ext4_dir_entry_tail))))

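/*
 * Initialize the fake dirent at the end of a leaf block that carries the
 * directory-block checksum: zero inode, rec_len covering only the tail,
 * and the reserved EXT4_FT_DIR_CSUM file type.
 */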
static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
				   unsigned int blocksize)
{
	memset(t, 0, sizeof(struct ext4_dir_entry_tail));
	t->det_rec_len = ext4_rec_len_to_disk(
			sizeof(struct ext4_dir_entry_tail), blocksize);
	t->det_reserved_ft = EXT4_FT_DIR_CSUM;
}

/* Walk through a dirent block to find a checksum "dirent" at the tail */
static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
						   struct ext4_dir_entry *de)
{
	struct ext4_dir_entry_tail *t;

#ifdef PARANOID
	struct ext4_dir_entry *d, *top;

	d = de;
	top = (struct ext4_dir_entry *)(((void *)de) +
		(EXT4_BLOCK_SIZE(inode->i_sb) -
		sizeof(struct ext4_dir_entry_tail)));
	while (d < top && d->rec_len)
		d = (struct ext4_dir_entry *)(((void *)d) +
		    le16_to_cpu(d->rec_len));

	if (d != top)
		return NULL;

	t = (struct ext4_dir_entry_tail *)d;
#else
	t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
#endif

	if (t->det_reserved_zero1 ||
	    le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
	    t->det_reserved_zero2 ||
	    t->det_reserved_ft != EXT4_FT_DIR_CSUM)
		return NULL;

	return t;
}

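/* Checksum of a directory block's contents, seeded with the per-inode seed. */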
static __le32 ext4_dirent_csum(struct inode *inode,
			       struct ext4_dir_entry *dirent, int size)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);
	__u32 csum;

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
	return cpu_to_le32(csum);
}

int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
{
	struct ext4_dir_entry_tail *t;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return 1;

	t = get_dirent_tail(inode, dirent);
	if (!t) {
		EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
				 "leaf for checksum.  Please run e2fsck -D.");
		return 0;
	}

	if (t->det_checksum != ext4_dirent_csum(inode, dirent,
						(void *)t - (void *)dirent))
		return 0;

	return 1;
}

static void ext4_dirent_csum_set(struct inode *inode,
				 struct ext4_dir_entry *dirent)
{
	struct ext4_dir_entry_tail *t;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return;

	t = get_dirent_tail(inode, dirent);
	if (!t) {
		EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
				 "leaf for checksum.  Please run e2fsck -D.");
		return;
	}

	t->det_checksum = ext4_dirent_csum(inode, dirent,
					   (void *)t - (void *)dirent);
}

static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
						struct inode *inode,
						struct buffer_head *bh)
{
	ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
	return ext4_handle_dirty_metadata(handle, inode, bh);
}

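/*
 * Locate the dx_countlimit inside an index block: it follows the fake
 * dirent of an interior node (offset 8) or the dx_root_info of the root
 * block (offset 32).  Returns NULL if the block does not look like an
 * index node.
 */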
static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
					       struct ext4_dir_entry *dirent,
					       int *offset)
{
	struct ext4_dir_entry *dp;
	struct dx_root_info *root;
	int count_offset;

	if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
		count_offset = 8;
	else if (le16_to_cpu(dirent->rec_len) == 12) {
		dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
		if (le16_to_cpu(dp->rec_len) !=
		    EXT4_BLOCK_SIZE(inode->i_sb) - 12)
			return NULL;
		root = (struct dx_root_info *)(((void *)dp + 12));
		if (root->reserved_zero ||
		    root->info_length != sizeof(struct dx_root_info))
			return NULL;
		count_offset = 32;
	} else
		return NULL;

	if (offset)
		*offset = count_offset;
	return (struct dx_countlimit *)(((void *)dirent) + count_offset);
}

static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
			   int count_offset, int count, struct dx_tail *t)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);
	__u32 csum, old_csum;
	int size;

	size = count_offset + (count * sizeof(struct dx_entry));
	old_csum = t->dt_checksum;
	t->dt_checksum = 0;
	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
	csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
	t->dt_checksum = old_csum;

	return cpu_to_le32(csum);
}

static int ext4_dx_csum_verify(struct inode *inode,
			       struct ext4_dir_entry *dirent)
{
	struct dx_countlimit *c;
	struct dx_tail *t;
	int count_offset, limit, count;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return 1;

	c = get_dx_countlimit(inode, dirent, &count_offset);
	if (!c) {
		EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
		return 1;
	}
	limit = le16_to_cpu(c->limit);
	count = le16_to_cpu(c->count);
	if (count_offset + (limit * sizeof(struct dx_entry)) >
	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
		EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
				 "tree checksum found.  Run e2fsck -D.");
		return 1;
	}
	t = (struct dx_tail *)(((struct dx_entry *)c) + limit);

	if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
					    count, t))
		return 0;
	return 1;
}

static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
{
	struct dx_countlimit *c;
	struct dx_tail *t;
	int count_offset, limit, count;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return;

	c = get_dx_countlimit(inode, dirent, &count_offset);
	if (!c) {
		EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
		return;
	}
	limit = le16_to_cpu(c->limit);
	count = le16_to_cpu(c->count);
	if (count_offset + (limit * sizeof(struct dx_entry)) >
	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
		EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
				 "tree checksum.  Run e2fsck -D.");
		return;
	}
	t = (struct dx_tail *)(((struct dx_entry *)c) + limit);

	t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
}

static inline int ext4_handle_dirty_dx_node(handle_t *handle,
					    struct inode *inode,
					    struct buffer_head *bh)
{
	ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
	return ext4_handle_dirty_metadata(handle, inode, bh);
}

/*
 * p is at least 6 bytes before the end of page
 */
static inline struct ext4_dir_entry_2 *
ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
{
	return (struct ext4_dir_entry_2 *)((char *)p +
		ext4_rec_len_from_disk(p->rec_len, blocksize));
}

/*
 * Future: use high four bits of block for coalesce-on-delete flags
 * Mask them off for now.
 */

static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
	return le32_to_cpu(entry->block) & 0x00ffffff;
}

static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
{
	entry->block = cpu_to_le32(value);
}

static inline unsigned dx_get_hash(struct dx_entry *entry)
{
	return le32_to_cpu(entry->hash);
}

static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
{
	entry->hash = cpu_to_le32(value);
}

static inline unsigned dx_get_count(struct dx_entry *entries)
{
	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
}

static inline unsigned dx_get_limit(struct dx_entry *entries)
{
	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
}

static inline void dx_set_count(struct dx_entry *entries, unsigned value)
{
	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
}

static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
{
	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
}

static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
		EXT4_DIR_REC_LEN(2) - infosize;

	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		entry_space -= sizeof(struct dx_tail);
	return entry_space / sizeof(struct dx_entry);
}

static inline unsigned dx_node_limit(struct inode *dir)
{
	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);

	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		entry_space -= sizeof(struct dx_tail);
	return entry_space / sizeof(struct dx_entry);
}

/*
 * Debug
 */
#ifdef DX_DEBUG
static void dx_show_index(char * label, struct dx_entry *entries)
{
	int i, n = dx_get_count (entries);
	printk(KERN_DEBUG "%s index ", label);
	for (i = 0; i < n; i++) {
		printk("%x->%lu ", i ? dx_get_hash(entries + i) :
				0, (unsigned long)dx_get_block(entries + i));
	}
	printk("\n");
}

struct stats
{
	unsigned names;
	unsigned space;
	unsigned bcount;
};

static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
				 int size, int show_names)
{
	unsigned names = 0, space = 0;
	char *base = (char *) de;
	struct dx_hash_info h = *hinfo;

	printk("names: ");
	while ((char *) de < base + size)
	{
		if (de->inode)
		{
			if (show_names)
			{
				int len = de->name_len;
				char *name = de->name;
				while (len--) printk("%c", *name++);
				ext4fs_dirhash(de->name, de->name_len, &h);
				printk(":%x.%u ", h.hash,
				       (unsigned) ((char *) de - base));
			}
			space += EXT4_DIR_REC_LEN(de->name_len);
			names++;
		}
		de = ext4_next_entry(de, size);
	}
	printk("(%i)\n", names);
	return (struct stats) { names, space, 1 };
}

struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
			     struct dx_entry *entries, int levels)
{
	unsigned blocksize = dir->i_sb->s_blocksize;
	unsigned count = dx_get_count(entries), names = 0, space = 0, i;
	unsigned bcount = 0;
	struct buffer_head *bh;
	int err;
	printk("%i indexed blocks...\n", count);
	for (i = 0; i < count; i++, entries++)
	{
		ext4_lblk_t block = dx_get_block(entries);
		ext4_lblk_t hash  = i ? dx_get_hash(entries): 0;
		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
		struct stats stats;
		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
		if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
		stats = levels?
		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
		   dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
		names += stats.names;
		space += stats.space;
		bcount += stats.bcount;
		brelse(bh);
	}
	if (bcount)
		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
		       levels ? "" : "   ", names, space/bcount,
		       (space/bcount)*100/blocksize);
	return (struct stats) { names, space, bcount};
}
#endif /* DX_DEBUG */

/*
 * Probe for a directory leaf block to search.
 *
 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
 * error in the directory index, and the caller should fall back to
 * searching the directory normally.  The callers of dx_probe **MUST**
 * check for this error code, and make sure it never gets reflected
 * back to userspace.
 */
static struct dx_frame *
dx_probe(const struct qstr *d_name, struct inode *dir,
	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
{
	unsigned count, indirect;
	struct dx_entry *at, *entries, *p, *q, *m;
	struct dx_root *root;
	struct buffer_head *bh;
	struct dx_frame *frame = frame_in;
	u32 hash;

	frame->bh = NULL;
	if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
		goto fail;
	root = (struct dx_root *) bh->b_data;
	if (root->info.hash_version != DX_HASH_TEA &&
	    root->info.hash_version != DX_HASH_HALF_MD4 &&
	    root->info.hash_version != DX_HASH_LEGACY) {
		ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
			     root->info.hash_version);
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}
	hinfo->hash_version = root->info.hash_version;
	if (hinfo->hash_version <= DX_HASH_TEA)
		hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
	if (d_name)
		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
	hash = hinfo->hash;

	if (root->info.unused_flags & 1) {
		ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
			     root->info.unused_flags);
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}

	if ((indirect = root->info.indirect_levels) > 1) {
		ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
			     root->info.indirect_levels);
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}

	if (!buffer_verified(bh) &&
	    !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
		ext4_warning(dir->i_sb, "Root failed checksum");
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}
	set_buffer_verified(bh);

	entries = (struct dx_entry *) (((char *)&root->info) +
				       root->info.info_length);

	if (dx_get_limit(entries) != dx_root_limit(dir,
						   root->info.info_length)) {
		ext4_warning(dir->i_sb, "dx entry: limit != root limit");
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}

	dxtrace(printk("Look up %x", hash));
	while (1)
	{
		count = dx_get_count(entries);
		if (!count || count > dx_get_limit(entries)) {
			ext4_warning(dir->i_sb,
				     "dx entry: no count or count > limit");
			brelse(bh);
			*err = ERR_BAD_DX_DIR;
			goto fail2;
		}

		p = entries + 1;
		q = entries + count - 1;
		while (p <= q)
		{
			m = p + (q - p)/2;
			dxtrace(printk("."));
			if (dx_get_hash(m) > hash)
				q = m - 1;
			else
				p = m + 1;
		}

		if (0) // linear search cross check
		{
			unsigned n = count - 1;
			at = entries;
			while (n--)
			{
				dxtrace(printk(","));
				if (dx_get_hash(++at) > hash)
				{
					at--;
					break;
				}
			}
			assert (at == p - 1);
		}

		at = p - 1;
		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
		frame->bh = bh;
		frame->entries = entries;
		frame->at = at;
		if (!indirect--) return frame;
		if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
			goto fail2;
		at = entries = ((struct dx_node *) bh->b_data)->entries;

		if (!buffer_verified(bh) &&
		    !ext4_dx_csum_verify(dir,
					 (struct ext4_dir_entry *)bh->b_data)) {
			ext4_warning(dir->i_sb, "Node failed checksum");
			brelse(bh);
			*err = ERR_BAD_DX_DIR;
			goto fail;
		}
		set_buffer_verified(bh);

		if (dx_get_limit(entries) != dx_node_limit (dir)) {
			ext4_warning(dir->i_sb,
				     "dx entry: limit != node limit");
			brelse(bh);
			*err = ERR_BAD_DX_DIR;
			goto fail2;
		}
		frame++;
		frame->bh = NULL;
	}
fail2:
	while (frame >= frame_in) {
		brelse(frame->bh);
		frame--;
	}
fail:
	if (*err == ERR_BAD_DX_DIR)
		ext4_warning(dir->i_sb,
			     "Corrupt dir inode %lu, running e2fsck is "
			     "recommended.", dir->i_ino);
	return NULL;
}

static void dx_release (struct dx_frame *frames)
{
	if (frames[0].bh == NULL)
		return;

	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
		brelse(frames[1].bh);
	brelse(frames[0].bh);
}

/*
 * This function increments the frame pointer to search the next leaf
 * block, and reads in the necessary intervening nodes if the search
 * should be necessary.  Whether or not the search is necessary is
 * controlled by the hash parameter.  If the hash value is even, then
 * the search is only continued if the next block starts with that
 * hash value.  This is used if we are searching for a specific file.
 *
 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
 *
 * This function returns 1 if the caller should continue to search,
 * or 0 if it should not.  If there is an error reading one of the
 * index blocks, it will return a negative error code.
 *
 * If start_hash is non-null, it will be filled in with the starting
 * hash of the next page.
 */
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
				 struct dx_frame *frame,
				 struct dx_frame *frames,
				 __u32 *start_hash)
{
	struct dx_frame *p;
	struct buffer_head *bh;
	int err, num_frames = 0;
	__u32 bhash;

	p = frame;
	/*
	 * Find the next leaf page by incrementing the frame pointer.
	 * If we run out of entries in the interior node, loop around and
	 * increment pointer in the parent node.  When we break out of
	 * this loop, num_frames indicates the number of interior
	 * nodes need to be read.
	 */
	while (1) {
		if (++(p->at) < p->entries + dx_get_count(p->entries))
			break;
		if (p == frames)
			return 0;
		num_frames++;
		p--;
	}

	/*
	 * If the hash is 1, then continue only if the next page has a
	 * continuation hash of any value.  This is used for readdir
	 * handling.  Otherwise, check to see if the hash matches the
	 * desired continuation hash.  If it doesn't, return since
	 * there's no point to read in the successive index pages.
	 */
	bhash = dx_get_hash(p->at);
	if (start_hash)
		*start_hash = bhash;
	if ((hash & 1) == 0) {
		if ((bhash & ~1) != hash)
			return 0;
	}
	/*
	 * If the hash is HASH_NB_ALWAYS, we always go to the next
	 * block so no check is necessary
	 */
	while (num_frames--) {
		if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
				      0, &err)))
			return err; /* Failure */

		if (!buffer_verified(bh) &&
		    !ext4_dx_csum_verify(dir,
					 (struct ext4_dir_entry *)bh->b_data)) {
			ext4_warning(dir->i_sb, "Node failed checksum");
			return -EIO;
		}
		set_buffer_verified(bh);

		p++;
		brelse(p->bh);
		p->bh = bh;
		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
	}
	return 1;
}


/*
 * This function fills a red-black tree with information from a
 * directory block.  It returns the number of directory entries loaded
 * into the tree.  If there is an error it is returned in err.
 */
static int htree_dirblock_to_tree(struct file *dir_file,
				  struct inode *dir, ext4_lblk_t block,
				  struct dx_hash_info *hinfo,
				  __u32 start_hash, __u32 start_minor_hash)
{
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de, *top;
	int err, count = 0;

	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
							(unsigned long)block));
	if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
		return err;

	if (!buffer_verified(bh) &&
	    !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
		return -EIO;
	set_buffer_verified(bh);

	de = (struct ext4_dir_entry_2 *) bh->b_data;
	top = (struct ext4_dir_entry_2 *) ((char *) de +
					   dir->i_sb->s_blocksize -
					   EXT4_DIR_REC_LEN(0));
	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
		if (ext4_check_dir_entry(dir, NULL, de, bh,
				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
					 + ((char *)de - bh->b_data))) {
			/* On error, skip the f_pos to the next block. */
			dir_file->f_pos = (dir_file->f_pos |
					(dir->i_sb->s_blocksize - 1)) + 1;
			brelse(bh);
			return count;
		}
		ext4fs_dirhash(de->name, de->name_len, hinfo);
		if ((hinfo->hash < start_hash) ||
		    ((hinfo->hash == start_hash) &&
		     (hinfo->minor_hash < start_minor_hash)))
			continue;
		if (de->inode == 0)
			continue;
		if ((err = ext4_htree_store_dirent(dir_file,
				   hinfo->hash, hinfo->minor_hash, de)) != 0) {
			brelse(bh);
			return err;
		}
		count++;
	}
	brelse(bh);
	return count;
}


/*
 * This function fills a red-black tree with information from a
 * directory.  We start scanning the directory in hash order, starting
 * at start_hash and start_minor_hash.
 *
 * This function returns the number of entries inserted into the tree,
 * or a negative error code.
 */
int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
			 __u32 start_minor_hash, __u32 *next_hash)
{
	struct dx_hash_info hinfo;
	struct ext4_dir_entry_2 *de;
	struct dx_frame frames[2], *frame;
	struct inode *dir;
	ext4_lblk_t block;
	int count = 0;
	int ret, err;
	__u32 hashval;

	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
		       start_hash, start_minor_hash));
	dir = dir_file->f_path.dentry->d_inode;
	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
		if (hinfo.hash_version <= DX_HASH_TEA)
			hinfo.hash_version +=
				EXT4_SB(dir->i_sb)->s_hash_unsigned;
		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
					       start_hash, start_minor_hash);
		*next_hash = ~0;
		return count;
	}
	hinfo.hash = start_hash;
	hinfo.minor_hash = 0;
	frame = dx_probe(NULL, dir, &hinfo, frames, &err);
	if (!frame)
		return err;

	/* Add '.' and '..' from the htree header */
	if (!start_hash && !start_minor_hash) {
		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
		if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
			goto errout;
		count++;
	}
	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
		de = ext4_next_entry(de, dir->i_sb->s_blocksize);
		if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
			goto errout;
		count++;
	}

	while (1) {
		block = dx_get_block(frame->at);
		ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
					     start_hash, start_minor_hash);
		if (ret < 0) {
			err = ret;
			goto errout;
		}
		count += ret;
		hashval = ~0;
		ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
					    frame, frames, &hashval);
		*next_hash = hashval;
		if (ret < 0) {
			err = ret;
			goto errout;
		}
		/*
		 * Stop if:  (a) there are no more entries, or
		 * (b) we have inserted at least one entry and the
		 * next hash value is not a continuation
		 */
		if ((ret == 0) ||
		    (count && ((hashval & 1) == 0)))
			break;
	}
	dx_release(frames);
	dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
		       "next hash: %x\n", count, *next_hash));
	return count;
errout:
	dx_release(frames);
	return (err);
}


/*
 * Directory block splitting, compacting
 */

/*
 * Create map of hash values, offsets, and sizes, stored at end of block.
 * Returns number of entries mapped.
 */
static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
		       struct dx_hash_info *hinfo,
		       struct dx_map_entry *map_tail)
{
	int count = 0;
	char *base = (char *) de;
	struct dx_hash_info h = *hinfo;

	while ((char *) de < base + blocksize) {
		if (de->name_len && de->inode) {
			ext4fs_dirhash(de->name, de->name_len, &h);
			map_tail--;
			map_tail->hash = h.hash;
			map_tail->offs = ((char *) de - base)>>2;
			map_tail->size = le16_to_cpu(de->rec_len);
			count++;
			cond_resched();
		}
		/* XXX: do we need to check rec_len == 0 case? -Chris */
		de = ext4_next_entry(de, blocksize);
	}
	return count;
}

/* Sort map by hash value */
static void dx_sort_map (struct dx_map_entry *map, unsigned count)
{
	struct dx_map_entry *p, *q, *top = map + count - 1;
	int more;
	/* Combsort until bubble sort doesn't suck */
	while (count > 2) {
		count = count*10/13;
		if (count - 9 < 2) /* 9, 10 -> 11 */
			count = 11;
		for (p = top, q = p - count; q >= map; p--, q--)
			if (p->hash < q->hash)
				swap(*p, *q);
	}
	/* Garden variety bubble sort */
	do {
		more = 0;
		q = top;
		while (q-- > map) {
			if (q[1].hash >= q[0].hash)
				continue;
			swap(*(q+1), *q);
			more = 1;
		}
	} while(more);
}

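/* Insert a new hash/block pair into the index node tracked by @frame. */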
static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
{
	struct dx_entry *entries = frame->entries;
	struct dx_entry *old = frame->at, *new = old + 1;
	int count = dx_get_count(entries);

	assert(count < dx_get_limit(entries));
	assert(old < entries + count);
	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
	dx_set_hash(new, hash);
	dx_set_block(new, block);
	dx_set_count(entries, count + 1);
}

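/*
 * Drop the in-inode index flag if the filesystem no longer advertises the
 * dir_index feature.
 */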
static void ext4_update_dx_flag(struct inode *inode)
{
	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
				     EXT4_FEATURE_COMPAT_DIR_INDEX))
		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}

/*
 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
 *
 * `len <= EXT4_NAME_LEN' is guaranteed by caller.
 * `de != NULL' is guaranteed by caller.
 */
static inline int ext4_match (int len, const char * const name,
			      struct ext4_dir_entry_2 * de)
{
	if (len != de->name_len)
		return 0;
	if (!de->inode)
		return 0;
	return !memcmp(name, de->name, len);
}

/*
 * Returns 0 if not found, -1 on failure, and 1 on success
 */
static inline int search_dirblock(struct buffer_head *bh,
				  struct inode *dir,
				  const struct qstr *d_name,
				  unsigned int offset,
				  struct ext4_dir_entry_2 ** res_dir)
{
	struct ext4_dir_entry_2 * de;
	char * dlimit;
	int de_len;
	const char *name = d_name->name;
	int namelen = d_name->len;

	de = (struct ext4_dir_entry_2 *) bh->b_data;
	dlimit = bh->b_data + dir->i_sb->s_blocksize;
	while ((char *) de < dlimit) {
		/* this code is executed quadratically often */
		/* do minimal checking `by hand' */

		if ((char *) de + namelen <= dlimit &&
		    ext4_match (namelen, name, de)) {
			/* found a match - just to be sure, do a full check */
			if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
				return -1;
			*res_dir = de;
			return 1;
		}
		/* prevent looping on a bad block */
		de_len = ext4_rec_len_from_disk(de->rec_len,
						dir->i_sb->s_blocksize);
		if (de_len <= 0)
			return -1;
		offset += de_len;
		de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
	}
	return 0;
}


/*
 *	ext4_find_entry()
 *
 * finds an entry in the specified directory with the wanted name. It
 * returns the cache buffer in which the entry was found, and the entry
 * itself (as a parameter - res_dir). It does NOT read the inode of the
 * entry - you'll have to do that yourself if you want to.
 *
 * The returned buffer_head has ->b_count elevated.  The caller is expected
 * to brelse() it when appropriate.
 */
static struct buffer_head * ext4_find_entry (struct inode *dir,
					const struct qstr *d_name,
					struct ext4_dir_entry_2 ** res_dir)
{
	struct super_block *sb;
	struct buffer_head *bh_use[NAMEI_RA_SIZE];
	struct buffer_head *bh, *ret = NULL;
	ext4_lblk_t start, block, b;
	const u8 *name = d_name->name;
	int ra_max = 0;		/* Number of bh's in the readahead
				   buffer, bh_use[] */
	int ra_ptr = 0;		/* Current index into readahead
				   buffer */
	int num = 0;
	ext4_lblk_t  nblocks;
	int i, err;
	int namelen;

	*res_dir = NULL;
	sb = dir->i_sb;
	namelen = d_name->len;
	if (namelen > EXT4_NAME_LEN)
		return NULL;
	if ((namelen <= 2) && (name[0] == '.') &&
	    (name[1] == '.' || name[1] == '\0')) {
		/*
		 * "." or ".." will only be in the first block
		 * NFS may look up ".."; "." should be handled by the VFS
		 */
		block = start = 0;
		nblocks = 1;
		goto restart;
	}
	if (is_dx(dir)) {
		bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
		/*
		 * On success, or if the error was file not found,
		 * return.  Otherwise, fall back to doing a search the
		 * old fashioned way.
		 */
		if (bh || (err != ERR_BAD_DX_DIR))
			return bh;
		dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
			       "falling back\n"));
	}
	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
	start = EXT4_I(dir)->i_dir_start_lookup;
	if (start >= nblocks)
		start = 0;
	block = start;
restart:
	do {
		/*
		 * We deal with the read-ahead logic here.
		 */
		if (ra_ptr >= ra_max) {
			/* Refill the readahead buffer */
			ra_ptr = 0;
			b = block;
			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
				/*
				 * Terminate if we reach the end of the
				 * directory and must wrap, or if our
				 * search has finished at this block.
				 */
				if (b >= nblocks || (num && block == start)) {
					bh_use[ra_max] = NULL;
					break;
				}
				num++;
				bh = ext4_getblk(NULL, dir, b++, 0, &err);
				bh_use[ra_max] = bh;
				if (bh)
					ll_rw_block(READ | REQ_META | REQ_PRIO,
						    1, &bh);
			}
		}
		if ((bh = bh_use[ra_ptr++]) == NULL)
			goto next;
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			/* read error, skip block & hope for the best */
			EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
					 (unsigned long) block);
			brelse(bh);
			goto next;
		}
		if (!buffer_verified(bh) &&
		    !ext4_dirent_csum_verify(dir,
				(struct ext4_dir_entry *)bh->b_data)) {
			EXT4_ERROR_INODE(dir, "checksumming directory "
					 "block %lu", (unsigned long)block);
			brelse(bh);
			goto next;
		}
		set_buffer_verified(bh);
		i = search_dirblock(bh, dir, d_name,
			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
		if (i == 1) {
			EXT4_I(dir)->i_dir_start_lookup = block;
			ret = bh;
			goto cleanup_and_exit;
		} else {
			brelse(bh);
			if (i < 0)
				goto cleanup_and_exit;
		}
	next:
		if (++block >= nblocks)
			block = 0;
	} while (block != start);

	/*
	 * If the directory has grown while we were searching, then
	 * search the last part of the directory before giving up.
	 */
	block = nblocks;
	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
	if (block < nblocks) {
		start = 0;
		goto restart;
	}

cleanup_and_exit:
	/* Clean up the read-ahead blocks */
	for (; ra_ptr < ra_max; ra_ptr++)
		brelse(bh_use[ra_ptr]);
	return ret;
}

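/*
 * Hashed lookup: walk the index to the candidate leaf block(s) and search
 * each one with search_dirblock() until the entry is found or the hash
 * chain is exhausted.
 */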
static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
		       struct ext4_dir_entry_2 **res_dir, int *err)
{
	struct super_block * sb = dir->i_sb;
	struct dx_hash_info	hinfo;
	struct dx_frame frames[2], *frame;
	struct buffer_head *bh;
	ext4_lblk_t block;
	int retval;

	if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
		return NULL;
	do {
		block = dx_get_block(frame->at);
		if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
			goto errout;

		if (!buffer_verified(bh) &&
		    !ext4_dirent_csum_verify(dir,
				(struct ext4_dir_entry *)bh->b_data)) {
			EXT4_ERROR_INODE(dir, "checksumming directory "
					 "block %lu", (unsigned long)block);
			brelse(bh);
			*err = -EIO;
			goto errout;
		}
		set_buffer_verified(bh);
		retval = search_dirblock(bh, dir, d_name,
					 block << EXT4_BLOCK_SIZE_BITS(sb),
					 res_dir);
		if (retval == 1) { 	/* Success! */
			dx_release(frames);
			return bh;
		}
		brelse(bh);
		if (retval == -1) {
			*err = ERR_BAD_DX_DIR;
			goto errout;
		}

		/* Check to see if we should continue to search */
		retval = ext4_htree_next_block(dir, hinfo.hash, frame,
					       frames, NULL);
		if (retval < 0) {
			ext4_warning(sb,
			     "error reading index page in directory #%lu",
			     dir->i_ino);
			*err = retval;
			goto errout;
		}
	} while (retval == 1);

	*err = -ENOENT;
errout:
	dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
	dx_release (frames);
	return NULL;
}

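/*
 * ->lookup() method: find the directory entry, validate the inode number
 * and splice the result into the dcache.
 */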
static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
	struct inode *inode;
	struct ext4_dir_entry_2 *de;
	struct buffer_head *bh;

	if (dentry->d_name.len > EXT4_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	bh = ext4_find_entry(dir, &dentry->d_name, &de);
	inode = NULL;
	if (bh) {
		__u32 ino = le32_to_cpu(de->inode);
		brelse(bh);
		if (!ext4_valid_inum(dir->i_sb, ino)) {
			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
			return ERR_PTR(-EIO);
		}
		inode = ext4_iget(dir->i_sb, ino);
		if (inode == ERR_PTR(-ESTALE)) {
			EXT4_ERROR_INODE(dir,
					 "deleted inode referenced: %u",
					 ino);
			return ERR_PTR(-EIO);
		}
	}
	return d_splice_alias(inode, dentry);
}


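/* Used by the NFS export code: look up ".." to get a dentry for the parent. */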
struct dentry *ext4_get_parent(struct dentry *child)
{
	__u32 ino;
	static const struct qstr dotdot = {
		.name = "..",
		.len = 2,
	};
	struct ext4_dir_entry_2 * de;
	struct buffer_head *bh;

	bh = ext4_find_entry(child->d_inode, &dotdot, &de);
	if (!bh)
		return ERR_PTR(-ENOENT);
	ino = le32_to_cpu(de->inode);
	brelse(bh);

	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
		EXT4_ERROR_INODE(child->d_inode,
				 "bad parent inode number: %u", ino);
		return ERR_PTR(-EIO);
	}

	return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
}

#define S_SHIFT 12
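/* Map S_IFMT mode bits to the on-disk EXT4_FT_* dirent file types. */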
static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
};

static inline void ext4_set_de_type(struct super_block *sb,
				struct ext4_dir_entry_2 *de,
				umode_t mode) {
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}

/*
 * Move count entries from end of map between two memory locations.
 * Returns pointer to last entry moved.
 */
static struct ext4_dir_entry_2 *
dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
		unsigned blocksize)
{
	unsigned rec_len = 0;

	while (count--) {
		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
						(from + (map->offs<<2));
		rec_len = EXT4_DIR_REC_LEN(de->name_len);
		memcpy (to, de, rec_len);
		((struct ext4_dir_entry_2 *) to)->rec_len =
				ext4_rec_len_to_disk(rec_len, blocksize);
		de->inode = 0;
		map++;
		to += rec_len;
	}
	return (struct ext4_dir_entry_2 *) (to - rec_len);
}

/*
 * Compact each dir entry in the range to the minimal rec_len.
 * Returns pointer to last entry in range.
 */
static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
{
	struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
	unsigned rec_len = 0;

	prev = to = de;
	while ((char*)de < base + blocksize) {
		next = ext4_next_entry(de, blocksize);
		if (de->inode && de->name_len) {
			rec_len = EXT4_DIR_REC_LEN(de->name_len);
			if (de > to)
				memmove(to, de, rec_len);
			to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
			prev = to;
			to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
		}
		de = next;
	}
	return prev;
}

/*
 * Split a full leaf block to make room for a new dir entry.
 * Allocate a new block, and move entries so that they are approx. equally full.
 * Returns pointer to de in block into which the new entry will be inserted.
 */
static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
			struct buffer_head **bh,struct dx_frame *frame,
			struct dx_hash_info *hinfo, int *error)
{
	unsigned blocksize = dir->i_sb->s_blocksize;
	unsigned count, continued;
	struct buffer_head *bh2;
	ext4_lblk_t newblock;
	u32 hash2;
	struct dx_map_entry *map;
	char *data1 = (*bh)->b_data, *data2;
	unsigned split, move, size;
	struct ext4_dir_entry_2 *de = NULL, *de2;
	struct ext4_dir_entry_tail *t;
	int	csum_size = 0;
	int	err = 0, i;

	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		csum_size = sizeof(struct ext4_dir_entry_tail);

	bh2 = ext4_append (handle, dir, &newblock, &err);
	if (!(bh2)) {
		brelse(*bh);
		*bh = NULL;
		goto errout;
	}

	BUFFER_TRACE(*bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, *bh);
	if (err)
		goto journal_error;

	BUFFER_TRACE(frame->bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, frame->bh);
	if (err)
		goto journal_error;

	data2 = bh2->b_data;

	/* create map in the end of data2 block */
	map = (struct dx_map_entry *) (data2 + blocksize);
	count = dx_make_map((struct ext4_dir_entry_2 *) data1,
			     blocksize, hinfo, map);
	map -= count;
	dx_sort_map(map, count);
	/* Split the existing block in the middle, size-wise */
	size = 0;
	move = 0;
	for (i = count-1; i >= 0; i--) {
		/* is more than half of this entry in 2nd half of the block? */
		if (size + map[i].size/2 > blocksize/2)
			break;
		size += map[i].size;
		move++;
	}
	/* map index at which we will split */
	split = count - move;
	hash2 = map[split].hash;
	continued = hash2 == map[split - 1].hash;
	dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
			(unsigned long)dx_get_block(frame->at),
					hash2, split, count-split));

	/* Fancy dance to stay within two buffers */
	de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
	de = dx_pack_dirents(data1, blocksize);
	de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
					   (char *) de,
					   blocksize);
	de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
					    (char *) de2,
					    blocksize);
	if (csum_size) {
		t = EXT4_DIRENT_TAIL(data2, blocksize);
		initialize_dirent_tail(t, blocksize);

		t = EXT4_DIRENT_TAIL(data1, blocksize);
		initialize_dirent_tail(t, blocksize);
	}

	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));

	/* Which block gets the new entry? */
	if (hinfo->hash >= hash2)
	{
		swap(*bh, bh2);
		de = de2;
	}
	dx_insert_block(frame, hash2 + continued, newblock);
	err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
	if (err)
		goto journal_error;
	err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
	if (err)
		goto journal_error;
	brelse(bh2);
	dxtrace(dx_show_index("frame", frame->entries));
	return de;

journal_error:
	brelse(*bh);
	brelse(bh2);
	*bh = NULL;
	ext4_std_error(dir->i_sb, err);
errout:
	*error = err;
	return NULL;
}

/*
 * Add a new entry into a directory (leaf) block.  If de is non-NULL,
 * it points to a directory entry which is guaranteed to be large
 * enough for new directory entry.  If de is NULL, then
 * add_dirent_to_buf will attempt to search the directory block for
 * space.  It will return -ENOSPC if no space is available, and -EIO
 * and -EEXIST if directory entry already exists.
 */
static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
			     struct inode *inode, struct ext4_dir_entry_2 *de,
			     struct buffer_head *bh)
{
	struct inode	*dir = dentry->d_parent->d_inode;
	const char	*name = dentry->d_name.name;
	int		namelen = dentry->d_name.len;
	unsigned int	offset = 0;
	unsigned int	blocksize = dir->i_sb->s_blocksize;
	unsigned short	reclen;
	int		nlen, rlen, err;
	char		*top;
	int		csum_size = 0;

	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		csum_size = sizeof(struct ext4_dir_entry_tail);

	reclen = EXT4_DIR_REC_LEN(namelen);
	if (!de) {
		de = (struct ext4_dir_entry_2 *)bh->b_data;
		top = bh->b_data + (blocksize - csum_size) - reclen;
		while ((char *) de <= top) {
			if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
				return -EIO;
			if (ext4_match(namelen, name, de))
				return -EEXIST;
			nlen = EXT4_DIR_REC_LEN(de->name_len);
			rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
			if ((de->inode? rlen - nlen: rlen) >= reclen)
				break;
			de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
			offset += rlen;
		}
		if ((char *) de > top)
			return -ENOSPC;
	}
	BUFFER_TRACE(bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, bh);
	if (err) {
		ext4_std_error(dir->i_sb, err);
		return err;
	}

	/* By now the buffer is marked for journaling */
	nlen = EXT4_DIR_REC_LEN(de->name_len);
	rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
	if (de->inode) {
		struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
		de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
		de = de1;
	}
	de->file_type = EXT4_FT_UNKNOWN;
	de->inode = cpu_to_le32(inode->i_ino);
	ext4_set_de_type(dir->i_sb, de, inode->i_mode);
	de->name_len = namelen;
	memcpy(de->name, name, namelen);
	/*
	 * XXX shouldn't update any times until successful
	 * completion of syscall, but too many callers depend
	 * on this.
	 *
	 * XXX similarly, too many callers depend on
	 * ext4_new_inode() setting the times, but error
	 * recovery deletes the inode, so the worst that can
	 * happen is that the times are slightly out of date
	 * and/or different from the directory change time.
	 */
	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
	ext4_update_dx_flag(dir);
	dir->i_version++;
	ext4_mark_inode_dirty(handle, dir);
	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
	if (err)
		ext4_std_error(dir->i_sb, err);
	return 0;
}

/*
 * This converts a one block unindexed directory to a 3 block indexed
 * directory, and adds the dentry to the indexed directory.
 */
static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
			    struct inode *inode, struct buffer_head *bh)
{
	struct inode	*dir = dentry->d_parent->d_inode;
	const char	*name = dentry->d_name.name;
	int		namelen = dentry->d_name.len;
	struct buffer_head *bh2;
	struct dx_root	*root;
	struct dx_frame	frames[2], *frame;
	struct dx_entry *entries;
	struct ext4_dir_entry_2	*de, *de2;
	struct ext4_dir_entry_tail *t;
	char		*data1, *top;
	unsigned	len;
	int		retval;
	unsigned	blocksize;
	struct dx_hash_info hinfo;
	ext4_lblk_t  block;
	struct fake_dirent *fde;
	int		csum_size = 0;

	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		csum_size = sizeof(struct ext4_dir_entry_tail);

	blocksize =  dir->i_sb->s_blocksize;
	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
	retval = ext4_journal_get_write_access(handle, bh);
	if (retval) {
		ext4_std_error(dir->i_sb, retval);
		brelse(bh);
		return retval;
	}
	root = (struct dx_root *) bh->b_data;

	/* The 0th block becomes the root, move the dirents out */
	fde = &root->dotdot;
	de = (struct ext4_dir_entry_2 *)((char *)fde +
		ext4_rec_len_from_disk(fde->rec_len, blocksize));
	if ((char *) de >= (((char *) root) + blocksize)) {
		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
		brelse(bh);
		return -EIO;
	}
	len = ((char *) root) + (blocksize - csum_size) - (char *) de;

	/* Allocate new block for the 0th block's dirents */
	bh2 = ext4_append(handle, dir, &block, &retval);
	if (!(bh2)) {
		brelse(bh);
		return retval;
	}
	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
	data1 = bh2->b_data;

	memcpy (data1, de, len);
	de = (struct ext4_dir_entry_2 *) data1;
	top = data1 + len;
	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
		de = de2;
	de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
					   (char *) de,
					   blocksize);

	if (csum_size) {
		t = EXT4_DIRENT_TAIL(data1, blocksize);
		initialize_dirent_tail(t, blocksize);
	}

	/* Initialize the root; the dot dirents already exist */
	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
					   blocksize);
	memset (&root->info, 0, sizeof(root->info));
	root->info.info_length = sizeof(root->info);
	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
	entries = root->entries;
	dx_set_block(entries, 1);
	dx_set_count(entries, 1);
	dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));

	/* Initialize as for dx_probe */
	hinfo.hash_version = root->info.hash_version;
	if (hinfo.hash_version <= DX_HASH_TEA)
		hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
	ext4fs_dirhash(name, namelen, &hinfo);
	frame = frames;
	frame->entries = entries;
	frame->at = entries;
	frame->bh = bh;
	bh = bh2;

	ext4_handle_dirty_dx_node(handle, dir, frame->bh);
	ext4_handle_dirty_dirent_node(handle, dir, bh);

	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
	if (!de) {
		/*
		 * Even if the block split failed, we have to properly write
		 * out all the changes we did so far. Otherwise we can end up
		 * with corrupted filesystem.
		 */
		ext4_mark_inode_dirty(handle, dir);
		dx_release(frames);
		return retval;
	}
	dx_release(frames);

	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
	brelse(bh);
	return retval;
}

/*
 *	ext4_add_entry()
 *
 * adds a file entry to the specified directory, using the same
 * semantics as ext4_find_entry(). It returns NULL if it failed.
 *
 * NOTE!! The inode part of 'de' is left at 0 - which means you
 * may not sleep between calling this and putting something into
 * the entry, as someone else might have used it while you slept.
 */
1769 1770
static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
			  struct inode *inode)
1771 1772
{
	struct inode *dir = dentry->d_parent->d_inode;
1773
	struct buffer_head *bh;
1774
	struct ext4_dir_entry_2 *de;
1775
	struct ext4_dir_entry_tail *t;
1776
	struct super_block *sb;
1777 1778 1779
	int	retval;
	int	dx_fallback=0;
	unsigned blocksize;
A
Aneesh Kumar K.V 已提交
1780
	ext4_lblk_t block, blocks;
1781 1782 1783 1784 1785
	int	csum_size = 0;

	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		csum_size = sizeof(struct ext4_dir_entry_tail);
1786 1787 1788 1789 1790 1791

	sb = dir->i_sb;
	blocksize = sb->s_blocksize;
	if (!dentry->d_name.len)
		return -EINVAL;
	if (is_dx(dir)) {
1792
		retval = ext4_dx_add_entry(handle, dentry, inode);
1793 1794
		if (!retval || (retval != ERR_BAD_DX_DIR))
			return retval;
1795
		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1796
		dx_fallback++;
1797
		ext4_mark_inode_dirty(handle, dir);
1798 1799
	}
	blocks = dir->i_size >> sb->s_blocksize_bits;
1800
	for (block = 0; block < blocks; block++) {
1801
		bh = ext4_bread(handle, dir, block, 0, &retval);
1802 1803
		if(!bh)
			return retval;
1804 1805 1806 1807 1808
		if (!buffer_verified(bh) &&
		    !ext4_dirent_csum_verify(dir,
				(struct ext4_dir_entry *)bh->b_data))
			return -EIO;
		set_buffer_verified(bh);
1809
		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1810 1811
		if (retval != -ENOSPC) {
			brelse(bh);
1812
			return retval;
1813
		}
1814 1815

		if (blocks == 1 && !dx_fallback &&
1816 1817
		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
			return make_indexed_dir(handle, dentry, inode, bh);
1818 1819
		brelse(bh);
	}
1820
	bh = ext4_append(handle, dir, &block, &retval);
1821 1822
	if (!bh)
		return retval;
1823
	de = (struct ext4_dir_entry_2 *) bh->b_data;
1824
	de->inode = 0;
1825 1826 1827 1828 1829 1830 1831
	de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);

	if (csum_size) {
		t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
		initialize_dirent_tail(t, blocksize);
	}

1832 1833
	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
	brelse(bh);
1834 1835
	if (retval == 0)
		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1836
	return retval;
1837 1838 1839 1840 1841
}

/*
 * Returns 0 for success, or a negative error value
 */
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
			     struct inode *inode)
{
	struct dx_frame frames[2], *frame;
	struct dx_entry *entries, *at;
	struct dx_hash_info hinfo;
	struct buffer_head *bh;
	struct inode *dir = dentry->d_parent->d_inode;
	struct super_block *sb = dir->i_sb;
	struct ext4_dir_entry_2 *de;
	int err;

	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
	if (!frame)
		return err;
	entries = frame->entries;
	at = frame->at;

	if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
		goto cleanup;

	if (!buffer_verified(bh) &&
	    !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
		err = -EIO;
		goto journal_error;
	}
	set_buffer_verified(bh);

	BUFFER_TRACE(bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, bh);
	if (err)
		goto journal_error;

	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
	if (err != -ENOSPC)
		goto cleanup;

	/* Block full, should compress but for now just split */
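	/*
	 * Before splitting the leaf we may also have to make room in the
	 * index above it: split a full index node, or add a second tree
	 * level when the root itself is full.  A completely full two-level
	 * tree is reported as ENOSPC.
	 */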
	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
		       dx_get_count(entries), dx_get_limit(entries)));
	/* Need to split index? */
	if (dx_get_count(entries) == dx_get_limit(entries)) {
		ext4_lblk_t newblock;
		unsigned icount = dx_get_count(entries);
		int levels = frame - frames;
		struct dx_entry *entries2;
		struct dx_node *node2;
		struct buffer_head *bh2;

		if (levels && (dx_get_count(frames->entries) ==
			       dx_get_limit(frames->entries))) {
			ext4_warning(sb, "Directory index full!");
			err = -ENOSPC;
			goto cleanup;
		}
		bh2 = ext4_append(handle, dir, &newblock, &err);
		if (!(bh2))
			goto cleanup;
		node2 = (struct dx_node *)(bh2->b_data);
		entries2 = node2->entries;
		memset(&node2->fake, 0, sizeof(struct fake_dirent));
		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
							   sb->s_blocksize);
		BUFFER_TRACE(frame->bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, frame->bh);
		if (err)
			goto journal_error;
		if (levels) {
			unsigned icount1 = icount/2, icount2 = icount - icount1;
			unsigned hash2 = dx_get_hash(entries + icount1);
			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
				       icount1, icount2));

			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
			err = ext4_journal_get_write_access(handle,
							     frames[0].bh);
			if (err)
				goto journal_error;

			memcpy((char *) entries2, (char *) (entries + icount1),
			       icount2 * sizeof(struct dx_entry));
			dx_set_count(entries, icount1);
			dx_set_count(entries2, icount2);
			dx_set_limit(entries2, dx_node_limit(dir));

			/* Which index block gets the new entry? */
			if (at - entries >= icount1) {
				frame->at = at = at - entries - icount1 + entries2;
				frame->entries = entries = entries2;
				swap(frame->bh, bh2);
			}
			dx_insert_block(frames + 0, hash2, newblock);
			dxtrace(dx_show_index("node", frames[1].entries));
			dxtrace(dx_show_index("node",
			       ((struct dx_node *) bh2->b_data)->entries));
			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
			if (err)
				goto journal_error;
			brelse(bh2);
		} else {
			dxtrace(printk(KERN_DEBUG
				       "Creating second level index...\n"));
			memcpy((char *) entries2, (char *) entries,
			       icount * sizeof(struct dx_entry));
			dx_set_limit(entries2, dx_node_limit(dir));

			/* Set up root */
			dx_set_count(entries, 1);
			dx_set_block(entries + 0, newblock);
			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;

			/* Add new access path frame */
			frame = frames + 1;
			frame->at = at = at - entries + entries2;
			frame->entries = entries = entries2;
			frame->bh = bh2;
			err = ext4_journal_get_write_access(handle,
							     frame->bh);
			if (err)
				goto journal_error;
		}
		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
		if (err) {
			ext4_std_error(inode->i_sb, err);
			goto cleanup;
		}
	}
	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
	if (!de)
		goto cleanup;
	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
	goto cleanup;

journal_error:
	ext4_std_error(dir->i_sb, err);
cleanup:
	if (bh)
		brelse(bh);
	dx_release(frames);
	return err;
}

/*
 * ext4_delete_entry deletes a directory entry by merging it with the
 * previous entry
 */
static int ext4_delete_entry(handle_t *handle,
			     struct inode *dir,
			     struct ext4_dir_entry_2 *de_del,
			     struct buffer_head *bh)
{
	struct ext4_dir_entry_2 *de, *pde;
	unsigned int blocksize = dir->i_sb->s_blocksize;
	int csum_size = 0;
	int i, err;

	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		csum_size = sizeof(struct ext4_dir_entry_tail);

	i = 0;
	pde = NULL;
	de = (struct ext4_dir_entry_2 *) bh->b_data;
	while (i < bh->b_size - csum_size) {
		if (ext4_check_dir_entry(dir, NULL, de, bh, i))
			return -EIO;
		if (de == de_del) {
			BUFFER_TRACE(bh, "get_write_access");
			err = ext4_journal_get_write_access(handle, bh);
			if (unlikely(err)) {
				ext4_std_error(dir->i_sb, err);
				return err;
			}
			if (pde)
				pde->rec_len = ext4_rec_len_to_disk(
					ext4_rec_len_from_disk(pde->rec_len,
							       blocksize) +
					ext4_rec_len_from_disk(de->rec_len,
							       blocksize),
					blocksize);
			else
				de->inode = 0;
			dir->i_version++;
			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
			err = ext4_handle_dirty_dirent_node(handle, dir, bh);
			if (unlikely(err)) {
				ext4_std_error(dir->i_sb, err);
				return err;
			}
			return 0;
		}
		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
		pde = de;
		de = ext4_next_entry(de, blocksize);
	}
	return -ENOENT;
}

/*
 * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
 * since this indicates that nlinks count was previously 1.
 */
static void ext4_inc_count(handle_t *handle, struct inode *inode)
{
	inc_nlink(inode);
	if (is_dx(inode) && inode->i_nlink > 1) {
		/* limit is 16-bit i_links_count */
		if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
			set_nlink(inode, 1);
			EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
					      EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
		}
	}
}

/*
 * If a directory had nlink == 1, then we should let it be 1. This indicates
 * directory has >EXT4_LINK_MAX subdirs.
 */
static void ext4_dec_count(handle_t *handle, struct inode *inode)
{
	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
		drop_nlink(inode);
}

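/*
 * Add a directory entry for a non-directory inode and instantiate the
 * dentry; on failure the new inode's link count is dropped and the
 * inode is released.
 */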
static int ext4_add_nondir(handle_t *handle,
		struct dentry *dentry, struct inode *inode)
{
	int err = ext4_add_entry(handle, dentry, inode);
	if (!err) {
		ext4_mark_inode_dirty(handle, inode);
		d_instantiate(dentry, inode);
		unlock_new_inode(inode);
		return 0;
	}
	drop_nlink(inode);
	unlock_new_inode(inode);
	iput(inode);
	return err;
}

/*
 * By the time this is called, we already have created
 * the directory cache entry for the new file, but it
 * is so far negative - it has no inode.
 *
 * If the create succeeds, we fill in the inode information
 * with d_instantiate().
 */
static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		       struct nameidata *nd)
{
	handle_t *handle;
	struct inode *inode;
	int err, retries = 0;

	dquot_initialize(dir);

retry:
	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
	err = PTR_ERR(inode);
	if (!IS_ERR(inode)) {
		inode->i_op = &ext4_file_inode_operations;
		inode->i_fop = &ext4_file_operations;
		ext4_set_aops(inode);
		err = ext4_add_nondir(handle, dentry, inode);
	}
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

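/*
 * Create a special inode (device node, FIFO or socket) and add it to
 * the directory.
 */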
static int ext4_mknod(struct inode *dir, struct dentry *dentry,
		      umode_t mode, dev_t rdev)
{
	handle_t *handle;
	struct inode *inode;
	int err, retries = 0;

	if (!new_valid_dev(rdev))
		return -EINVAL;

	dquot_initialize(dir);

retry:
	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
	err = PTR_ERR(inode);
	if (!IS_ERR(inode)) {
		init_special_inode(inode, inode->i_mode, rdev);
#ifdef CONFIG_EXT4_FS_XATTR
		inode->i_op = &ext4_special_inode_operations;
#endif
		err = ext4_add_nondir(handle, dentry, inode);
	}
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

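/*
 * Create a directory: allocate the inode, write out its first block with
 * the "." and ".." entries, then link it into the parent directory.
 */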
static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	handle_t *handle;
	struct inode *inode;
	struct buffer_head *dir_block = NULL;
	struct ext4_dir_entry_2 *de;
	struct ext4_dir_entry_tail *t;
	unsigned int blocksize = dir->i_sb->s_blocksize;
	int csum_size = 0;
	int err, retries = 0;

	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		csum_size = sizeof(struct ext4_dir_entry_tail);

	if (EXT4_DIR_LINK_MAX(dir))
		return -EMLINK;

	dquot_initialize(dir);

retry:
	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
			       &dentry->d_name, 0, NULL);
	err = PTR_ERR(inode);
	if (IS_ERR(inode))
		goto out_stop;

	inode->i_op = &ext4_dir_inode_operations;
	inode->i_fop = &ext4_dir_operations;
	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
	dir_block = ext4_bread(handle, inode, 0, 1, &err);
	if (!dir_block)
		goto out_clear_inode;
	BUFFER_TRACE(dir_block, "get_write_access");
	err = ext4_journal_get_write_access(handle, dir_block);
	if (err)
		goto out_clear_inode;
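	/* Fill in the "." and ".." entries of the new directory by hand */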
	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
	de->inode = cpu_to_le32(inode->i_ino);
	de->name_len = 1;
	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
					   blocksize);
	strcpy(de->name, ".");
	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
	de = ext4_next_entry(de, blocksize);
	de->inode = cpu_to_le32(dir->i_ino);
	de->rec_len = ext4_rec_len_to_disk(blocksize -
					   (csum_size + EXT4_DIR_REC_LEN(1)),
					   blocksize);
	de->name_len = 2;
	strcpy(de->name, "..");
	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
	set_nlink(inode, 2);

	if (csum_size) {
		t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
		initialize_dirent_tail(t, blocksize);
	}

	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
	if (err)
		goto out_clear_inode;
	set_buffer_verified(dir_block);
	err = ext4_mark_inode_dirty(handle, inode);
	if (!err)
		err = ext4_add_entry(handle, dentry, inode);
	if (err) {
out_clear_inode:
		clear_nlink(inode);
		unlock_new_inode(inode);
		ext4_mark_inode_dirty(handle, inode);
		iput(inode);
		goto out_stop;
	}
	ext4_inc_count(handle, dir);
	ext4_update_dx_flag(dir);
	err = ext4_mark_inode_dirty(handle, dir);
	if (err)
		goto out_clear_inode;
	d_instantiate(dentry, inode);
	unlock_new_inode(inode);
out_stop:
	brelse(dir_block);
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

/*
 * routine to check that the specified directory is empty (for rmdir)
 */
static int empty_dir(struct inode *inode)
{
	unsigned int offset;
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de, *de1;
	struct super_block *sb;
	int err = 0;

	sb = inode->i_sb;
	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
		if (err)
			EXT4_ERROR_INODE(inode,
				"error %d reading directory lblock 0", err);
		else
			ext4_warning(inode->i_sb,
				     "bad directory (dir #%lu) - no data block",
				     inode->i_ino);
		return 1;
	}
	if (!buffer_verified(bh) &&
	    !ext4_dirent_csum_verify(inode,
			(struct ext4_dir_entry *)bh->b_data)) {
		EXT4_ERROR_INODE(inode, "checksum error reading directory "
				 "lblock 0");
		brelse(bh);
		return -EIO;
	}
	set_buffer_verified(bh);
	de = (struct ext4_dir_entry_2 *) bh->b_data;
	de1 = ext4_next_entry(de, sb->s_blocksize);
	if (le32_to_cpu(de->inode) != inode->i_ino ||
			!le32_to_cpu(de1->inode) ||
			strcmp(".", de->name) ||
			strcmp("..", de1->name)) {
		ext4_warning(inode->i_sb,
			     "bad directory (dir #%lu) - no `.' or `..'",
			     inode->i_ino);
		brelse(bh);
		return 1;
	}
	offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
		 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
	de = ext4_next_entry(de1, sb->s_blocksize);
	while (offset < inode->i_size) {
		if (!bh ||
		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
			unsigned int lblock;
			err = 0;
			brelse(bh);
			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
			bh = ext4_bread(NULL, inode, lblock, 0, &err);
			if (!bh) {
				if (err)
					EXT4_ERROR_INODE(inode,
						"error %d reading directory "
						"lblock %u", err, lblock);
				offset += sb->s_blocksize;
				continue;
			}
			if (!buffer_verified(bh) &&
			    !ext4_dirent_csum_verify(inode,
					(struct ext4_dir_entry *)bh->b_data)) {
				EXT4_ERROR_INODE(inode, "checksum error "
						 "reading directory lblock %u",
						 lblock);
				brelse(bh);
				return -EIO;
			}
			set_buffer_verified(bh);
			de = (struct ext4_dir_entry_2 *) bh->b_data;
		}
		if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
			de = (struct ext4_dir_entry_2 *)(bh->b_data +
							 sb->s_blocksize);
			offset = (offset | (sb->s_blocksize - 1)) + 1;
			continue;
		}
		if (le32_to_cpu(de->inode)) {
			brelse(bh);
			return 0;
		}
		offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
		de = ext4_next_entry(de, sb->s_blocksize);
	}
	brelse(bh);
	return 1;
}

/* ext4_orphan_add() links an unlinked or truncated inode into a list of
 * such inodes, starting at the superblock, in case we crash before the
 * file is closed/deleted, or in case the inode truncate spans multiple
 * transactions and the last transaction is not recovered after a crash.
 *
 * At filesystem recovery time, we walk this list deleting unlinked
 * inodes and truncating linked inodes in ext4_orphan_cleanup().
 */
int ext4_orphan_add(handle_t *handle, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct ext4_iloc iloc;
	int err = 0, rc;

	if (!ext4_handle_valid(handle))
		return 0;

	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
	if (!list_empty(&EXT4_I(inode)->i_orphan))
		goto out_unlock;

	/*
	 * Orphan handling is only valid for files with data blocks
	 * being truncated, or files being unlinked. Note that we either
	 * hold i_mutex, or the inode can not be referenced from outside,
	 * so i_nlink should not be bumped due to race
	 */
	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);

	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
	if (err)
		goto out_unlock;

	err = ext4_reserve_inode_write(handle, inode, &iloc);
	if (err)
		goto out_unlock;
	/*
	 * Due to previous errors inode may be already a part of on-disk
	 * orphan list. If so skip on-disk list modification.
	 */
	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
			goto mem_insert;

	/* Insert this inode at the head of the on-disk orphan list... */
	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
	err = ext4_handle_dirty_super_now(handle, sb);
	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
	if (!err)
		err = rc;

	/* Only add to the head of the in-memory list if all the
	 * previous operations succeeded.  If the orphan_add is going to
	 * fail (possibly taking the journal offline), we can't risk
	 * leaving the inode on the orphan list: stray orphan-list
	 * entries can cause panics at unmount time.
	 *
	 * This is safe: on error we're going to ignore the orphan list
	 * anyway on the next recovery. */
mem_insert:
	if (!err)
		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);

	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
	jbd_debug(4, "orphan inode %lu will point to %d\n",
			inode->i_ino, NEXT_ORPHAN(inode));
out_unlock:
	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
	ext4_std_error(inode->i_sb, err);
	return err;
}

/*
 * ext4_orphan_del() removes an unlinked or truncated inode from the list
 * of such inodes stored on disk, because it is finally being cleaned up.
 */
int ext4_orphan_del(handle_t *handle, struct inode *inode)
{
	struct list_head *prev;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi;
	__u32 ino_next;
	struct ext4_iloc iloc;
	int err = 0;

	/* ext4_handle_valid() assumes a valid handle_t pointer */
	if (handle && !ext4_handle_valid(handle))
		return 0;

	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
	if (list_empty(&ei->i_orphan))
		goto out;

	ino_next = NEXT_ORPHAN(inode);
	prev = ei->i_orphan.prev;
	sbi = EXT4_SB(inode->i_sb);

	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);

	list_del_init(&ei->i_orphan);

	/* If we're on an error path, we may not have a valid
	 * transaction handle with which to update the orphan list on
	 * disk, but we still need to remove the inode from the linked
	 * list in memory. */
	if (sbi->s_journal && !handle)
		goto out;

	err = ext4_reserve_inode_write(handle, inode, &iloc);
	if (err)
		goto out_err;

	if (prev == &sbi->s_orphan) {
		jbd_debug(4, "superblock will point to %u\n", ino_next);
		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
		if (err)
			goto out_brelse;
		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
		err = ext4_handle_dirty_super_now(handle, inode->i_sb);
	} else {
		struct ext4_iloc iloc2;
		struct inode *i_prev =
			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;

		jbd_debug(4, "orphan inode %lu will point to %u\n",
			  i_prev->i_ino, ino_next);
		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
		if (err)
			goto out_brelse;
		NEXT_ORPHAN(i_prev) = ino_next;
		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
	}
	if (err)
		goto out_brelse;
	NEXT_ORPHAN(inode) = 0;
	err = ext4_mark_iloc_dirty(handle, inode, &iloc);

out_err:
	ext4_std_error(inode->i_sb, err);
out:
	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
	return err;

out_brelse:
	brelse(iloc.bh);
	goto out_err;
}

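/*
 * Remove a directory; it must be empty apart from "." and "..".
 */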
static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
{
	int retval;
	struct inode *inode;
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de;
	handle_t *handle;

	/* Initialize quotas before so that eventual writes go in
	 * separate transaction */
	dquot_initialize(dir);
	dquot_initialize(dentry->d_inode);

	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	retval = -ENOENT;
	bh = ext4_find_entry(dir, &dentry->d_name, &de);
	if (!bh)
		goto end_rmdir;

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = dentry->d_inode;

	retval = -EIO;
	if (le32_to_cpu(de->inode) != inode->i_ino)
		goto end_rmdir;

	retval = -ENOTEMPTY;
	if (!empty_dir(inode))
		goto end_rmdir;

	retval = ext4_delete_entry(handle, dir, de, bh);
	if (retval)
		goto end_rmdir;
	if (!EXT4_DIR_LINK_EMPTY(inode))
		ext4_warning(inode->i_sb,
			     "empty directory has too many links (%d)",
			     inode->i_nlink);
	inode->i_version++;
	clear_nlink(inode);
	/* There's no need to set i_disksize: the fact that i_nlink is
	 * zero will ensure that the right thing happens during any
	 * recovery. */
	inode->i_size = 0;
	ext4_orphan_add(handle, inode);
	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);
	ext4_dec_count(handle, dir);
	ext4_update_dx_flag(dir);
	ext4_mark_inode_dirty(handle, dir);

end_rmdir:
	ext4_journal_stop(handle);
	brelse(bh);
	return retval;
}

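/*
 * Remove a file's directory entry and drop its link count; if that was
 * the last link, put the inode on the orphan list until it is released.
 */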
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
	int retval;
	struct inode *inode;
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de;
	handle_t *handle;

	trace_ext4_unlink_enter(dir, dentry);
	/* Initialize quotas before so that eventual writes go
	 * in separate transaction */
	dquot_initialize(dir);
	dquot_initialize(dentry->d_inode);

	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	retval = -ENOENT;
	bh = ext4_find_entry(dir, &dentry->d_name, &de);
	if (!bh)
		goto end_unlink;

	inode = dentry->d_inode;

	retval = -EIO;
	if (le32_to_cpu(de->inode) != inode->i_ino)
		goto end_unlink;

	if (!inode->i_nlink) {
		ext4_warning(inode->i_sb,
			     "Deleting nonexistent file (%lu), %d",
			     inode->i_ino, inode->i_nlink);
		set_nlink(inode, 1);
	}
	retval = ext4_delete_entry(handle, dir, de, bh);
	if (retval)
		goto end_unlink;
	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
	ext4_update_dx_flag(dir);
	ext4_mark_inode_dirty(handle, dir);
	drop_nlink(inode);
	if (!inode->i_nlink)
		ext4_orphan_add(handle, inode);
	inode->i_ctime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);
	retval = 0;

end_unlink:
	ext4_journal_stop(handle);
	brelse(bh);
	trace_ext4_unlink_exit(dentry, retval);
	return retval;
}

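/*
 * Create a symlink.  Short targets are stored directly in the inode
 * ("fast" symlinks); longer targets get a data block via __page_symlink().
 */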
static int ext4_symlink(struct inode *dir,
			struct dentry *dentry, const char *symname)
{
	handle_t *handle;
	struct inode *inode;
	int l, err, retries = 0;
	int credits;

	l = strlen(symname)+1;
	if (l > dir->i_sb->s_blocksize)
		return -ENAMETOOLONG;

	dquot_initialize(dir);

	if (l > EXT4_N_BLOCKS * 4) {
		/*
		 * For non-fast symlinks, we just allocate inode and put it on
		 * orphan list in the first transaction => we need bitmap,
		 * group descriptor, sb, inode block, quota blocks, and
		 * possibly selinux xattr blocks.
		 */
		credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
			  EXT4_XATTR_TRANS_BLOCKS;
	} else {
		/*
		 * Fast symlink. We have to add entry to directory
		 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
		 * allocate new inode (bitmap, group descriptor, inode block,
		 * quota blocks, sb is already counted in previous macros).
		 */
		credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
			  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
	}
retry:
	handle = ext4_journal_start(dir, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
			       &dentry->d_name, 0, NULL);
	err = PTR_ERR(inode);
	if (IS_ERR(inode))
		goto out_stop;

	if (l > EXT4_N_BLOCKS * 4) {
		inode->i_op = &ext4_symlink_inode_operations;
		ext4_set_aops(inode);
		/*
		 * We cannot call page_symlink() with transaction started
		 * because it calls into ext4_write_begin() which can wait
		 * for transaction commit if we are running out of space
		 * and thus we deadlock. So we have to stop transaction now
		 * and restart it when symlink contents is written.
		 *
		 * To keep fs consistent in case of crash, we have to put inode
		 * to orphan list in the mean time.
		 */
		drop_nlink(inode);
		err = ext4_orphan_add(handle, inode);
		ext4_journal_stop(handle);
		if (err)
			goto err_drop_inode;
		err = __page_symlink(inode, symname, l, 1);
		if (err)
			goto err_drop_inode;
		/*
		 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
		 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
		 */
		handle = ext4_journal_start(dir,
				EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
				EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
		if (IS_ERR(handle)) {
			err = PTR_ERR(handle);
			goto err_drop_inode;
		}
		set_nlink(inode, 1);
		err = ext4_orphan_del(handle, inode);
		if (err) {
			ext4_journal_stop(handle);
			clear_nlink(inode);
			goto err_drop_inode;
		}
	} else {
		/* clear the extent format for fast symlink */
		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
		inode->i_op = &ext4_fast_symlink_inode_operations;
		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
		inode->i_size = l-1;
	}
	EXT4_I(inode)->i_disksize = inode->i_size;
	err = ext4_add_nondir(handle, dentry, inode);
out_stop:
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
err_drop_inode:
	unlock_new_inode(inode);
	iput(inode);
	return err;
}

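/*
 * Add a hard link to an existing inode.
 */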
static int ext4_link(struct dentry *old_dentry,
		     struct inode *dir, struct dentry *dentry)
{
	handle_t *handle;
	struct inode *inode = old_dentry->d_inode;
	int err, retries = 0;

	if (inode->i_nlink >= EXT4_LINK_MAX)
		return -EMLINK;

	dquot_initialize(dir);

retry:
	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode->i_ctime = ext4_current_time(inode);
	ext4_inc_count(handle, inode);
	ihold(inode);

	err = ext4_add_entry(handle, dentry, inode);
	if (!err) {
		ext4_mark_inode_dirty(handle, inode);
		d_instantiate(dentry, inode);
	} else {
		drop_nlink(inode);
		iput(inode);
	}
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

#define PARENT_INO(buffer, size) \
	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)

/*
 * Anybody can rename anything with this: the permission checks are left to the
 * higher-level routines.
 */
static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
		       struct inode *new_dir, struct dentry *new_dentry)
{
	handle_t *handle;
	struct inode *old_inode, *new_inode;
	struct buffer_head *old_bh, *new_bh, *dir_bh;
	struct ext4_dir_entry_2 *old_de, *new_de;
	int retval, force_da_alloc = 0;

	dquot_initialize(old_dir);
	dquot_initialize(new_dir);

	old_bh = new_bh = dir_bh = NULL;

	/* Initialize quotas before so that eventual writes go
	 * in separate transaction */
	if (new_dentry->d_inode)
		dquot_initialize(new_dentry->d_inode);
	handle = ext4_journal_start(old_dir, 2 *
					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
		ext4_handle_sync(handle);

	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
	/*
	 *  Check for inode number is _not_ due to possible IO errors.
	 *  We might rmdir the source, keep it as pwd of some process
	 *  and merrily kill the link to whatever was created under the
	 *  same name. Goodbye sticky bit ;-<
	 */
	old_inode = old_dentry->d_inode;
	retval = -ENOENT;
	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
		goto end_rename;

	new_inode = new_dentry->d_inode;
	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
	if (new_bh) {
		if (!new_inode) {
			brelse(new_bh);
			new_bh = NULL;
		}
	}
	if (S_ISDIR(old_inode->i_mode)) {
		if (new_inode) {
			retval = -ENOTEMPTY;
			if (!empty_dir(new_inode))
				goto end_rename;
		}
		retval = -EIO;
		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
		if (!dir_bh)
			goto end_rename;
		if (!buffer_verified(dir_bh) &&
		    !ext4_dirent_csum_verify(old_inode,
				(struct ext4_dir_entry *)dir_bh->b_data))
			goto end_rename;
		set_buffer_verified(dir_bh);
		if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
				old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
			goto end_rename;
		retval = -EMLINK;
		if (!new_inode && new_dir != old_dir &&
		    EXT4_DIR_LINK_MAX(new_dir))
			goto end_rename;
		BUFFER_TRACE(dir_bh, "get_write_access");
		retval = ext4_journal_get_write_access(handle, dir_bh);
		if (retval)
			goto end_rename;
	}
	if (!new_bh) {
		retval = ext4_add_entry(handle, new_dentry, old_inode);
		if (retval)
			goto end_rename;
	} else {
		BUFFER_TRACE(new_bh, "get write access");
		retval = ext4_journal_get_write_access(handle, new_bh);
		if (retval)
			goto end_rename;
		new_de->inode = cpu_to_le32(old_inode->i_ino);
		if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
					      EXT4_FEATURE_INCOMPAT_FILETYPE))
			new_de->file_type = old_de->file_type;
		new_dir->i_version++;
		new_dir->i_ctime = new_dir->i_mtime =
					ext4_current_time(new_dir);
		ext4_mark_inode_dirty(handle, new_dir);
		BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
		retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh);
		if (unlikely(retval)) {
			ext4_std_error(new_dir->i_sb, retval);
			goto end_rename;
		}
		brelse(new_bh);
		new_bh = NULL;
	}

	/*
	 * Like most other Unix systems, set the ctime for inodes on a
	 * rename.
	 */
	old_inode->i_ctime = ext4_current_time(old_inode);
	ext4_mark_inode_dirty(handle, old_inode);

	/*
	 * ok, that's it
	 */
	if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
	    old_de->name_len != old_dentry->d_name.len ||
	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
	    (retval = ext4_delete_entry(handle, old_dir,
					old_de, old_bh)) == -ENOENT) {
		/* old_de could have moved from under us during htree split, so
		 * make sure that we are deleting the right entry.  We might
		 * also be pointing to a stale entry in the unused part of
		 * old_bh so just checking inum and the name isn't enough. */
		struct buffer_head *old_bh2;
		struct ext4_dir_entry_2 *old_de2;

		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
		if (old_bh2) {
			retval = ext4_delete_entry(handle, old_dir,
						   old_de2, old_bh2);
			brelse(old_bh2);
		}
	}
	if (retval) {
		ext4_warning(old_dir->i_sb,
				"Deleting old file (%lu), %d, error=%d",
				old_dir->i_ino, old_dir->i_nlink, retval);
	}

	if (new_inode) {
		ext4_dec_count(handle, new_inode);
		new_inode->i_ctime = ext4_current_time(new_inode);
	}
	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
	ext4_update_dx_flag(old_dir);
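	/*
	 * If a directory was moved, point its ".." entry at the new parent
	 * and adjust the parents' link counts accordingly.
	 */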
	if (dir_bh) {
		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
						cpu_to_le32(new_dir->i_ino);
		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
		retval = ext4_handle_dirty_dirent_node(handle, old_inode,
						       dir_bh);
		if (retval) {
			ext4_std_error(old_dir->i_sb, retval);
			goto end_rename;
		}
		ext4_dec_count(handle, old_dir);
		if (new_inode) {
			/* checked empty_dir above, can't have another parent,
			 * ext4_dec_count() won't work for many-linked dirs */
			clear_nlink(new_inode);
		} else {
			ext4_inc_count(handle, new_dir);
			ext4_update_dx_flag(new_dir);
			ext4_mark_inode_dirty(handle, new_dir);
		}
	}
	ext4_mark_inode_dirty(handle, old_dir);
	if (new_inode) {
		ext4_mark_inode_dirty(handle, new_inode);
		if (!new_inode->i_nlink)
			ext4_orphan_add(handle, new_inode);
		if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
			force_da_alloc = 1;
	}
	retval = 0;

end_rename:
	brelse(dir_bh);
	brelse(old_bh);
	brelse(new_bh);
	ext4_journal_stop(handle);
	if (retval == 0 && force_da_alloc)
		ext4_alloc_da_blocks(old_inode);
	return retval;
}

/*
 * directories can handle most operations...
 */
const struct inode_operations ext4_dir_inode_operations = {
	.create		= ext4_create,
	.lookup		= ext4_lookup,
	.link		= ext4_link,
	.unlink		= ext4_unlink,
	.symlink	= ext4_symlink,
	.mkdir		= ext4_mkdir,
	.rmdir		= ext4_rmdir,
	.mknod		= ext4_mknod,
	.rename		= ext4_rename,
	.setattr	= ext4_setattr,
#ifdef CONFIG_EXT4_FS_XATTR
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.listxattr	= ext4_listxattr,
	.removexattr	= generic_removexattr,
#endif
	.get_acl	= ext4_get_acl,
	.fiemap         = ext4_fiemap,
};

const struct inode_operations ext4_special_inode_operations = {
	.setattr	= ext4_setattr,
#ifdef CONFIG_EXT4_FS_XATTR
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.listxattr	= ext4_listxattr,
	.removexattr	= generic_removexattr,
#endif
	.get_acl	= ext4_get_acl,
};