/*
 *  linux/fs/ext4/namei.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *  Directory entry file type support and forward compatibility hooks
 *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
 *  Hash Tree Directory indexing (c)
 *	Daniel Phillips, 2001
 *  Hash Tree Directory indexing porting
 *	Christopher Li, 2002
 *  Hash Tree Directory indexing cleanup
 *	Theodore Ts'o, 2002
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/jbd2.h>
#include <linux/time.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include "ext4.h"
#include "ext4_jbd2.h"

#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>
/*
 * define how far ahead to read directories while searching them.
 */
#define NAMEI_RA_CHUNKS  2
#define NAMEI_RA_BLOCKS  4
#define NAMEI_RA_SIZE	     (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
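/*
 * Note: with the values above, each readahead refill in ext4_find_entry()
 * pulls in up to NAMEI_RA_SIZE = 2 * 4 = 8 directory blocks (the size of
 * the bh_use[] window used during the linear search).
 */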

static struct buffer_head *ext4_append(handle_t *handle,
					struct inode *inode,
					ext4_lblk_t *block, int *err)
{
	struct buffer_head *bh;

	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;

	bh = ext4_bread(handle, inode, *block, 1, err);
	if (bh) {
		inode->i_size += inode->i_sb->s_blocksize;
		EXT4_I(inode)->i_disksize = inode->i_size;
		*err = ext4_journal_get_write_access(handle, bh);
		if (*err) {
			brelse(bh);
			bh = NULL;
		}
	}
	return bh;
}

#ifndef assert
#define assert(test) J_ASSERT(test)
#endif

#ifdef DX_DEBUG
#define dxtrace(command) command
#else
#define dxtrace(command)
#endif

struct fake_dirent
{
	__le32 inode;
	__le16 rec_len;
	u8 name_len;
	u8 file_type;
};

struct dx_countlimit
{
	__le16 limit;
	__le16 count;
};

struct dx_entry
{
	__le32 hash;
	__le32 block;
};

/*
 * dx_root_info is laid out so that if it should somehow get overlaid by a
 * dirent the two low bits of the hash version will be zero.  Therefore, the
 * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
 */

struct dx_root
{
	struct fake_dirent dot;
	char dot_name[4];
	struct fake_dirent dotdot;
	char dotdot_name[4];
	struct dx_root_info
	{
		__le32 reserved_zero;
		u8 hash_version;
		u8 info_length; /* 8 */
		u8 indirect_levels;
		u8 unused_flags;
	}
	info;
	struct dx_entry	entries[0];
};

struct dx_node
{
	struct fake_dirent fake;
	struct dx_entry	entries[0];
};


struct dx_frame
{
	struct buffer_head *bh;
	struct dx_entry *entries;
	struct dx_entry *at;
};

struct dx_map_entry
{
	u32 hash;
	u16 offs;
	u16 size;
};
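/*
 * offs holds the dirent's byte offset from the start of the block shifted
 * right by 2 (dirents are 4-byte aligned): dx_make_map() stores it that way
 * and dx_move_dirents() shifts it back.
 */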

/*
 * This goes at the end of each htree block.
 */
struct dx_tail {
	u32 dt_reserved;
	__le32 dt_checksum;	/* crc32c(uuid+inum+dirblock) */
};

static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
static void dx_set_hash(struct dx_entry *entry, unsigned value);
static unsigned dx_get_count(struct dx_entry *entries);
static unsigned dx_get_limit(struct dx_entry *entries);
static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
static struct dx_frame *dx_probe(const struct qstr *d_name,
				 struct inode *dir,
				 struct dx_hash_info *hinfo,
				 struct dx_frame *frame,
				 int *err);
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
		struct dx_map_entry *offsets, int count, unsigned blocksize);
static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
static void dx_insert_block(struct dx_frame *frame,
					u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
				 struct dx_frame *frame,
				 struct dx_frame *frames,
				 __u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
		const struct qstr *d_name,
		struct ext4_dir_entry_2 **res_dir,
		int *err);
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
			     struct inode *inode);

/* checksumming functions */
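/*
 * A block carrying dx_entries is either the dx_root block (it starts with
 * the 12-byte "." entry, then a ".." entry whose rec_len covers the rest of
 * the block, then dx_root_info) or a dx_node (one fake dirent whose rec_len
 * spans the whole block).  get_dx_countlimit() below tells the two apart by
 * the rec_len of the first dirent and returns the embedded dx_countlimit,
 * or NULL if the block looks like neither.
 */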
static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
					       struct ext4_dir_entry *dirent,
					       int *offset)
{
	struct ext4_dir_entry *dp;
	struct dx_root_info *root;
	int count_offset;

	if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
		count_offset = 8;
	else if (le16_to_cpu(dirent->rec_len) == 12) {
		dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
		if (le16_to_cpu(dp->rec_len) !=
		    EXT4_BLOCK_SIZE(inode->i_sb) - 12)
			return NULL;
		root = (struct dx_root_info *)(((void *)dp + 12));
		if (root->reserved_zero ||
		    root->info_length != sizeof(struct dx_root_info))
			return NULL;
		count_offset = 32;
	} else
		return NULL;

	if (offset)
		*offset = count_offset;
	return (struct dx_countlimit *)(((void *)dirent) + count_offset);
}

static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
			   int count_offset, int count, struct dx_tail *t)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);
	__u32 csum, old_csum;
	int size;

	size = count_offset + (count * sizeof(struct dx_entry));
	old_csum = t->dt_checksum;
	t->dt_checksum = 0;
	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
	csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
	t->dt_checksum = old_csum;

	return cpu_to_le32(csum);
}

static int ext4_dx_csum_verify(struct inode *inode,
			       struct ext4_dir_entry *dirent)
{
	struct dx_countlimit *c;
	struct dx_tail *t;
	int count_offset, limit, count;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return 1;

	c = get_dx_countlimit(inode, dirent, &count_offset);
	if (!c) {
		EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
		return 1;
	}
	limit = le16_to_cpu(c->limit);
	count = le16_to_cpu(c->count);
	if (count_offset + (limit * sizeof(struct dx_entry)) >
	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
		EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
				 "tree checksum found.  Run e2fsck -D.");
		return 1;
	}
	t = (struct dx_tail *)(((struct dx_entry *)c) + limit);

	if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
					    count, t))
		return 0;
	return 1;
}

static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
{
	struct dx_countlimit *c;
	struct dx_tail *t;
	int count_offset, limit, count;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return;

	c = get_dx_countlimit(inode, dirent, &count_offset);
	if (!c) {
		EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
		return;
	}
	limit = le16_to_cpu(c->limit);
	count = le16_to_cpu(c->count);
	if (count_offset + (limit * sizeof(struct dx_entry)) >
	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
		EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
				 "tree checksum.  Run e2fsck -D.");
		return;
	}
	t = (struct dx_tail *)(((struct dx_entry *)c) + limit);

	t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
}

static inline int ext4_handle_dirty_dx_node(handle_t *handle,
					    struct inode *inode,
					    struct buffer_head *bh)
{
	ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
	return ext4_handle_dirty_metadata(handle, inode, bh);
}

/*
 * p is at least 6 bytes before the end of page
 */
static inline struct ext4_dir_entry_2 *
ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
{
	return (struct ext4_dir_entry_2 *)((char *)p +
		ext4_rec_len_from_disk(p->rec_len, blocksize));
}

/*
 * Future: use high four bits of block for coalesce-on-delete flags
 * Mask them off for now.
 */

static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
	return le32_to_cpu(entry->block) & 0x00ffffff;
}

static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
{
	entry->block = cpu_to_le32(value);
}

static inline unsigned dx_get_hash(struct dx_entry *entry)
{
	return le32_to_cpu(entry->hash);
}

static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
{
	entry->hash = cpu_to_le32(value);
}

static inline unsigned dx_get_count(struct dx_entry *entries)
{
	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
}

static inline unsigned dx_get_limit(struct dx_entry *entries)
{
	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
}

static inline void dx_set_count(struct dx_entry *entries, unsigned value)
{
	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
}

static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
{
	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
}

static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
		EXT4_DIR_REC_LEN(2) - infosize;

	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		entry_space -= sizeof(struct dx_tail);
	return entry_space / sizeof(struct dx_entry);
}

static inline unsigned dx_node_limit(struct inode *dir)
{
	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);

	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		entry_space -= sizeof(struct dx_tail);
	return entry_space / sizeof(struct dx_entry);
}
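/*
 * For illustration (not enforced here): assuming a 4096-byte block, an
 * 8-byte dx_root_info and no metadata_csum tail, the root holds
 * (4096 - 12 - 12 - 8) / 8 = 508 dx_entries and each node holds
 * (4096 - 8) / 8 = 511, so a two-level tree can index roughly 508 * 511
 * leaf blocks.
 */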

/*
 * Debug
 */
#ifdef DX_DEBUG
static void dx_show_index(char * label, struct dx_entry *entries)
{
	int i, n = dx_get_count (entries);
	printk(KERN_DEBUG "%s index ", label);
	for (i = 0; i < n; i++) {
		printk("%x->%lu ", i ? dx_get_hash(entries + i) :
				0, (unsigned long)dx_get_block(entries + i));
	}
	printk("\n");
}

struct stats
{
	unsigned names;
	unsigned space;
	unsigned bcount;
};

static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
				 int size, int show_names)
{
	unsigned names = 0, space = 0;
	char *base = (char *) de;
	struct dx_hash_info h = *hinfo;

	printk("names: ");
	while ((char *) de < base + size)
	{
		if (de->inode)
		{
			if (show_names)
			{
				int len = de->name_len;
				char *name = de->name;
				while (len--) printk("%c", *name++);
				ext4fs_dirhash(de->name, de->name_len, &h);
				printk(":%x.%u ", h.hash,
				       (unsigned) ((char *) de - base));
			}
			space += EXT4_DIR_REC_LEN(de->name_len);
			names++;
		}
		de = ext4_next_entry(de, size);
	}
	printk("(%i)\n", names);
	return (struct stats) { names, space, 1 };
}

struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
			     struct dx_entry *entries, int levels)
{
	unsigned blocksize = dir->i_sb->s_blocksize;
	unsigned count = dx_get_count(entries), names = 0, space = 0, i;
	unsigned bcount = 0;
	struct buffer_head *bh;
	int err;
	printk("%i indexed blocks...\n", count);
	for (i = 0; i < count; i++, entries++)
	{
		ext4_lblk_t block = dx_get_block(entries);
		ext4_lblk_t hash  = i ? dx_get_hash(entries): 0;
		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
		struct stats stats;
		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
		if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
		stats = levels?
		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
		   dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
		names += stats.names;
		space += stats.space;
		bcount += stats.bcount;
		brelse(bh);
	}
	if (bcount)
		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
		       levels ? "" : "   ", names, space/bcount,
		       (space/bcount)*100/blocksize);
	return (struct stats) { names, space, bcount};
}
#endif /* DX_DEBUG */

/*
 * Probe for a directory leaf block to search.
 *
 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
 * error in the directory index, and the caller should fall back to
 * searching the directory normally.  The callers of dx_probe **MUST**
 * check for this error code, and make sure it never gets reflected
 * back to userspace.
 */
static struct dx_frame *
dx_probe(const struct qstr *d_name, struct inode *dir,
	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
{
	unsigned count, indirect;
	struct dx_entry *at, *entries, *p, *q, *m;
	struct dx_root *root;
	struct buffer_head *bh;
	struct dx_frame *frame = frame_in;
	u32 hash;

	frame->bh = NULL;
	if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
		goto fail;
	root = (struct dx_root *) bh->b_data;
	if (root->info.hash_version != DX_HASH_TEA &&
	    root->info.hash_version != DX_HASH_HALF_MD4 &&
	    root->info.hash_version != DX_HASH_LEGACY) {
		ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
			     root->info.hash_version);
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}
	hinfo->hash_version = root->info.hash_version;
	if (hinfo->hash_version <= DX_HASH_TEA)
		hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
	if (d_name)
		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
	hash = hinfo->hash;

	if (root->info.unused_flags & 1) {
		ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
			     root->info.unused_flags);
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}

	if ((indirect = root->info.indirect_levels) > 1) {
		ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
			     root->info.indirect_levels);
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}

	if (!buffer_verified(bh) &&
	    !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
		ext4_warning(dir->i_sb, "Root failed checksum");
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}
	set_buffer_verified(bh);

	entries = (struct dx_entry *) (((char *)&root->info) +
				       root->info.info_length);

	if (dx_get_limit(entries) != dx_root_limit(dir,
						   root->info.info_length)) {
		ext4_warning(dir->i_sb, "dx entry: limit != root limit");
		brelse(bh);
		*err = ERR_BAD_DX_DIR;
		goto fail;
	}

	dxtrace(printk("Look up %x", hash));
	while (1)
	{
		count = dx_get_count(entries);
		if (!count || count > dx_get_limit(entries)) {
			ext4_warning(dir->i_sb,
				     "dx entry: no count or count > limit");
			brelse(bh);
			*err = ERR_BAD_DX_DIR;
			goto fail2;
		}

		p = entries + 1;
		q = entries + count - 1;
		while (p <= q)
		{
			m = p + (q - p)/2;
			dxtrace(printk("."));
			if (dx_get_hash(m) > hash)
				q = m - 1;
			else
				p = m + 1;
		}

		if (0) // linear search cross check
		{
			unsigned n = count - 1;
			at = entries;
			while (n--)
			{
				dxtrace(printk(","));
				if (dx_get_hash(++at) > hash)
				{
					at--;
					break;
				}
			}
			assert (at == p - 1);
		}

		at = p - 1;
		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
		frame->bh = bh;
		frame->entries = entries;
		frame->at = at;
		if (!indirect--) return frame;
		if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
			goto fail2;
		at = entries = ((struct dx_node *) bh->b_data)->entries;

		if (!buffer_verified(bh) &&
		    !ext4_dx_csum_verify(dir,
					 (struct ext4_dir_entry *)bh->b_data)) {
			ext4_warning(dir->i_sb, "Node failed checksum");
			brelse(bh);
			*err = ERR_BAD_DX_DIR;
			goto fail;
		}
		set_buffer_verified(bh);

		if (dx_get_limit(entries) != dx_node_limit (dir)) {
			ext4_warning(dir->i_sb,
				     "dx entry: limit != node limit");
			brelse(bh);
			*err = ERR_BAD_DX_DIR;
			goto fail2;
		}
		frame++;
		frame->bh = NULL;
	}
fail2:
	while (frame >= frame_in) {
		brelse(frame->bh);
		frame--;
	}
fail:
	if (*err == ERR_BAD_DX_DIR)
		ext4_warning(dir->i_sb,
			     "Corrupt dir inode %lu, running e2fsck is "
			     "recommended.", dir->i_ino);
	return NULL;
}

static void dx_release (struct dx_frame *frames)
{
	if (frames[0].bh == NULL)
		return;

	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
		brelse(frames[1].bh);
	brelse(frames[0].bh);
}

/*
 * This function increments the frame pointer to search the next leaf
 * block, and reads in the necessary intervening nodes if the search
 * should be necessary.  Whether or not the search is necessary is
 * controlled by the hash parameter.  If the hash value is even, then
 * the search is only continued if the next block starts with that
 * hash value.  This is used if we are searching for a specific file.
 *
 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
 *
 * This function returns 1 if the caller should continue to search,
 * or 0 if it should not.  If there is an error reading one of the
 * index blocks, it will return a negative error code.
 *
 * If start_hash is non-null, it will be filled in with the starting
 * hash of the next page.
 */
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
				 struct dx_frame *frame,
				 struct dx_frame *frames,
				 __u32 *start_hash)
{
	struct dx_frame *p;
	struct buffer_head *bh;
	int err, num_frames = 0;
	__u32 bhash;

	p = frame;
	/*
	 * Find the next leaf page by incrementing the frame pointer.
	 * If we run out of entries in the interior node, loop around and
	 * increment pointer in the parent node.  When we break out of
	 * this loop, num_frames indicates the number of interior
	 * nodes need to be read.
	 */
	while (1) {
		if (++(p->at) < p->entries + dx_get_count(p->entries))
			break;
		if (p == frames)
			return 0;
		num_frames++;
		p--;
	}

	/*
	 * If the hash is 1, then continue only if the next page has a
	 * continuation hash of any value.  This is used for readdir
	 * handling.  Otherwise, check to see if the hash matches the
 * desired continuation hash.  If it doesn't, return since
	 * there's no point to read in the successive index pages.
	 */
	bhash = dx_get_hash(p->at);
	if (start_hash)
		*start_hash = bhash;
	if ((hash & 1) == 0) {
		if ((bhash & ~1) != hash)
			return 0;
	}
	/*
	 * If the hash is HASH_NB_ALWAYS, we always go to the next
	 * block so no check is necessary
	 */
	while (num_frames--) {
		if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
				      0, &err)))
			return err; /* Failure */

		if (!buffer_verified(bh) &&
		    !ext4_dx_csum_verify(dir,
					 (struct ext4_dir_entry *)bh->b_data)) {
			ext4_warning(dir->i_sb, "Node failed checksum");
			return -EIO;
		}
		set_buffer_verified(bh);

		p++;
		brelse(p->bh);
		p->bh = bh;
		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
	}
	return 1;
}


/*
 * This function fills a red-black tree with information from a
 * directory block.  It returns the number of directory entries loaded
 * into the tree.  If there is an error it is returned in err.
 */
static int htree_dirblock_to_tree(struct file *dir_file,
				  struct inode *dir, ext4_lblk_t block,
				  struct dx_hash_info *hinfo,
				  __u32 start_hash, __u32 start_minor_hash)
{
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de, *top;
	int err, count = 0;

	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
							(unsigned long)block));
	if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
		return err;

	de = (struct ext4_dir_entry_2 *) bh->b_data;
	top = (struct ext4_dir_entry_2 *) ((char *) de +
					   dir->i_sb->s_blocksize -
					   EXT4_DIR_REC_LEN(0));
	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
		if (ext4_check_dir_entry(dir, NULL, de, bh,
				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
					 + ((char *)de - bh->b_data))) {
			/* On error, skip the f_pos to the next block. */
			dir_file->f_pos = (dir_file->f_pos |
					(dir->i_sb->s_blocksize - 1)) + 1;
			brelse(bh);
			return count;
		}
		ext4fs_dirhash(de->name, de->name_len, hinfo);
		if ((hinfo->hash < start_hash) ||
		    ((hinfo->hash == start_hash) &&
		     (hinfo->minor_hash < start_minor_hash)))
			continue;
		if (de->inode == 0)
			continue;
		if ((err = ext4_htree_store_dirent(dir_file,
				   hinfo->hash, hinfo->minor_hash, de)) != 0) {
			brelse(bh);
			return err;
		}
		count++;
	}
	brelse(bh);
	return count;
}


/*
 * This function fills a red-black tree with information from a
 * directory.  We start scanning the directory in hash order, starting
 * at start_hash and start_minor_hash.
 *
 * This function returns the number of entries inserted into the tree,
 * or a negative error code.
 */
int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
			 __u32 start_minor_hash, __u32 *next_hash)
{
	struct dx_hash_info hinfo;
	struct ext4_dir_entry_2 *de;
	struct dx_frame frames[2], *frame;
	struct inode *dir;
	ext4_lblk_t block;
	int count = 0;
	int ret, err;
	__u32 hashval;

	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
		       start_hash, start_minor_hash));
	dir = dir_file->f_path.dentry->d_inode;
	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
		if (hinfo.hash_version <= DX_HASH_TEA)
			hinfo.hash_version +=
				EXT4_SB(dir->i_sb)->s_hash_unsigned;
		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
					       start_hash, start_minor_hash);
		*next_hash = ~0;
		return count;
	}
	hinfo.hash = start_hash;
	hinfo.minor_hash = 0;
	frame = dx_probe(NULL, dir, &hinfo, frames, &err);
	if (!frame)
		return err;

	/* Add '.' and '..' from the htree header */
	if (!start_hash && !start_minor_hash) {
		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
		if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
			goto errout;
		count++;
	}
	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
		de = ext4_next_entry(de, dir->i_sb->s_blocksize);
		if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
			goto errout;
		count++;
	}

	while (1) {
		block = dx_get_block(frame->at);
		ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
					     start_hash, start_minor_hash);
		if (ret < 0) {
			err = ret;
			goto errout;
		}
		count += ret;
		hashval = ~0;
		ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
					    frame, frames, &hashval);
		*next_hash = hashval;
		if (ret < 0) {
			err = ret;
			goto errout;
		}
		/*
		 * Stop if:  (a) there are no more entries, or
		 * (b) we have inserted at least one entry and the
		 * next hash value is not a continuation
		 */
		if ((ret == 0) ||
		    (count && ((hashval & 1) == 0)))
			break;
	}
	dx_release(frames);
	dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
		       "next hash: %x\n", count, *next_hash));
	return count;
errout:
	dx_release(frames);
	return (err);
}


/*
 * Directory block splitting, compacting
 */

/*
 * Create map of hash values, offsets, and sizes, stored at end of block.
 * Returns number of entries mapped.
 */
static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
		       struct dx_hash_info *hinfo,
		       struct dx_map_entry *map_tail)
{
	int count = 0;
	char *base = (char *) de;
	struct dx_hash_info h = *hinfo;

	while ((char *) de < base + blocksize) {
		if (de->name_len && de->inode) {
			ext4fs_dirhash(de->name, de->name_len, &h);
			map_tail--;
			map_tail->hash = h.hash;
			map_tail->offs = ((char *) de - base)>>2;
			map_tail->size = le16_to_cpu(de->rec_len);
			count++;
			cond_resched();
		}
		/* XXX: do we need to check rec_len == 0 case? -Chris */
		de = ext4_next_entry(de, blocksize);
	}
	return count;
}

/* Sort map by hash value */
static void dx_sort_map (struct dx_map_entry *map, unsigned count)
{
	struct dx_map_entry *p, *q, *top = map + count - 1;
	int more;
	/* Combsort until bubble sort doesn't suck */
	while (count > 2) {
		count = count*10/13;
		if (count - 9 < 2) /* 9, 10 -> 11 */
			count = 11;
		for (p = top, q = p - count; q >= map; p--, q--)
			if (p->hash < q->hash)
				swap(*p, *q);
	}
	/* Garden variety bubble sort */
	do {
		more = 0;
		q = top;
		while (q-- > map) {
			if (q[1].hash >= q[0].hash)
				continue;
			swap(*(q+1), *q);
			more = 1;
		}
	} while(more);
}

static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
{
	struct dx_entry *entries = frame->entries;
	struct dx_entry *old = frame->at, *new = old + 1;
	int count = dx_get_count(entries);

	assert(count < dx_get_limit(entries));
	assert(old < entries + count);
	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
	dx_set_hash(new, hash);
	dx_set_block(new, block);
	dx_set_count(entries, count + 1);
}

static void ext4_update_dx_flag(struct inode *inode)
{
	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
				     EXT4_FEATURE_COMPAT_DIR_INDEX))
		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}

/*
 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
 *
 * `len <= EXT4_NAME_LEN' is guaranteed by caller.
 * `de != NULL' is guaranteed by caller.
 */
static inline int ext4_match (int len, const char * const name,
			      struct ext4_dir_entry_2 * de)
{
	if (len != de->name_len)
		return 0;
	if (!de->inode)
		return 0;
	return !memcmp(name, de->name, len);
}
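/*
 * For example, ext4_match(3, "foo", de) is non-zero only when de is live
 * (de->inode != 0) and its name is exactly "foo"; on-disk names are not
 * NUL-terminated, so the lengths must match as well.
 */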

/*
 * Returns 0 if not found, -1 on failure, and 1 on success
 */
static inline int search_dirblock(struct buffer_head *bh,
				  struct inode *dir,
				  const struct qstr *d_name,
				  unsigned int offset,
				  struct ext4_dir_entry_2 ** res_dir)
{
	struct ext4_dir_entry_2 * de;
	char * dlimit;
	int de_len;
	const char *name = d_name->name;
	int namelen = d_name->len;

	de = (struct ext4_dir_entry_2 *) bh->b_data;
	dlimit = bh->b_data + dir->i_sb->s_blocksize;
	while ((char *) de < dlimit) {
		/* this code is executed quadratically often */
		/* do minimal checking `by hand' */

		if ((char *) de + namelen <= dlimit &&
		    ext4_match (namelen, name, de)) {
			/* found a match - just to be sure, do a full check */
			if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
				return -1;
			*res_dir = de;
			return 1;
		}
		/* prevent looping on a bad block */
		de_len = ext4_rec_len_from_disk(de->rec_len,
						dir->i_sb->s_blocksize);
		if (de_len <= 0)
			return -1;
		offset += de_len;
		de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
	}
	return 0;
}


/*
 *	ext4_find_entry()
 *
 * finds an entry in the specified directory with the wanted name. It
 * returns the cache buffer in which the entry was found, and the entry
 * itself (as a parameter - res_dir). It does NOT read the inode of the
 * entry - you'll have to do that yourself if you want to.
 *
 * The returned buffer_head has ->b_count elevated.  The caller is expected
 * to brelse() it when appropriate.
 */
static struct buffer_head * ext4_find_entry (struct inode *dir,
					const struct qstr *d_name,
					struct ext4_dir_entry_2 ** res_dir)
{
	struct super_block *sb;
	struct buffer_head *bh_use[NAMEI_RA_SIZE];
	struct buffer_head *bh, *ret = NULL;
	ext4_lblk_t start, block, b;
	const u8 *name = d_name->name;
	int ra_max = 0;		/* Number of bh's in the readahead
				   buffer, bh_use[] */
	int ra_ptr = 0;		/* Current index into readahead
				   buffer */
	int num = 0;
	ext4_lblk_t  nblocks;
	int i, err;
	int namelen;

	*res_dir = NULL;
	sb = dir->i_sb;
	namelen = d_name->len;
	if (namelen > EXT4_NAME_LEN)
		return NULL;
	if ((namelen <= 2) && (name[0] == '.') &&
	    (name[1] == '.' || name[1] == '\0')) {
		/*
		 * "." or ".." will only be in the first block
		 * NFS may look up ".."; "." should be handled by the VFS
		 */
		block = start = 0;
		nblocks = 1;
		goto restart;
	}
	if (is_dx(dir)) {
		bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
		/*
		 * On success, or if the error was file not found,
		 * return.  Otherwise, fall back to doing a search the
		 * old fashioned way.
		 */
		if (bh || (err != ERR_BAD_DX_DIR))
			return bh;
		dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
			       "falling back\n"));
	}
	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
	start = EXT4_I(dir)->i_dir_start_lookup;
	if (start >= nblocks)
		start = 0;
	block = start;
restart:
	do {
		/*
		 * We deal with the read-ahead logic here.
		 */
		if (ra_ptr >= ra_max) {
			/* Refill the readahead buffer */
			ra_ptr = 0;
			b = block;
			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
				/*
				 * Terminate if we reach the end of the
				 * directory and must wrap, or if our
				 * search has finished at this block.
				 */
				if (b >= nblocks || (num && block == start)) {
					bh_use[ra_max] = NULL;
					break;
				}
				num++;
				bh = ext4_getblk(NULL, dir, b++, 0, &err);
				bh_use[ra_max] = bh;
				if (bh)
					ll_rw_block(READ | REQ_META | REQ_PRIO,
						    1, &bh);
			}
		}
		if ((bh = bh_use[ra_ptr++]) == NULL)
			goto next;
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			/* read error, skip block & hope for the best */
			EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
					 (unsigned long) block);
			brelse(bh);
			goto next;
		}
		i = search_dirblock(bh, dir, d_name,
			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
		if (i == 1) {
			EXT4_I(dir)->i_dir_start_lookup = block;
			ret = bh;
			goto cleanup_and_exit;
		} else {
			brelse(bh);
			if (i < 0)
				goto cleanup_and_exit;
		}
	next:
		if (++block >= nblocks)
			block = 0;
	} while (block != start);

	/*
	 * If the directory has grown while we were searching, then
	 * search the last part of the directory before giving up.
	 */
	block = nblocks;
	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
	if (block < nblocks) {
		start = 0;
		goto restart;
	}

cleanup_and_exit:
	/* Clean up the read-ahead blocks */
	for (; ra_ptr < ra_max; ra_ptr++)
		brelse(bh_use[ra_ptr]);
	return ret;
}

static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
		       struct ext4_dir_entry_2 **res_dir, int *err)
{
	struct super_block * sb = dir->i_sb;
	struct dx_hash_info	hinfo;
	struct dx_frame frames[2], *frame;
	struct buffer_head *bh;
	ext4_lblk_t block;
	int retval;

	if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
		return NULL;
	do {
		block = dx_get_block(frame->at);
		if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
			goto errout;

		retval = search_dirblock(bh, dir, d_name,
					 block << EXT4_BLOCK_SIZE_BITS(sb),
					 res_dir);
		if (retval == 1) { 	/* Success! */
			dx_release(frames);
			return bh;
		}
		brelse(bh);
		if (retval == -1) {
			*err = ERR_BAD_DX_DIR;
			goto errout;
		}

		/* Check to see if we should continue to search */
		retval = ext4_htree_next_block(dir, hinfo.hash, frame,
					       frames, NULL);
		if (retval < 0) {
			ext4_warning(sb,
			     "error reading index page in directory #%lu",
			     dir->i_ino);
			*err = retval;
			goto errout;
		}
	} while (retval == 1);

	*err = -ENOENT;
errout:
	dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
	dx_release (frames);
	return NULL;
}

static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
	struct inode *inode;
	struct ext4_dir_entry_2 *de;
	struct buffer_head *bh;

	if (dentry->d_name.len > EXT4_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	bh = ext4_find_entry(dir, &dentry->d_name, &de);
	inode = NULL;
	if (bh) {
		__u32 ino = le32_to_cpu(de->inode);
		brelse(bh);
		if (!ext4_valid_inum(dir->i_sb, ino)) {
			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
			return ERR_PTR(-EIO);
		}
		inode = ext4_iget(dir->i_sb, ino);
		if (inode == ERR_PTR(-ESTALE)) {
			EXT4_ERROR_INODE(dir,
					 "deleted inode referenced: %u",
					 ino);
			return ERR_PTR(-EIO);
		}
	}
	return d_splice_alias(inode, dentry);
}


struct dentry *ext4_get_parent(struct dentry *child)
{
	__u32 ino;
	static const struct qstr dotdot = {
		.name = "..",
		.len = 2,
	};
	struct ext4_dir_entry_2 * de;
	struct buffer_head *bh;

	bh = ext4_find_entry(child->d_inode, &dotdot, &de);
	if (!bh)
		return ERR_PTR(-ENOENT);
	ino = le32_to_cpu(de->inode);
	brelse(bh);

	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
		EXT4_ERROR_INODE(child->d_inode,
				 "bad parent inode number: %u", ino);
		return ERR_PTR(-EIO);
	}

	return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
}

#define S_SHIFT 12
static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
};

static inline void ext4_set_de_type(struct super_block *sb,
				struct ext4_dir_entry_2 *de,
				umode_t mode) {
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}

/*
 * Move count entries from end of map between two memory locations.
 * Returns pointer to last entry moved.
 */
static struct ext4_dir_entry_2 *
dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
		unsigned blocksize)
{
	unsigned rec_len = 0;

	while (count--) {
		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
						(from + (map->offs<<2));
		rec_len = EXT4_DIR_REC_LEN(de->name_len);
		memcpy (to, de, rec_len);
		((struct ext4_dir_entry_2 *) to)->rec_len =
				ext4_rec_len_to_disk(rec_len, blocksize);
		de->inode = 0;
		map++;
		to += rec_len;
	}
	return (struct ext4_dir_entry_2 *) (to - rec_len);
}

/*
 * Compact each dir entry in the range to the minimal rec_len.
 * Returns pointer to last entry in range.
 */
static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
{
	struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
	unsigned rec_len = 0;

	prev = to = de;
	while ((char*)de < base + blocksize) {
		next = ext4_next_entry(de, blocksize);
		if (de->inode && de->name_len) {
			rec_len = EXT4_DIR_REC_LEN(de->name_len);
			if (de > to)
				memmove(to, de, rec_len);
			to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
			prev = to;
			to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
		}
		de = next;
	}
	return prev;
}

/*
 * Split a full leaf block to make room for a new dir entry.
 * Allocate a new block, and move entries so that they are approx. equally full.
 * Returns pointer to de in block into which the new entry will be inserted.
 */
static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
			struct buffer_head **bh,struct dx_frame *frame,
			struct dx_hash_info *hinfo, int *error)
{
	unsigned blocksize = dir->i_sb->s_blocksize;
	unsigned count, continued;
	struct buffer_head *bh2;
	ext4_lblk_t newblock;
	u32 hash2;
	struct dx_map_entry *map;
	char *data1 = (*bh)->b_data, *data2;
	unsigned split, move, size;
	struct ext4_dir_entry_2 *de = NULL, *de2;
	int	err = 0, i;

	bh2 = ext4_append (handle, dir, &newblock, &err);
	if (!(bh2)) {
		brelse(*bh);
		*bh = NULL;
		goto errout;
	}

	BUFFER_TRACE(*bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, *bh);
	if (err)
		goto journal_error;

	BUFFER_TRACE(frame->bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, frame->bh);
	if (err)
		goto journal_error;

	data2 = bh2->b_data;

	/* create map in the end of data2 block */
	map = (struct dx_map_entry *) (data2 + blocksize);
	count = dx_make_map((struct ext4_dir_entry_2 *) data1,
			     blocksize, hinfo, map);
	map -= count;
	dx_sort_map(map, count);
	/* Split the existing block in the middle, size-wise */
	size = 0;
	move = 0;
	for (i = count-1; i >= 0; i--) {
		/* is more than half of this entry in 2nd half of the block? */
		if (size + map[i].size/2 > blocksize/2)
			break;
		size += map[i].size;
		move++;
	}
	/* map index at which we will split */
	split = count - move;
	hash2 = map[split].hash;
	continued = hash2 == map[split - 1].hash;
	dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
			(unsigned long)dx_get_block(frame->at),
					hash2, split, count-split));

	/* Fancy dance to stay within two buffers */
	de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
	de = dx_pack_dirents(data1, blocksize);
	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
					   blocksize);
	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
					    blocksize);
	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));

	/* Which block gets the new entry? */
	if (hinfo->hash >= hash2)
	{
		swap(*bh, bh2);
		de = de2;
	}
	dx_insert_block(frame, hash2 + continued, newblock);
	err = ext4_handle_dirty_metadata(handle, dir, bh2);
	if (err)
		goto journal_error;
	err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
	if (err)
		goto journal_error;
	brelse(bh2);
	dxtrace(dx_show_index("frame", frame->entries));
	return de;

journal_error:
	brelse(*bh);
	brelse(bh2);
	*bh = NULL;
	ext4_std_error(dir->i_sb, err);
errout:
	*error = err;
	return NULL;
}

/*
 * Add a new entry into a directory (leaf) block.  If de is non-NULL,
 * it points to a directory entry which is guaranteed to be large
 * enough for new directory entry.  If de is NULL, then
 * add_dirent_to_buf will attempt to search the directory block for
 * space.  It will return -ENOSPC if no space is available, and -EIO
 * and -EEXIST if directory entry already exists.
 */
static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
			     struct inode *inode, struct ext4_dir_entry_2 *de,
			     struct buffer_head *bh)
{
	struct inode	*dir = dentry->d_parent->d_inode;
	const char	*name = dentry->d_name.name;
	int		namelen = dentry->d_name.len;
	unsigned int	offset = 0;
	unsigned int	blocksize = dir->i_sb->s_blocksize;
	unsigned short	reclen;
	int		nlen, rlen, err;
	char		*top;

	reclen = EXT4_DIR_REC_LEN(namelen);
	if (!de) {
		de = (struct ext4_dir_entry_2 *)bh->b_data;
		top = bh->b_data + blocksize - reclen;
		while ((char *) de <= top) {
			if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
				return -EIO;
			if (ext4_match(namelen, name, de))
				return -EEXIST;
			nlen = EXT4_DIR_REC_LEN(de->name_len);
			rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
			if ((de->inode? rlen - nlen: rlen) >= reclen)
				break;
			de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
			offset += rlen;
		}
		if ((char *) de > top)
			return -ENOSPC;
	}
	BUFFER_TRACE(bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, bh);
	if (err) {
		ext4_std_error(dir->i_sb, err);
		return err;
	}

	/* By now the buffer is marked for journaling */
	nlen = EXT4_DIR_REC_LEN(de->name_len);
	rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
	if (de->inode) {
		struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
		de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
		de = de1;
	}
	de->file_type = EXT4_FT_UNKNOWN;
	if (inode) {
		de->inode = cpu_to_le32(inode->i_ino);
		ext4_set_de_type(dir->i_sb, de, inode->i_mode);
	} else
		de->inode = 0;
	de->name_len = namelen;
	memcpy(de->name, name, namelen);
	/*
	 * XXX shouldn't update any times until successful
	 * completion of syscall, but too many callers depend
	 * on this.
	 *
	 * XXX similarly, too many callers depend on
	 * ext4_new_inode() setting the times, but error
	 * recovery deletes the inode, so the worst that can
	 * happen is that the times are slightly out of date
	 * and/or different from the directory change time.
	 */
	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
	ext4_update_dx_flag(dir);
	dir->i_version++;
	ext4_mark_inode_dirty(handle, dir);
	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, dir, bh);
	if (err)
		ext4_std_error(dir->i_sb, err);
	return 0;
}

/*
 * This converts a one block unindexed directory to a 3 block indexed
 * directory, and adds the dentry to the indexed directory.
 */
static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
			    struct inode *inode, struct buffer_head *bh)
{
	struct inode	*dir = dentry->d_parent->d_inode;
	const char	*name = dentry->d_name.name;
	int		namelen = dentry->d_name.len;
	struct buffer_head *bh2;
	struct dx_root	*root;
	struct dx_frame	frames[2], *frame;
	struct dx_entry *entries;
	struct ext4_dir_entry_2	*de, *de2;
	char		*data1, *top;
	unsigned	len;
	int		retval;
	unsigned	blocksize;
	struct dx_hash_info hinfo;
	ext4_lblk_t  block;
	struct fake_dirent *fde;

	blocksize =  dir->i_sb->s_blocksize;
	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
	retval = ext4_journal_get_write_access(handle, bh);
	if (retval) {
		ext4_std_error(dir->i_sb, retval);
		brelse(bh);
		return retval;
	}
	root = (struct dx_root *) bh->b_data;

	/* The 0th block becomes the root, move the dirents out */
	fde = &root->dotdot;
	de = (struct ext4_dir_entry_2 *)((char *)fde +
		ext4_rec_len_from_disk(fde->rec_len, blocksize));
	if ((char *) de >= (((char *) root) + blocksize)) {
		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
		brelse(bh);
		return -EIO;
	}
	len = ((char *) root) + blocksize - (char *) de;

	/* Allocate new block for the 0th block's dirents */
	bh2 = ext4_append(handle, dir, &block, &retval);
	if (!(bh2)) {
		brelse(bh);
		return retval;
	}
	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
	data1 = bh2->b_data;

	memcpy (data1, de, len);
	de = (struct ext4_dir_entry_2 *) data1;
	top = data1 + len;
	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
		de = de2;
	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
					   blocksize);
	/* Initialize the root; the dot dirents already exist */
	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
					   blocksize);
	memset (&root->info, 0, sizeof(root->info));
	root->info.info_length = sizeof(root->info);
	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
	entries = root->entries;
	dx_set_block(entries, 1);
	dx_set_count(entries, 1);
	dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));

	/* Initialize as for dx_probe */
	hinfo.hash_version = root->info.hash_version;
	if (hinfo.hash_version <= DX_HASH_TEA)
		hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
	ext4fs_dirhash(name, namelen, &hinfo);
	frame = frames;
	frame->entries = entries;
	frame->at = entries;
	frame->bh = bh;
	bh = bh2;

	ext4_handle_dirty_dx_node(handle, dir, frame->bh);
	ext4_handle_dirty_metadata(handle, dir, bh);

	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
	if (!de) {
		/*
		 * Even if the block split failed, we have to properly write
		 * out all the changes we did so far. Otherwise we can end up
		 * with corrupted filesystem.
		 */
		ext4_mark_inode_dirty(handle, dir);
		dx_release(frames);
		return retval;
	}
	dx_release(frames);

	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
	brelse(bh);
	return retval;
}

/*
 *	ext4_add_entry()
 *
 * adds a file entry to the specified directory, using the same
 * semantics as ext4_find_entry(). It returns NULL if it failed.
 *
 * NOTE!! The inode part of 'de' is left at 0 - which means you
 * may not sleep between calling this and putting something into
 * the entry, as someone else might have used it while you slept.
 */
static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
			  struct inode *inode)
{
	struct inode *dir = dentry->d_parent->d_inode;
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de;
	struct super_block *sb;
	int	retval;
	int	dx_fallback=0;
	unsigned blocksize;
	ext4_lblk_t block, blocks;

	sb = dir->i_sb;
	blocksize = sb->s_blocksize;
	if (!dentry->d_name.len)
		return -EINVAL;
	if (is_dx(dir)) {
		retval = ext4_dx_add_entry(handle, dentry, inode);
		if (!retval || (retval != ERR_BAD_DX_DIR))
			return retval;
		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
		dx_fallback++;
		ext4_mark_inode_dirty(handle, dir);
	}
	blocks = dir->i_size >> sb->s_blocksize_bits;
	for (block = 0; block < blocks; block++) {
		bh = ext4_bread(handle, dir, block, 0, &retval);
		if(!bh)
			return retval;
		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
		if (retval != -ENOSPC) {
			brelse(bh);
			return retval;
		}

		if (blocks == 1 && !dx_fallback &&
		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
			return make_indexed_dir(handle, dentry, inode, bh);
		brelse(bh);
	}
	bh = ext4_append(handle, dir, &block, &retval);
	if (!bh)
		return retval;
	de = (struct ext4_dir_entry_2 *) bh->b_data;
	de->inode = 0;
	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
	brelse(bh);
	if (retval == 0)
		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
	return retval;
}

/*
 * Returns 0 for success, or a negative error value
 */
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
			     struct inode *inode)
{
	struct dx_frame frames[2], *frame;
	struct dx_entry *entries, *at;
	struct dx_hash_info hinfo;
	struct buffer_head *bh;
	struct inode *dir = dentry->d_parent->d_inode;
	struct super_block *sb = dir->i_sb;
	struct ext4_dir_entry_2 *de;
	int err;

	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
	if (!frame)
		return err;
	entries = frame->entries;
	at = frame->at;

	if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
		goto cleanup;

	BUFFER_TRACE(bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, bh);
	if (err)
		goto journal_error;

	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
	if (err != -ENOSPC)
		goto cleanup;

	/* Block full, should compress but for now just split */
	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
		       dx_get_count(entries), dx_get_limit(entries)));
	/* Need to split index? */
	if (dx_get_count(entries) == dx_get_limit(entries)) {
		ext4_lblk_t newblock;
		unsigned icount = dx_get_count(entries);
		int levels = frame - frames;
		struct dx_entry *entries2;
		struct dx_node *node2;
		struct buffer_head *bh2;

		if (levels && (dx_get_count(frames->entries) ==
			       dx_get_limit(frames->entries))) {
			ext4_warning(sb, "Directory index full!");
			err = -ENOSPC;
			goto cleanup;
		}
		bh2 = ext4_append (handle, dir, &newblock, &err);
		if (!(bh2))
			goto cleanup;
		node2 = (struct dx_node *)(bh2->b_data);
		entries2 = node2->entries;
		memset(&node2->fake, 0, sizeof(struct fake_dirent));
		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
							   sb->s_blocksize);
		BUFFER_TRACE(frame->bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, frame->bh);
		if (err)
			goto journal_error;
		if (levels) {
			unsigned icount1 = icount/2, icount2 = icount - icount1;
			unsigned hash2 = dx_get_hash(entries + icount1);
			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
				       icount1, icount2));

			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
			err = ext4_journal_get_write_access(handle,
							     frames[0].bh);
			if (err)
				goto journal_error;

			memcpy((char *) entries2, (char *) (entries + icount1),
			       icount2 * sizeof(struct dx_entry));
			dx_set_count(entries, icount1);
			dx_set_count(entries2, icount2);
			dx_set_limit(entries2, dx_node_limit(dir));

			/* Which index block gets the new entry? */
			if (at - entries >= icount1) {
				frame->at = at = at - entries - icount1 + entries2;
				frame->entries = entries = entries2;
				swap(frame->bh, bh2);
			}
			dx_insert_block(frames + 0, hash2, newblock);
			dxtrace(dx_show_index("node", frames[1].entries));
			dxtrace(dx_show_index("node",
			       ((struct dx_node *) bh2->b_data)->entries));
			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
			if (err)
				goto journal_error;
			brelse (bh2);
		} else {
1754 1755
			dxtrace(printk(KERN_DEBUG
				       "Creating second level index...\n"));
1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769
			memcpy((char *) entries2, (char *) entries,
			       icount * sizeof(struct dx_entry));
			dx_set_limit(entries2, dx_node_limit(dir));

			/* Set up root */
			dx_set_count(entries, 1);
			dx_set_block(entries + 0, newblock);
			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;

			/* Add new access path frame */
			frame = frames + 1;
			frame->at = at = at - entries + entries2;
			frame->entries = entries = entries2;
			frame->bh = bh2;
1770
			err = ext4_journal_get_write_access(handle,
1771 1772 1773 1774
							     frame->bh);
			if (err)
				goto journal_error;
		}
1775
		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
1776 1777 1778 1779
		if (err) {
			ext4_std_error(inode->i_sb, err);
			goto cleanup;
		}
1780 1781 1782 1783 1784 1785 1786 1787
	}
	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
	if (!de)
		goto cleanup;
	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
	goto cleanup;

journal_error:
1788
	ext4_std_error(dir->i_sb, err);
1789 1790 1791 1792 1793 1794 1795 1796
cleanup:
	if (bh)
		brelse(bh);
	dx_release(frames);
	return err;
}

/*
 * ext4_delete_entry deletes a directory entry by merging it with the
 * previous entry
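 *
 * The entry is not physically erased: its rec_len is folded into the
 * previous entry's rec_len (or, when it is the first entry in the block,
 * its inode field is simply cleared), leaving the space free for later
 * insertions.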
 */
static int ext4_delete_entry(handle_t *handle,
			     struct inode *dir,
			     struct ext4_dir_entry_2 *de_del,
			     struct buffer_head *bh)
{
	struct ext4_dir_entry_2 *de, *pde;
	unsigned int blocksize = dir->i_sb->s_blocksize;
	int i, err;

	i = 0;
	pde = NULL;
	de = (struct ext4_dir_entry_2 *) bh->b_data;
	while (i < bh->b_size) {
		if (ext4_check_dir_entry(dir, NULL, de, bh, i))
			return -EIO;
		if (de == de_del)  {
			BUFFER_TRACE(bh, "get_write_access");
			err = ext4_journal_get_write_access(handle, bh);
			if (unlikely(err)) {
				ext4_std_error(dir->i_sb, err);
				return err;
			}
			if (pde)
				pde->rec_len = ext4_rec_len_to_disk(
					ext4_rec_len_from_disk(pde->rec_len,
							       blocksize) +
					ext4_rec_len_from_disk(de->rec_len,
							       blocksize),
					blocksize);
			else
				de->inode = 0;
			dir->i_version++;
			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
			err = ext4_handle_dirty_metadata(handle, dir, bh);
			if (unlikely(err)) {
				ext4_std_error(dir->i_sb, err);
				return err;
			}
			return 0;
		}
		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
		pde = de;
		de = ext4_next_entry(de, blocksize);
	}
	return -ENOENT;
}

/*
 * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
 * since this indicates that nlinks count was previously 1.
 */
static void ext4_inc_count(handle_t *handle, struct inode *inode)
{
	inc_nlink(inode);
	if (is_dx(inode) && inode->i_nlink > 1) {
		/* limit is 16-bit i_links_count */
		if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
			set_nlink(inode, 1);
			EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
					      EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
		}
	}
}

/*
 * If a directory had nlink == 1, then we should let it be 1. This indicates
 * that the directory has more than EXT4_LINK_MAX subdirectories.
 */
static void ext4_dec_count(handle_t *handle, struct inode *inode)
{
	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
		drop_nlink(inode);
}


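/*
 * Common tail for creating a non-directory inode: add the directory
 * entry, mark the inode dirty and instantiate the dentry.  On failure
 * the new inode's link count is dropped and the inode is released.
 */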
static int ext4_add_nondir(handle_t *handle,
		struct dentry *dentry, struct inode *inode)
{
	int err = ext4_add_entry(handle, dentry, inode);
	if (!err) {
		ext4_mark_inode_dirty(handle, inode);
		d_instantiate(dentry, inode);
		unlock_new_inode(inode);
		return 0;
	}
	drop_nlink(inode);
	unlock_new_inode(inode);
	iput(inode);
	return err;
}

/*
 * By the time this is called, we already have created
 * the directory cache entry for the new file, but it
 * is so far negative - it has no inode.
 *
 * If the create succeeds, we fill in the inode information
 * with d_instantiate().
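 *
 * If the allocation fails with -ENOSPC, the whole operation is retried
 * for as long as ext4_should_retry_alloc() says a retry may succeed.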
 */
static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		       struct nameidata *nd)
{
	handle_t *handle;
	struct inode *inode;
	int err, retries = 0;

	dquot_initialize(dir);

retry:
	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
	err = PTR_ERR(inode);
	if (!IS_ERR(inode)) {
		inode->i_op = &ext4_file_inode_operations;
		inode->i_fop = &ext4_file_operations;
		ext4_set_aops(inode);
		err = ext4_add_nondir(handle, dentry, inode);
	}
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

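/*
 * Create a special inode (device node, FIFO or socket), using the same
 * transaction credits and -ENOSPC retry logic as ext4_create().
 */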
static int ext4_mknod(struct inode *dir, struct dentry *dentry,
		      umode_t mode, dev_t rdev)
{
	handle_t *handle;
	struct inode *inode;
	int err, retries = 0;

	if (!new_valid_dev(rdev))
		return -EINVAL;

	dquot_initialize(dir);

retry:
	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
	err = PTR_ERR(inode);
	if (!IS_ERR(inode)) {
		init_special_inode(inode, inode->i_mode, rdev);
#ifdef CONFIG_EXT4_FS_XATTR
		inode->i_op = &ext4_special_inode_operations;
#endif
		err = ext4_add_nondir(handle, dentry, inode);
	}
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

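/*
 * Create a directory: allocate the inode, write an initial block holding
 * just "." and "..", and only then link the new directory into its parent,
 * so a failure can still back out via the out_clear_inode path.
 */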
static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	handle_t *handle;
	struct inode *inode;
	struct buffer_head *dir_block = NULL;
	struct ext4_dir_entry_2 *de;
	unsigned int blocksize = dir->i_sb->s_blocksize;
	int err, retries = 0;

	if (EXT4_DIR_LINK_MAX(dir))
		return -EMLINK;

	dquot_initialize(dir);

retry:
	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
			       &dentry->d_name, 0, NULL);
	err = PTR_ERR(inode);
	if (IS_ERR(inode))
		goto out_stop;

	inode->i_op = &ext4_dir_inode_operations;
	inode->i_fop = &ext4_dir_operations;
	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
	dir_block = ext4_bread(handle, inode, 0, 1, &err);
	if (!dir_block)
		goto out_clear_inode;
	BUFFER_TRACE(dir_block, "get_write_access");
	err = ext4_journal_get_write_access(handle, dir_block);
	if (err)
		goto out_clear_inode;
	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
	de->inode = cpu_to_le32(inode->i_ino);
	de->name_len = 1;
	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
					   blocksize);
	strcpy(de->name, ".");
	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
	de = ext4_next_entry(de, blocksize);
	de->inode = cpu_to_le32(dir->i_ino);
	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
					   blocksize);
	de->name_len = 2;
	strcpy(de->name, "..");
	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
	set_nlink(inode, 2);
	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, inode, dir_block);
	if (err)
		goto out_clear_inode;
	err = ext4_mark_inode_dirty(handle, inode);
	if (!err)
		err = ext4_add_entry(handle, dentry, inode);
	if (err) {
out_clear_inode:
		clear_nlink(inode);
		unlock_new_inode(inode);
		ext4_mark_inode_dirty(handle, inode);
		iput(inode);
		goto out_stop;
	}
	ext4_inc_count(handle, dir);
	ext4_update_dx_flag(dir);
	err = ext4_mark_inode_dirty(handle, dir);
	if (err)
		goto out_clear_inode;
	d_instantiate(dentry, inode);
	unlock_new_inode(inode);
out_stop:
	brelse(dir_block);
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

/*
 * routine to check that the specified directory is empty (for rmdir)
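 *
 * Returns 1 if only "." and ".." are present (or if the directory cannot
 * be read, in which case the error has already been reported), and 0 as
 * soon as any other live entry is found.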
 */
static int empty_dir(struct inode *inode)
{
	unsigned int offset;
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de, *de1;
	struct super_block *sb;
	int err = 0;

	sb = inode->i_sb;
	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
		if (err)
			EXT4_ERROR_INODE(inode,
				"error %d reading directory lblock 0", err);
		else
			ext4_warning(inode->i_sb,
				     "bad directory (dir #%lu) - no data block",
				     inode->i_ino);
		return 1;
	}
	de = (struct ext4_dir_entry_2 *) bh->b_data;
	de1 = ext4_next_entry(de, sb->s_blocksize);
	if (le32_to_cpu(de->inode) != inode->i_ino ||
			!le32_to_cpu(de1->inode) ||
			strcmp(".", de->name) ||
			strcmp("..", de1->name)) {
		ext4_warning(inode->i_sb,
			     "bad directory (dir #%lu) - no `.' or `..'",
			     inode->i_ino);
		brelse(bh);
		return 1;
	}
	offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
		 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
	de = ext4_next_entry(de1, sb->s_blocksize);
	while (offset < inode->i_size) {
		if (!bh ||
		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
			unsigned int lblock;
			err = 0;
			brelse(bh);
			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
			bh = ext4_bread(NULL, inode, lblock, 0, &err);
			if (!bh) {
				if (err)
					EXT4_ERROR_INODE(inode,
						"error %d reading directory "
						"lblock %u", err, lblock);
				offset += sb->s_blocksize;
				continue;
			}
			de = (struct ext4_dir_entry_2 *) bh->b_data;
		}
		if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
			de = (struct ext4_dir_entry_2 *)(bh->b_data +
							 sb->s_blocksize);
			offset = (offset | (sb->s_blocksize - 1)) + 1;
			continue;
		}
		if (le32_to_cpu(de->inode)) {
			brelse(bh);
			return 0;
		}
		offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
		de = ext4_next_entry(de, sb->s_blocksize);
	}
	brelse(bh);
	return 1;
}

/* ext4_orphan_add() links an unlinked or truncated inode into a list of
 * such inodes, starting at the superblock, in case we crash before the
 * file is closed/deleted, or in case the inode truncate spans multiple
 * transactions and the last transaction is not recovered after a crash.
 *
 * At filesystem recovery time, we walk this list deleting unlinked
 * inodes and truncating linked inodes in ext4_orphan_cleanup().
 */
int ext4_orphan_add(handle_t *handle, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct ext4_iloc iloc;
	int err = 0, rc;

	if (!ext4_handle_valid(handle))
		return 0;

	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
	if (!list_empty(&EXT4_I(inode)->i_orphan))
		goto out_unlock;

	/*
	 * Orphan handling is only valid for files with data blocks
	 * being truncated, or files being unlinked. Note that we either
	 * hold i_mutex, or the inode can not be referenced from outside,
	 * so i_nlink should not be bumped due to race
	 */
	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);

	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
	if (err)
		goto out_unlock;

	err = ext4_reserve_inode_write(handle, inode, &iloc);
	if (err)
		goto out_unlock;
	/*
	 * Due to previous errors inode may be already a part of on-disk
	 * orphan list. If so skip on-disk list modification.
	 */
	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
			goto mem_insert;

	/* Insert this inode at the head of the on-disk orphan list... */
	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
	err = ext4_handle_dirty_super_now(handle, sb);
	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
	if (!err)
		err = rc;

	/* Only add to the head of the in-memory list if all the
	 * previous operations succeeded.  If the orphan_add is going to
	 * fail (possibly taking the journal offline), we can't risk
	 * leaving the inode on the orphan list: stray orphan-list
	 * entries can cause panics at unmount time.
	 *
	 * This is safe: on error we're going to ignore the orphan list
	 * anyway on the next recovery. */
mem_insert:
	if (!err)
		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);

	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
	jbd_debug(4, "orphan inode %lu will point to %d\n",
			inode->i_ino, NEXT_ORPHAN(inode));
out_unlock:
	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
	ext4_std_error(inode->i_sb, err);
	return err;
}

/*
 * ext4_orphan_del() removes an unlinked or truncated inode from the list
 * of such inodes stored on disk, because it is finally being cleaned up.
 */
int ext4_orphan_del(handle_t *handle, struct inode *inode)
{
	struct list_head *prev;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi;
	__u32 ino_next;
	struct ext4_iloc iloc;
	int err = 0;

	/* ext4_handle_valid() assumes a valid handle_t pointer */
	if (handle && !ext4_handle_valid(handle))
		return 0;

	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
	if (list_empty(&ei->i_orphan))
		goto out;

	ino_next = NEXT_ORPHAN(inode);
	prev = ei->i_orphan.prev;
	sbi = EXT4_SB(inode->i_sb);

	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);

	list_del_init(&ei->i_orphan);

	/* If we're on an error path, we may not have a valid
	 * transaction handle with which to update the orphan list on
	 * disk, but we still need to remove the inode from the linked
	 * list in memory. */
	if (sbi->s_journal && !handle)
		goto out;

	err = ext4_reserve_inode_write(handle, inode, &iloc);
	if (err)
		goto out_err;

	if (prev == &sbi->s_orphan) {
		jbd_debug(4, "superblock will point to %u\n", ino_next);
		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
		if (err)
			goto out_brelse;
		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
		err = ext4_handle_dirty_super_now(handle, inode->i_sb);
	} else {
		struct ext4_iloc iloc2;
		struct inode *i_prev =
			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;

		jbd_debug(4, "orphan inode %lu will point to %u\n",
			  i_prev->i_ino, ino_next);
		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
		if (err)
			goto out_brelse;
		NEXT_ORPHAN(i_prev) = ino_next;
		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
	}
	if (err)
		goto out_brelse;
	NEXT_ORPHAN(inode) = 0;
	err = ext4_mark_iloc_dirty(handle, inode, &iloc);

out_err:
	ext4_std_error(inode->i_sb, err);
out:
	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
	return err;

out_brelse:
	brelse(iloc.bh);
	goto out_err;
}

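/*
 * Remove a directory: the victim must be empty.  The entry is deleted
 * from the parent and the inode, now with i_nlink == 0 and i_size == 0,
 * is put on the orphan list until the last reference goes away.
 */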
static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
{
	int retval;
	struct inode *inode;
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de;
	handle_t *handle;

	/* Initialize quotas before so that eventual writes go in
	 * separate transaction */
	dquot_initialize(dir);
	dquot_initialize(dentry->d_inode);

	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	retval = -ENOENT;
	bh = ext4_find_entry(dir, &dentry->d_name, &de);
	if (!bh)
		goto end_rmdir;

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = dentry->d_inode;

	retval = -EIO;
	if (le32_to_cpu(de->inode) != inode->i_ino)
		goto end_rmdir;

	retval = -ENOTEMPTY;
	if (!empty_dir(inode))
		goto end_rmdir;

	retval = ext4_delete_entry(handle, dir, de, bh);
	if (retval)
		goto end_rmdir;
	if (!EXT4_DIR_LINK_EMPTY(inode))
		ext4_warning(inode->i_sb,
			     "empty directory has too many links (%d)",
			     inode->i_nlink);
	inode->i_version++;
	clear_nlink(inode);
	/* There's no need to set i_disksize: the fact that i_nlink is
	 * zero will ensure that the right thing happens during any
	 * recovery. */
	inode->i_size = 0;
	ext4_orphan_add(handle, inode);
	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);
	ext4_dec_count(handle, dir);
	ext4_update_dx_flag(dir);
	ext4_mark_inode_dirty(handle, dir);

end_rmdir:
	ext4_journal_stop(handle);
	brelse(bh);
	return retval;
}

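/*
 * Remove a non-directory entry.  Once the inode's link count reaches
 * zero it is put on the orphan list, so that crash recovery can finish
 * freeing it if we die before the last reference is dropped.
 */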
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
	int retval;
	struct inode *inode;
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de;
	handle_t *handle;

	trace_ext4_unlink_enter(dir, dentry);
	/* Initialize quotas before so that eventual writes go
	 * in separate transaction */
	dquot_initialize(dir);
	dquot_initialize(dentry->d_inode);

	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	retval = -ENOENT;
	bh = ext4_find_entry(dir, &dentry->d_name, &de);
	if (!bh)
		goto end_unlink;

	inode = dentry->d_inode;

	retval = -EIO;
	if (le32_to_cpu(de->inode) != inode->i_ino)
		goto end_unlink;

	if (!inode->i_nlink) {
		ext4_warning(inode->i_sb,
			     "Deleting nonexistent file (%lu), %d",
			     inode->i_ino, inode->i_nlink);
		set_nlink(inode, 1);
	}
	retval = ext4_delete_entry(handle, dir, de, bh);
	if (retval)
		goto end_unlink;
	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
	ext4_update_dx_flag(dir);
	ext4_mark_inode_dirty(handle, dir);
	drop_nlink(inode);
	if (!inode->i_nlink)
		ext4_orphan_add(handle, inode);
	inode->i_ctime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);
	retval = 0;

end_unlink:
	ext4_journal_stop(handle);
	brelse(bh);
	trace_ext4_unlink_exit(dentry, retval);
	return retval;
}

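/*
 * Create a symlink.  Short targets are stored directly in the inode's
 * i_data ("fast" symlinks); longer targets are written through
 * __page_symlink() in a second transaction, with the inode parked on the
 * orphan list in between so a crash cannot leak it.
 */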
static int ext4_symlink(struct inode *dir,
			struct dentry *dentry, const char *symname)
{
	handle_t *handle;
	struct inode *inode;
	int l, err, retries = 0;
	int credits;

	l = strlen(symname)+1;
	if (l > dir->i_sb->s_blocksize)
		return -ENAMETOOLONG;

	dquot_initialize(dir);

	if (l > EXT4_N_BLOCKS * 4) {
		/*
		 * For non-fast symlinks, we just allocate inode and put it on
		 * orphan list in the first transaction => we need bitmap,
		 * group descriptor, sb, inode block, quota blocks, and
		 * possibly selinux xattr blocks.
		 */
		credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
			  EXT4_XATTR_TRANS_BLOCKS;
	} else {
		/*
		 * Fast symlink. We have to add entry to directory
		 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
		 * allocate new inode (bitmap, group descriptor, inode block,
		 * quota blocks, sb is already counted in previous macros).
		 */
		credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
			  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
	}
retry:
	handle = ext4_journal_start(dir, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
			       &dentry->d_name, 0, NULL);
	err = PTR_ERR(inode);
	if (IS_ERR(inode))
		goto out_stop;

	if (l > EXT4_N_BLOCKS * 4) {
		inode->i_op = &ext4_symlink_inode_operations;
		ext4_set_aops(inode);
		/*
		 * We cannot call page_symlink() with transaction started
		 * because it calls into ext4_write_begin() which can wait
		 * for transaction commit if we are running out of space
		 * and thus we deadlock. So we have to stop transaction now
		 * and restart it when the symlink contents have been written.
		 *
		 * To keep fs consistent in case of crash, we have to put inode
		 * to orphan list in the meantime.
		 */
		drop_nlink(inode);
		err = ext4_orphan_add(handle, inode);
		ext4_journal_stop(handle);
		if (err)
			goto err_drop_inode;
		err = __page_symlink(inode, symname, l, 1);
		if (err)
			goto err_drop_inode;
		/*
		 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
		 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
		 */
		handle = ext4_journal_start(dir,
				EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
				EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
		if (IS_ERR(handle)) {
			err = PTR_ERR(handle);
			goto err_drop_inode;
		}
		set_nlink(inode, 1);
		err = ext4_orphan_del(handle, inode);
		if (err) {
			ext4_journal_stop(handle);
			clear_nlink(inode);
			goto err_drop_inode;
		}
	} else {
		/* clear the extent format for fast symlink */
		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
		inode->i_op = &ext4_fast_symlink_inode_operations;
		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
		inode->i_size = l-1;
	}
	EXT4_I(inode)->i_disksize = inode->i_size;
	err = ext4_add_nondir(handle, dentry, inode);
out_stop:
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
err_drop_inode:
	unlock_new_inode(inode);
	iput(inode);
	return err;
}

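/*
 * Create a hard link, subject to the EXT4_LINK_MAX limit.
 */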
static int ext4_link(struct dentry *old_dentry,
		     struct inode *dir, struct dentry *dentry)
{
	handle_t *handle;
	struct inode *inode = old_dentry->d_inode;
	int err, retries = 0;

	if (inode->i_nlink >= EXT4_LINK_MAX)
		return -EMLINK;

	dquot_initialize(dir);

retry:
	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		ext4_handle_sync(handle);

	inode->i_ctime = ext4_current_time(inode);
	ext4_inc_count(handle, inode);
	ihold(inode);

	err = ext4_add_entry(handle, dentry, inode);
	if (!err) {
		ext4_mark_inode_dirty(handle, inode);
		d_instantiate(dentry, inode);
	} else {
		drop_nlink(inode);
		iput(inode);
	}
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

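/*
 * PARENT_INO() expands to the inode field of the ".." entry (the second
 * entry) in a directory block; ext4_rename() reads it to validate the old
 * parent and writes it to repoint a moved directory at its new parent.
 */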
#define PARENT_INO(buffer, size) \
	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)

/*
 * Anybody can rename anything with this: the permission checks are left to the
 * higher-level routines.
 */
static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
		       struct inode *new_dir, struct dentry *new_dentry)
{
	handle_t *handle;
	struct inode *old_inode, *new_inode;
	struct buffer_head *old_bh, *new_bh, *dir_bh;
	struct ext4_dir_entry_2 *old_de, *new_de;
	int retval, force_da_alloc = 0;

	dquot_initialize(old_dir);
	dquot_initialize(new_dir);

	old_bh = new_bh = dir_bh = NULL;

	/* Initialize quotas before so that eventual writes go
	 * in separate transaction */
	if (new_dentry->d_inode)
		dquot_initialize(new_dentry->d_inode);
	handle = ext4_journal_start(old_dir, 2 *
					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
		ext4_handle_sync(handle);

	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
	/*
	 *  The check for the inode number is _not_ due to possible IO errors.
	 *  We might rmdir the source, keep it as pwd of some process
	 *  and merrily kill the link to whatever was created under the
	 *  same name. Goodbye sticky bit ;-<
	 */
	old_inode = old_dentry->d_inode;
	retval = -ENOENT;
	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
		goto end_rename;

	new_inode = new_dentry->d_inode;
	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
	if (new_bh) {
		if (!new_inode) {
			brelse(new_bh);
			new_bh = NULL;
		}
	}
	if (S_ISDIR(old_inode->i_mode)) {
		if (new_inode) {
			retval = -ENOTEMPTY;
			if (!empty_dir(new_inode))
				goto end_rename;
		}
		retval = -EIO;
		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
		if (!dir_bh)
			goto end_rename;
		if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
				old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
			goto end_rename;
		retval = -EMLINK;
		if (!new_inode && new_dir != old_dir &&
		    EXT4_DIR_LINK_MAX(new_dir))
			goto end_rename;
		BUFFER_TRACE(dir_bh, "get_write_access");
		retval = ext4_journal_get_write_access(handle, dir_bh);
		if (retval)
			goto end_rename;
	}
	if (!new_bh) {
		retval = ext4_add_entry(handle, new_dentry, old_inode);
		if (retval)
			goto end_rename;
	} else {
		BUFFER_TRACE(new_bh, "get write access");
		retval = ext4_journal_get_write_access(handle, new_bh);
		if (retval)
			goto end_rename;
		new_de->inode = cpu_to_le32(old_inode->i_ino);
		if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
					      EXT4_FEATURE_INCOMPAT_FILETYPE))
			new_de->file_type = old_de->file_type;
		new_dir->i_version++;
		new_dir->i_ctime = new_dir->i_mtime =
					ext4_current_time(new_dir);
		ext4_mark_inode_dirty(handle, new_dir);
		BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
		retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
		if (unlikely(retval)) {
			ext4_std_error(new_dir->i_sb, retval);
			goto end_rename;
		}
		brelse(new_bh);
		new_bh = NULL;
	}

	/*
	 * Like most other Unix systems, set the ctime for inodes on a
	 * rename.
	 */
	old_inode->i_ctime = ext4_current_time(old_inode);
	ext4_mark_inode_dirty(handle, old_inode);

	/*
	 * ok, that's it
	 */
	if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
	    old_de->name_len != old_dentry->d_name.len ||
	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
	    (retval = ext4_delete_entry(handle, old_dir,
					old_de, old_bh)) == -ENOENT) {
		/* old_de could have moved from under us during htree split, so
		 * make sure that we are deleting the right entry.  We might
		 * also be pointing to a stale entry in the unused part of
		 * old_bh so just checking inum and the name isn't enough. */
		struct buffer_head *old_bh2;
		struct ext4_dir_entry_2 *old_de2;

		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
		if (old_bh2) {
			retval = ext4_delete_entry(handle, old_dir,
						   old_de2, old_bh2);
			brelse(old_bh2);
		}
	}
	if (retval) {
		ext4_warning(old_dir->i_sb,
				"Deleting old file (%lu), %d, error=%d",
				old_dir->i_ino, old_dir->i_nlink, retval);
	}

	if (new_inode) {
		ext4_dec_count(handle, new_inode);
		new_inode->i_ctime = ext4_current_time(new_inode);
	}
	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
	ext4_update_dx_flag(old_dir);
	if (dir_bh) {
		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
						cpu_to_le32(new_dir->i_ino);
		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
		retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
		if (retval) {
			ext4_std_error(old_dir->i_sb, retval);
			goto end_rename;
		}
		ext4_dec_count(handle, old_dir);
		if (new_inode) {
			/* checked empty_dir above, can't have another parent,
			 * ext4_dec_count() won't work for many-linked dirs */
			clear_nlink(new_inode);
		} else {
			ext4_inc_count(handle, new_dir);
			ext4_update_dx_flag(new_dir);
			ext4_mark_inode_dirty(handle, new_dir);
		}
	}
	ext4_mark_inode_dirty(handle, old_dir);
	if (new_inode) {
		ext4_mark_inode_dirty(handle, new_inode);
		if (!new_inode->i_nlink)
			ext4_orphan_add(handle, new_inode);
		if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
			force_da_alloc = 1;
	}
	retval = 0;

end_rename:
	brelse(dir_bh);
	brelse(old_bh);
	brelse(new_bh);
	ext4_journal_stop(handle);
	if (retval == 0 && force_da_alloc)
		ext4_alloc_da_blocks(old_inode);
	return retval;
}

/*
 * directories can handle most operations...
 */
const struct inode_operations ext4_dir_inode_operations = {
	.create		= ext4_create,
	.lookup		= ext4_lookup,
	.link		= ext4_link,
	.unlink		= ext4_unlink,
	.symlink	= ext4_symlink,
	.mkdir		= ext4_mkdir,
	.rmdir		= ext4_rmdir,
	.mknod		= ext4_mknod,
	.rename		= ext4_rename,
	.setattr	= ext4_setattr,
#ifdef CONFIG_EXT4_FS_XATTR
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.listxattr	= ext4_listxattr,
	.removexattr	= generic_removexattr,
#endif
	.get_acl	= ext4_get_acl,
	.fiemap         = ext4_fiemap,
};

const struct inode_operations ext4_special_inode_operations = {
	.setattr	= ext4_setattr,
#ifdef CONFIG_EXT4_FS_XATTR
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.listxattr	= ext4_listxattr,
	.removexattr	= generic_removexattr,
#endif
	.get_acl	= ext4_get_acl,
};