dir.c 40.0 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2
#include <linux/ceph/ceph_debug.h>
S
Sage Weil 已提交
3 4 5 6

#include <linux/spinlock.h>
#include <linux/fs_struct.h>
#include <linux/namei.h>
7
#include <linux/slab.h>
S
Sage Weil 已提交
8
#include <linux/sched.h>
9
#include <linux/xattr.h>
S
Sage Weil 已提交
10 11

#include "super.h"
12
#include "mds_client.h"
S
Sage Weil 已提交
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30

/*
 * Directory operations: readdir, lookup, create, link, unlink,
 * rename, etc.
 */

/*
 * Ceph MDS operations are specified in terms of a base ino and
 * relative path.  Thus, the client can specify an operation on a
 * specific inode (e.g., a getattr due to fstat(2)), or as a path
 * relative to, say, the root directory.
 *
 * Normally, we limit ourselves to strict inode ops (no path component)
 * or dentry operations (a single path component relative to an ino).  The
 * exception to this is open_root_dentry(), which will open the mount
 * point by name.
 */

S
Sage Weil 已提交
31
const struct dentry_operations ceph_dentry_ops;
S
Sage Weil 已提交
32 33 34 35

/*
 * Initialize ceph dentry state.
 */
A
Al Viro 已提交
36
static int ceph_d_init(struct dentry *dentry)
S
Sage Weil 已提交
37 38 39
{
	struct ceph_dentry_info *di;

G
Geliang Tang 已提交
40
	di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
S
Sage Weil 已提交
41 42 43 44 45
	if (!di)
		return -ENOMEM;          /* oh well */

	di->dentry = dentry;
	di->lease_session = NULL;
M
Miklos Szeredi 已提交
46
	di->time = jiffies;
47
	dentry->d_fsdata = di;
S
Sage Weil 已提交
48 49 50 51 52
	ceph_dentry_lru_add(dentry);
	return 0;
}

/*
53 54 55 56 57 58
 * for f_pos for readdir:
 * - hash order:
 *	(0xff << 52) | ((24 bits hash) << 28) |
 *	(the nth entry has hash collision);
 * - frag+name order;
 *	((frag value) << 28) | (the nth entry in frag);
S
Sage Weil 已提交
59
 */
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
#define OFFSET_BITS	28
#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))
/*
 * Build a readdir file position.
 *
 * @high:       value placed above the low OFFSET_BITS bits (a frag in
 *              frag+name order, a hash value in hash order)
 * @off:        entry offset, placed in the low bits (not masked here)
 * @hash_order: when true, the HASH_ORDER marker bits are OR-ed in
 *
 * The shift uses OFFSET_BITS so the encoder stays in sync with
 * fpos_frag()/fpos_off(), which decode with the same constant.
 */
loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
{
	loff_t fpos = ((loff_t)high << OFFSET_BITS) | (loff_t)off;

	if (hash_order)
		fpos |= HASH_ORDER;
	return fpos;
}

/* True when every HASH_ORDER marker bit is set in @p. */
static bool is_hash_order(loff_t p)
{
	/* no marker bit may be missing from p */
	return (~p & HASH_ORDER) == 0;
}

S
Sage Weil 已提交
76 77
/* Extract everything above the low OFFSET_BITS bits of @p, truncated to unsigned. */
static unsigned fpos_frag(loff_t p)
{
	return (unsigned)(p >> OFFSET_BITS);
}
80 81 82 83 84 85

/* Hash part of @p: the frag bits of the position passed through ceph_frag_value(). */
static unsigned fpos_hash(loff_t p)
{
	unsigned frag_bits = fpos_frag(p);

	return ceph_frag_value(frag_bits);
}

S
Sage Weil 已提交
86 87
/* Extract the low OFFSET_BITS bits of @p (the entry offset). */
static unsigned fpos_off(loff_t p)
{
	return (unsigned)(p & OFFSET_MASK);
}

Y
Yan, Zheng 已提交
91 92 93 94 95 96 97 98
/*
 * Order two file positions: first by frag (ceph_frag_compare()), then
 * by the entry offset within the frag.  Returns <0, 0 or >0.
 */
static int fpos_cmp(loff_t l, loff_t r)
{
	int frag_order = ceph_frag_compare(fpos_frag(l), fpos_frag(r));

	if (frag_order != 0)
		return frag_order;
	return (int)(fpos_off(l) - fpos_off(r));
}

Y
Yan, Zheng 已提交
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
/*
 * make note of the last dentry we read, so we can
 * continue at the same lexicographical point,
 * regardless of what dir changes take place on the
 * server.
 */
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
		            int len, unsigned next_offset)
{
	char *buf = kmalloc(len+1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	kfree(fi->last_name);
	fi->last_name = buf;
	memcpy(fi->last_name, name, len);
	fi->last_name[len] = 0;
	fi->next_offset = next_offset;
	dout("note_last_dentry '%s'\n", fi->last_name);
	return 0;
}

120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163

/*
 * Fetch entry @idx of the readdir cache of @parent and take a reference
 * on it.
 *
 * The cache is read as an array of dentry pointers kept in the page
 * cache of the directory inode (dir->i_data); the directory's i_size
 * bounds the valid part of that array.  @cache_ctl remembers the page
 * that is currently mapped so that lookups hitting the same page do
 * not look it up and map it again.
 *
 * Returns:
 *   NULL              - @idx lies at or beyond i_size (end of cache)
 *   ERR_PTR(-EAGAIN)  - the page is missing, the directory is no longer
 *                       complete+ordered, the slot is empty, or the
 *                       dentry is already dead
 *   dentry            - with a reference obtained via lockref_get_not_dead()
 */
static struct dentry *
__dcache_find_get_entry(struct dentry *parent, u64 idx,
			struct ceph_readdir_cache_control *cache_ctl)
{
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	/* mask selecting the slot index within one page of pointers */
	unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
	/* byte offset of slot @idx, and the page that holds it */
	loff_t ptr_pos = idx * sizeof(struct dentry *);
	pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;

	if (ptr_pos >= i_size_read(dir))
		return NULL;

	/* (re)map the page only when we moved to a different one */
	if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
		ceph_readdir_cache_release(cache_ctl);
		cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
		if (!cache_ctl->page) {
			dout(" page %lu not found\n", ptr_pgoff);
			return ERR_PTR(-EAGAIN);
		}
		/* reading/filling the cache are serialized by
		   i_mutex, no need to use page lock */
		unlock_page(cache_ctl->page);
		cache_ctl->dentries = kmap(cache_ctl->page);
	}

	cache_ctl->index = idx & idx_mask;

	rcu_read_lock();
	spin_lock(&parent->d_lock);
	/* check i_size again here, because empty directory can be
	 * marked as complete while not holding the i_mutex. */
	if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
		dentry = cache_ctl->dentries[cache_ctl->index];
	else
		dentry = NULL;
	spin_unlock(&parent->d_lock);
	/* only hand out the dentry if a reference can still be taken */
	if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
		dentry = NULL;
	rcu_read_unlock();
	return dentry ? : ERR_PTR(-EAGAIN);
}

S
Sage Weil 已提交
164 165 166
/*
 * When possible, we try to satisfy a readdir by peeking at the
 * dcache.  We make this work by carefully ordering dentries on
167
 * d_child when we initially get results back from the MDS, and
S
Sage Weil 已提交
168 169 170
 * falling back to a "normal" sync readdir if any dentries in the dir
 * are dropped.
 *
171
 * Complete dir indicates that we have all dentries in the dir.  It is
S
Sage Weil 已提交
172 173 174
 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
 * the MDS if/when the directory is modified).
 */
175
static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
176
			    int shared_gen)
S
Sage Weil 已提交
177
{
A
Al Viro 已提交
178
	struct ceph_file_info *fi = file->private_data;
A
Al Viro 已提交
179
	struct dentry *parent = file->f_path.dentry;
180
	struct inode *dir = d_inode(parent);
Y
Yan, Zheng 已提交
181
	struct dentry *dentry, *last = NULL;
S
Sage Weil 已提交
182
	struct ceph_dentry_info *di;
Y
Yan, Zheng 已提交
183
	struct ceph_readdir_cache_control cache_ctl = {};
184 185
	u64 idx = 0;
	int err = 0;
S
Sage Weil 已提交
186

187
	dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos);
S
Sage Weil 已提交
188

189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
	/* search start position */
	if (ctx->pos > 2) {
		u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
		while (count > 0) {
			u64 step = count >> 1;
			dentry = __dcache_find_get_entry(parent, idx + step,
							 &cache_ctl);
			if (!dentry) {
				/* use linar search */
				idx = 0;
				break;
			}
			if (IS_ERR(dentry)) {
				err = PTR_ERR(dentry);
				goto out;
			}
			di = ceph_dentry(dentry);
			spin_lock(&dentry->d_lock);
			if (fpos_cmp(di->offset, ctx->pos) < 0) {
				idx += step + 1;
				count -= step + 1;
			} else {
				count = step;
			}
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
S
Sage Weil 已提交
216

217
		dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
S
Sage Weil 已提交
218 219
	}

Y
Yan, Zheng 已提交
220

221 222 223 224
	for (;;) {
		bool emit_dentry = false;
		dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
		if (!dentry) {
225
			fi->flags |= CEPH_F_ATEND;
Y
Yan, Zheng 已提交
226 227
			err = 0;
			break;
S
Sage Weil 已提交
228
		}
229 230 231
		if (IS_ERR(dentry)) {
			err = PTR_ERR(dentry);
			goto out;
Y
Yan, Zheng 已提交
232 233 234
		}

		spin_lock(&dentry->d_lock);
235 236 237 238 239 240 241 242 243 244
		di = ceph_dentry(dentry);
		if (d_unhashed(dentry) ||
		    d_really_is_negative(dentry) ||
		    di->lease_shared_gen != shared_gen) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
			err = -EAGAIN;
			goto out;
		}
		if (fpos_cmp(ctx->pos, di->offset) <= 0) {
Y
Yan, Zheng 已提交
245 246
			emit_dentry = true;
		}
N
Nick Piggin 已提交
247
		spin_unlock(&dentry->d_lock);
S
Sage Weil 已提交
248

Y
Yan, Zheng 已提交
249
		if (emit_dentry) {
250
			dout(" %llx dentry %p %pd %p\n", di->offset,
Y
Yan, Zheng 已提交
251 252 253 254 255 256 257 258 259 260 261 262
			     dentry, dentry, d_inode(dentry));
			ctx->pos = di->offset;
			if (!dir_emit(ctx, dentry->d_name.name,
				      dentry->d_name.len,
				      ceph_translate_ino(dentry->d_sb,
							 d_inode(dentry)->i_ino),
				      d_inode(dentry)->i_mode >> 12)) {
				dput(dentry);
				err = 0;
				break;
			}
			ctx->pos++;
263

Y
Yan, Zheng 已提交
264 265 266 267 268
			if (last)
				dput(last);
			last = dentry;
		} else {
			dput(dentry);
S
Sage Weil 已提交
269
		}
Y
Yan, Zheng 已提交
270
	}
271
out:
Y
Yan, Zheng 已提交
272 273 274 275 276 277 278 279
	ceph_readdir_cache_release(&cache_ctl);
	if (last) {
		int ret;
		di = ceph_dentry(last);
		ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
				       fpos_off(di->offset) + 1);
		if (ret < 0)
			err = ret;
S
Sage Weil 已提交
280
		dput(last);
281 282 283 284 285
		/* last_name no longer match cache index */
		if (fi->readdir_cache_idx >= 0) {
			fi->readdir_cache_idx = -1;
			fi->dir_release_count = 0;
		}
Y
Yan, Zheng 已提交
286
	}
S
Sage Weil 已提交
287 288 289
	return err;
}

290 291 292 293 294 295 296 297 298 299
/*
 * Decide whether a new readdir request must be sent for position @pos:
 * true when no reply is buffered, or when the buffered frag does not
 * cover @pos (by hash containment in hash order, by frag equality
 * otherwise).
 */
static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
{
	if (fi->last_readdir == NULL)
		return true;
	if (!is_hash_order(pos))
		return fi->frag != fpos_frag(pos);
	return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
}

A
Al Viro 已提交
300
static int ceph_readdir(struct file *file, struct dir_context *ctx)
S
Sage Weil 已提交
301
{
A
Al Viro 已提交
302 303
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file_inode(file);
S
Sage Weil 已提交
304
	struct ceph_inode_info *ci = ceph_inode(inode);
305 306
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = fsc->mdsc;
307
	int i;
S
Sage Weil 已提交
308
	int err;
309
	unsigned frag = -1;
S
Sage Weil 已提交
310 311
	struct ceph_mds_reply_info_parsed *rinfo;

312
	dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
313
	if (fi->flags & CEPH_F_ATEND)
S
Sage Weil 已提交
314 315 316
		return 0;

	/* always start with . and .. */
A
Al Viro 已提交
317
	if (ctx->pos == 0) {
S
Sage Weil 已提交
318
		dout("readdir off 0 -> '.'\n");
A
Al Viro 已提交
319
		if (!dir_emit(ctx, ".", 1, 
Y
Yehuda Sadeh 已提交
320
			    ceph_translate_ino(inode->i_sb, inode->i_ino),
A
Al Viro 已提交
321
			    inode->i_mode >> 12))
S
Sage Weil 已提交
322
			return 0;
A
Al Viro 已提交
323
		ctx->pos = 1;
S
Sage Weil 已提交
324
	}
A
Al Viro 已提交
325
	if (ctx->pos == 1) {
A
Al Viro 已提交
326
		ino_t ino = parent_ino(file->f_path.dentry);
S
Sage Weil 已提交
327
		dout("readdir off 1 -> '..'\n");
A
Al Viro 已提交
328
		if (!dir_emit(ctx, "..", 2,
Y
Yehuda Sadeh 已提交
329
			    ceph_translate_ino(inode->i_sb, ino),
A
Al Viro 已提交
330
			    inode->i_mode >> 12))
S
Sage Weil 已提交
331
			return 0;
A
Al Viro 已提交
332
		ctx->pos = 2;
S
Sage Weil 已提交
333 334 335
	}

	/* can we use the dcache? */
336
	spin_lock(&ci->i_ceph_lock);
Y
Yan, Zheng 已提交
337
	if (ceph_test_mount_opt(fsc, DCACHE) &&
338
	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
339
	    ceph_snap(inode) != CEPH_SNAPDIR &&
340
	    __ceph_dir_is_complete_ordered(ci) &&
S
Sage Weil 已提交
341
	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
342
		int shared_gen = atomic_read(&ci->i_shared_gen);
343
		spin_unlock(&ci->i_ceph_lock);
344
		err = __dcache_readdir(file, ctx, shared_gen);
345
		if (err != -EAGAIN)
S
Sage Weil 已提交
346
			return err;
347
	} else {
348
		spin_unlock(&ci->i_ceph_lock);
S
Sage Weil 已提交
349 350 351 352 353
	}

	/* proceed with a normal readdir */
more:
	/* do we have the correct frag content buffered? */
354
	if (need_send_readdir(fi, ctx->pos)) {
S
Sage Weil 已提交
355 356 357 358 359
		struct ceph_mds_request *req;
		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

		/* discard old result, if any */
360
		if (fi->last_readdir) {
S
Sage Weil 已提交
361
			ceph_mdsc_put_request(fi->last_readdir);
362 363
			fi->last_readdir = NULL;
		}
S
Sage Weil 已提交
364

365
		if (is_hash_order(ctx->pos)) {
366 367 368 369 370
			/* fragtree isn't always accurate. choose frag
			 * based on previous reply when possible. */
			if (frag == (unsigned)-1)
				frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
							NULL, NULL);
371 372 373 374
		} else {
			frag = fpos_frag(ctx->pos);
		}

S
Sage Weil 已提交
375 376 377 378 379
		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
		     ceph_vinop(inode), frag, fi->last_name);
		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
		if (IS_ERR(req))
			return PTR_ERR(req);
380 381 382 383 384
		err = ceph_alloc_readdir_reply_buffer(req, inode);
		if (err) {
			ceph_mdsc_put_request(req);
			return err;
		}
S
Sage Weil 已提交
385 386
		/* hints to request -> mds selection code */
		req->r_direct_mode = USE_AUTH_MDS;
387 388 389
		if (op == CEPH_MDS_OP_READDIR) {
			req->r_direct_hash = ceph_frag_value(frag);
			__set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
390
			req->r_inode_drop = CEPH_CAP_FILE_EXCL;
391
		}
392
		if (fi->last_name) {
393
			req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
394 395 396 397
			if (!req->r_path2) {
				ceph_mdsc_put_request(req);
				return -ENOMEM;
			}
398 399 400
		} else if (is_hash_order(ctx->pos)) {
			req->r_args.readdir.offset_hash =
				cpu_to_le32(fpos_hash(ctx->pos));
401
		}
402

Y
Yan, Zheng 已提交
403 404 405
		req->r_dir_release_cnt = fi->dir_release_count;
		req->r_dir_ordered_cnt = fi->dir_ordered_count;
		req->r_readdir_cache_idx = fi->readdir_cache_idx;
S
Sage Weil 已提交
406 407
		req->r_readdir_offset = fi->next_offset;
		req->r_args.readdir.frag = cpu_to_le32(frag);
408 409
		req->r_args.readdir.flags =
				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
410 411 412 413

		req->r_inode = inode;
		ihold(inode);
		req->r_dentry = dget(file->f_path.dentry);
S
Sage Weil 已提交
414 415 416 417 418
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err < 0) {
			ceph_mdsc_put_request(req);
			return err;
		}
419 420 421
		dout("readdir got and parsed readdir result=%d on "
		     "frag %x, end=%d, complete=%d, hash_order=%d\n",
		     err, frag,
S
Sage Weil 已提交
422
		     (int)req->r_reply_info.dir_end,
423 424
		     (int)req->r_reply_info.dir_complete,
		     (int)req->r_reply_info.hash_order);
S
Sage Weil 已提交
425

426 427 428
		rinfo = &req->r_reply_info;
		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
			frag = le32_to_cpu(rinfo->dir_dir->frag);
429 430 431 432 433 434 435
			if (!rinfo->hash_order) {
				fi->next_offset = req->r_readdir_offset;
				/* adjust ctx->pos to beginning of frag */
				ctx->pos = ceph_make_fpos(frag,
							  fi->next_offset,
							  false);
			}
436
		}
Y
Yan, Zheng 已提交
437

Y
Yan, Zheng 已提交
438
		fi->frag = frag;
S
Sage Weil 已提交
439 440
		fi->last_readdir = req;

441
		if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
Y
Yan, Zheng 已提交
442 443 444 445
			fi->readdir_cache_idx = req->r_readdir_cache_idx;
			if (fi->readdir_cache_idx < 0) {
				/* preclude from marking dir ordered */
				fi->dir_ordered_count = 0;
446 447
			} else if (ceph_frag_is_leftmost(frag) &&
				   fi->next_offset == 2) {
Y
Yan, Zheng 已提交
448 449 450 451 452 453
				/* note dir version at start of readdir so
				 * we can tell if any dentries get dropped */
				fi->dir_release_count = req->r_dir_release_cnt;
				fi->dir_ordered_count = req->r_dir_ordered_cnt;
			}
		} else {
454
			dout("readdir !did_prepopulate\n");
Y
Yan, Zheng 已提交
455 456 457 458 459 460
			/* disable readdir cache */
			fi->readdir_cache_idx = -1;
			/* preclude from marking dir complete */
			fi->dir_release_count = 0;
		}

461 462
		/* note next offset and last dentry name */
		if (rinfo->dir_nr > 0) {
463 464
			struct ceph_mds_reply_dir_entry *rde =
					rinfo->dir_entries + (rinfo->dir_nr-1);
465 466
			unsigned next_offset = req->r_reply_info.dir_end ?
					2 : (fpos_off(rde->offset) + 1);
467
			err = note_last_dentry(fi, rde->name, rde->name_len,
468
					       next_offset);
S
Sage Weil 已提交
469 470
			if (err)
				return err;
471 472 473
		} else if (req->r_reply_info.dir_end) {
			fi->next_offset = 2;
			/* keep last name */
S
Sage Weil 已提交
474 475 476 477
		}
	}

	rinfo = &fi->last_readdir->r_reply_info;
478
	dout("readdir frag %x num %d pos %llx chunk first %llx\n",
479
	     fi->frag, rinfo->dir_nr, ctx->pos,
480
	     rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
A
Al Viro 已提交
481

482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497
	i = 0;
	/* search start position */
	if (rinfo->dir_nr > 0) {
		int step, nr = rinfo->dir_nr;
		while (nr > 0) {
			step = nr >> 1;
			if (rinfo->dir_entries[i + step].offset < ctx->pos) {
				i +=  step + 1;
				nr -= step + 1;
			} else {
				nr = step;
			}
		}
	}
	for (; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
498 499
		struct ceph_vino vino;
		ino_t ino;
500
		u32 ftype;
501

502 503 504 505 506
		BUG_ON(rde->offset < ctx->pos);

		ctx->pos = rde->offset;
		dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
		     i, rinfo->dir_nr, ctx->pos,
507
		     rde->name_len, rde->name, &rde->inode.in);
508

509 510 511 512
		BUG_ON(!rde->inode.in);
		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
		vino.ino = le64_to_cpu(rde->inode.in->ino);
		vino.snap = le64_to_cpu(rde->inode.in->snapid);
513
		ino = ceph_vino_to_ino(vino);
514

515 516
		if (!dir_emit(ctx, rde->name, rde->name_len,
			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
S
Sage Weil 已提交
517 518 519
			dout("filldir stopping us...\n");
			return 0;
		}
A
Al Viro 已提交
520
		ctx->pos++;
S
Sage Weil 已提交
521 522
	}

523 524 525
	ceph_mdsc_put_request(fi->last_readdir);
	fi->last_readdir = NULL;

526
	if (fi->next_offset > 2) {
527
		frag = fi->frag;
S
Sage Weil 已提交
528 529 530 531
		goto more;
	}

	/* more frags? */
532
	if (!ceph_frag_is_rightmost(fi->frag)) {
533
		frag = ceph_frag_next(fi->frag);
534 535 536 537 538 539 540 541 542 543 544
		if (is_hash_order(ctx->pos)) {
			loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
							fi->next_offset, true);
			if (new_pos > ctx->pos)
				ctx->pos = new_pos;
			/* keep last_name */
		} else {
			ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
			kfree(fi->last_name);
			fi->last_name = NULL;
		}
S
Sage Weil 已提交
545 546 547
		dout("readdir next frag is %x\n", frag);
		goto more;
	}
548
	fi->flags |= CEPH_F_ATEND;
S
Sage Weil 已提交
549 550 551 552 553 554

	/*
	 * if dir_release_count still matches the dir, no dentries
	 * were released during the whole readdir, and we should have
	 * the complete dir contents in our cache.
	 */
Y
Yan, Zheng 已提交
555 556 557
	if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
		spin_lock(&ci->i_ceph_lock);
		if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
558
			dout(" marking %p complete and ordered\n", inode);
Y
Yan, Zheng 已提交
559 560 561 562 563 564
			/* use i_size to track number of entries in
			 * readdir cache */
			BUG_ON(fi->readdir_cache_idx < 0);
			i_size_write(inode, fi->readdir_cache_idx *
				     sizeof(struct dentry*));
		} else {
565
			dout(" marking %p complete\n", inode);
Y
Yan, Zheng 已提交
566
		}
567 568
		__ceph_dir_set_complete(ci, fi->dir_release_count,
					fi->dir_ordered_count);
Y
Yan, Zheng 已提交
569
		spin_unlock(&ci->i_ceph_lock);
S
Sage Weil 已提交
570 571
	}

A
Al Viro 已提交
572
	dout("readdir %p file %p done.\n", inode, file);
S
Sage Weil 已提交
573 574 575
	return 0;
}

576
static void reset_readdir(struct ceph_file_info *fi)
S
Sage Weil 已提交
577 578 579 580 581 582
{
	if (fi->last_readdir) {
		ceph_mdsc_put_request(fi->last_readdir);
		fi->last_readdir = NULL;
	}
	kfree(fi->last_name);
S
Sage Weil 已提交
583
	fi->last_name = NULL;
Y
Yan, Zheng 已提交
584 585
	fi->dir_release_count = 0;
	fi->readdir_cache_idx = -1;
Y
Yan, Zheng 已提交
586
	fi->next_offset = 2;  /* compensate for . and .. */
587
	fi->flags &= ~CEPH_F_ATEND;
S
Sage Weil 已提交
588 589
}

590 591 592 593 594 595 596
/*
 * discard buffered readdir content on seekdir(0), or seek to new frag,
 * or seek prior to current chunk
 */
static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
{
	struct ceph_mds_reply_info_parsed *rinfo;
	loff_t chunk_offset;

	if (new_pos == 0)
		return true;

	/*
	 * no need to reset last_name for a forward seek when dentries
	 * are sorted in hash order, so only frag-order positions are
	 * checked against the buffered frag
	 */
	if (!is_hash_order(new_pos) && fi->frag != fpos_frag(new_pos))
		return true;

	/* nothing buffered, or an empty chunk: nothing to keep */
	if (!fi->last_readdir)
		return true;
	rinfo = &fi->last_readdir->r_reply_info;
	if (!rinfo->dir_nr)
		return true;

	chunk_offset = rinfo->dir_entries[0].offset;
	if (new_pos < chunk_offset)
		return true;
	return is_hash_order(new_pos) != is_hash_order(chunk_offset);
}

614
/*
 * llseek for ceph directories.
 *
 * SEEK_SET and SEEK_CUR are supported; SEEK_END yields -EOPNOTSUPP and
 * any other whence (or a negative resulting offset) yields -EINVAL.
 * Buffered readdir state is dropped or invalidated as required by the
 * new position.  Runs under the inode lock.
 */
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file->f_mapping->host;
	loff_t retval = -EINVAL;

	inode_lock(inode);
	switch (whence) {
	case SEEK_SET:
		break;
	case SEEK_CUR:
		offset += file->f_pos;
		break;
	case SEEK_END:
		retval = -EOPNOTSUPP;
		goto out;
	default:
		goto out;
	}

	if (offset < 0)
		goto out;

	if (need_reset_readdir(fi, offset)) {
		dout("dir_llseek dropping %p content\n", file);
		reset_readdir(fi);
	} else if (is_hash_order(offset) && offset > file->f_pos) {
		/* for hash offset, we don't know if a forward seek
		 * is within same frag */
		fi->dir_release_count = 0;
		fi->readdir_cache_idx = -1;
	}

	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
		fi->flags &= ~CEPH_F_ATEND;
	}
	retval = offset;
out:
	inode_unlock(inode);
	return retval;
}

/*
657
 * Handle lookups for the hidden .snap directory.
S
Sage Weil 已提交
658
 */
659 660
int ceph_handle_snapdir(struct ceph_mds_request *req,
			struct dentry *dentry, int err)
S
Sage Weil 已提交
661
{
662
	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
663
	struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
S
Sage Weil 已提交
664 665 666

	/* .snap dir? */
	if (err == -ENOENT &&
667
	    ceph_snap(parent) == CEPH_NOSNAP &&
668
	    strcmp(dentry->d_name.name,
669
		   fsc->mount_options->snapdir_name) == 0) {
S
Sage Weil 已提交
670
		struct inode *inode = ceph_get_snapdir(parent);
A
Al Viro 已提交
671 672
		dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
		     dentry, dentry, inode);
673
		BUG_ON(!d_unhashed(dentry));
S
Sage Weil 已提交
674 675 676
		d_add(dentry, inode);
		err = 0;
	}
677 678
	return err;
}
S
Sage Weil 已提交
679

680 681 682 683 684 685 686 687 688 689 690 691 692 693
/*
 * Figure out final result of a lookup/open request.
 *
 * Mainly, make sure we return the final req->r_dentry (if it already
 * existed) in place of the original VFS-provided dentry when they
 * differ.
 *
 * Gracefully handle the case where the MDS replies with -ENOENT and
 * no trace (which it may do, at its discretion, e.g., if it doesn't
 * care to issue a lease on the negative dentry).
 */
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
				  struct dentry *dentry, int err)
{
S
Sage Weil 已提交
694 695 696 697 698
	if (err == -ENOENT) {
		/* no trace? */
		err = 0;
		if (!req->r_reply_info.head->is_dentry) {
			dout("ENOENT and no trace, dentry %p inode %p\n",
699 700
			     dentry, d_inode(dentry));
			if (d_really_is_positive(dentry)) {
S
Sage Weil 已提交
701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716
				d_drop(dentry);
				err = -ENOENT;
			} else {
				d_add(dentry, NULL);
			}
		}
	}
	if (err)
		dentry = ERR_PTR(err);
	else if (dentry != req->r_dentry)
		dentry = dget(req->r_dentry);   /* we got spliced */
	else
		dentry = NULL;
	return dentry;
}

717
static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
S
Sage Weil 已提交
718 719 720 721 722
{
	return ceph_ino(inode) == CEPH_INO_ROOT &&
		strncmp(dentry->d_name.name, ".ceph", 5) == 0;
}

S
Sage Weil 已提交
723 724 725 726 727
/*
 * Look up a single dir entry.  If there is a lookup intent, inform
 * the MDS so that it gets our 'caps wanted' value in a single op.
 */
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
A
Al Viro 已提交
728
				  unsigned int flags)
S
Sage Weil 已提交
729
{
730 731
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
S
Sage Weil 已提交
732 733
	struct ceph_mds_request *req;
	int op;
Y
Yan, Zheng 已提交
734
	int mask;
S
Sage Weil 已提交
735 736
	int err;

A
Al Viro 已提交
737 738
	dout("lookup %p dentry %p '%pd'\n",
	     dir, dentry, dentry);
S
Sage Weil 已提交
739 740 741 742 743

	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	/* can we conclude ENOENT locally? */
744
	if (d_really_is_negative(dentry)) {
S
Sage Weil 已提交
745 746 747
		struct ceph_inode_info *ci = ceph_inode(dir);
		struct ceph_dentry_info *di = ceph_dentry(dentry);

748
		spin_lock(&ci->i_ceph_lock);
S
Sage Weil 已提交
749 750
		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
		if (strncmp(dentry->d_name.name,
751
			    fsc->mount_options->snapdir_name,
S
Sage Weil 已提交
752
			    dentry->d_name.len) &&
S
Sage Weil 已提交
753
		    !is_root_ceph_dentry(dir, dentry) &&
754
		    ceph_test_mount_opt(fsc, DCACHE) &&
755
		    __ceph_dir_is_complete(ci) &&
S
Sage Weil 已提交
756
		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
757
			spin_unlock(&ci->i_ceph_lock);
S
Sage Weil 已提交
758 759
			dout(" dir %p complete, -ENOENT\n", dir);
			d_add(dentry, NULL);
760
			di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
S
Sage Weil 已提交
761 762
			return NULL;
		}
763
		spin_unlock(&ci->i_ceph_lock);
S
Sage Weil 已提交
764 765 766 767 768 769
	}

	op = ceph_snap(dir) == CEPH_SNAPDIR ?
		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
	if (IS_ERR(req))
J
Julia Lawall 已提交
770
		return ERR_CAST(req);
S
Sage Weil 已提交
771 772
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
Y
Yan, Zheng 已提交
773 774 775 776 777 778

	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	if (ceph_security_xattr_wanted(dir))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.getattr.mask = cpu_to_le32(mask);

779 780
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
S
Sage Weil 已提交
781
	err = ceph_mdsc_do_request(mdsc, NULL, req);
782
	err = ceph_handle_snapdir(req, dentry, err);
S
Sage Weil 已提交
783 784 785 786 787 788 789 790 791 792 793 794
	dentry = ceph_finish_lookup(req, dentry, err);
	ceph_mdsc_put_request(req);  /* will dput(dentry) */
	dout("lookup result=%p\n", dentry);
	return dentry;
}

/*
 * If we do a create but get no trace back from the MDS, follow up with
 * a lookup (the VFS expects us to link up the provided dentry).
 */
int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
{
A
Al Viro 已提交
795
	struct dentry *result = ceph_lookup(dir, dentry, 0);
S
Sage Weil 已提交
796 797 798 799 800

	if (result && !IS_ERR(result)) {
		/*
		 * We created the item, then did a lookup, and found
		 * it was already linked to another inode we already
801 802 803 804 805 806 807 808
		 * had in our cache (and thus got spliced). To not
		 * confuse VFS (especially when inode is a directory),
		 * we don't link our dentry to that inode, return an
		 * error instead.
		 *
		 * This event should be rare and it happens only when
		 * we talk to old MDS. Recent MDS does not send traceless
		 * reply for request that creates new inode.
S
Sage Weil 已提交
809
		 */
Y
Yan, Zheng 已提交
810
		d_drop(result);
811
		return -ESTALE;
S
Sage Weil 已提交
812 813 814 815 816
	}
	return PTR_ERR(result);
}

static int ceph_mknod(struct inode *dir, struct dentry *dentry,
A
Al Viro 已提交
817
		      umode_t mode, dev_t rdev)
S
Sage Weil 已提交
818
{
819 820
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
S
Sage Weil 已提交
821
	struct ceph_mds_request *req;
822
	struct ceph_acls_info acls = {};
S
Sage Weil 已提交
823 824 825 826 827
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

828 829 830 831
	err = ceph_pre_init_acls(dir, &mode, &acls);
	if (err < 0)
		return err;

A
Al Viro 已提交
832
	dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
S
Sage Weil 已提交
833 834 835
	     dir, dentry, mode, rdev);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
	if (IS_ERR(req)) {
836 837
		err = PTR_ERR(req);
		goto out;
S
Sage Weil 已提交
838 839 840
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
841 842
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
S
Sage Weil 已提交
843 844
	req->r_args.mknod.mode = cpu_to_le32(mode);
	req->r_args.mknod.rdev = cpu_to_le32(rdev);
845
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
S
Sage Weil 已提交
846
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
847 848 849 850
	if (acls.pagelist) {
		req->r_pagelist = acls.pagelist;
		acls.pagelist = NULL;
	}
S
Sage Weil 已提交
851 852 853 854
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
855
out:
G
Guangliang Zhao 已提交
856
	if (!err)
857
		ceph_init_inode_acls(d_inode(dentry), &acls);
858
	else
S
Sage Weil 已提交
859
		d_drop(dentry);
860
	ceph_release_acls_info(&acls);
S
Sage Weil 已提交
861 862 863
	return err;
}

A
Al Viro 已提交
864
static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
A
Al Viro 已提交
865
		       bool excl)
S
Sage Weil 已提交
866
{
867
	return ceph_mknod(dir, dentry, mode, 0);
S
Sage Weil 已提交
868 869 870 871 872
}

static int ceph_symlink(struct inode *dir, struct dentry *dentry,
			    const char *dest)
{
873 874
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
S
Sage Weil 已提交
875 876 877 878 879 880 881 882 883
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
884 885
		err = PTR_ERR(req);
		goto out;
S
Sage Weil 已提交
886
	}
887
	req->r_path2 = kstrdup(dest, GFP_KERNEL);
888 889 890 891 892
	if (!req->r_path2) {
		err = -ENOMEM;
		ceph_mdsc_put_request(req);
		goto out;
	}
893 894
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
895 896
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
897
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
S
Sage Weil 已提交
898 899 900 901 902
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
903 904
out:
	if (err)
S
Sage Weil 已提交
905 906 907 908
		d_drop(dentry);
	return err;
}

909
/*
 * ->mkdir() inode operation.
 *
 * In a live directory (CEPH_NOSNAP) this issues CEPH_MDS_OP_MKDIR; in a
 * snapdir (CEPH_SNAPDIR) it issues CEPH_MDS_OP_MKSNAP instead.  Any
 * other snapshot context yields -EROFS.
 *
 * Returns 0 on success or a negative errno.  On success the ACLs
 * prepared by ceph_pre_init_acls() are applied to the new inode; on
 * failure the dentry is dropped.
 */
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct ceph_acls_info acls = {};
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* mkdir .snap/foo is a MKSNAP */
		op = CEPH_MDS_OP_MKSNAP;
		dout("mksnap dir %p snap '%pd' dn %p\n", dir,
		     dentry, dentry);
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
		op = CEPH_MDS_OP_MKDIR;
	} else {
		goto out;
	}

	mode |= S_IFDIR;
	err = ceph_pre_init_acls(dir, &mode, &acls);
	if (err < 0)
		goto out;

	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_args.mkdir.mode = cpu_to_le32(mode);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	if (acls.pagelist) {
		/* hand the pagelist over to the request */
		req->r_pagelist = acls.pagelist;
		acls.pagelist = NULL;
	}
	err = ceph_mdsc_do_request(mdsc, dir, req);
	/* reply carried neither target nor dentry trace */
	if (!err &&
	    !req->r_reply_info.head->is_target &&
	    !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (!err)
		ceph_init_inode_acls(d_inode(dentry), &acls);
	else
		d_drop(dentry);
	ceph_release_acls_info(&acls);
	return err;
}

static int ceph_link(struct dentry *old_dentry, struct inode *dir,
		     struct dentry *dentry)
{
970 971
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
S
Sage Weil 已提交
972 973 974 975 976 977 978 979 980 981 982 983 984 985 986
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("link in dir %p old_dentry %p dentry %p\n", dir,
	     old_dentry, dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
987
	req->r_old_dentry = dget(old_dentry);
988 989
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
S
Sage Weil 已提交
990 991
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
992
	/* release LINK_SHARED on source inode (mds will lock it) */
993
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
S
Sage Weil 已提交
994
	err = ceph_mdsc_do_request(mdsc, dir, req);
995
	if (err) {
S
Sage Weil 已提交
996
		d_drop(dentry);
997
	} else if (!req->r_reply_info.head->is_dentry) {
998 999
		ihold(d_inode(old_dentry));
		d_instantiate(dentry, d_inode(old_dentry));
1000
	}
S
Sage Weil 已提交
1001 1002 1003 1004 1005 1006 1007 1008 1009
	ceph_mdsc_put_request(req);
	return err;
}

/*
 * rmdir and unlink are differ only by the metadata op code
 */
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
1010 1011
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
1012
	struct inode *inode = d_inode(dentry);
S
Sage Weil 已提交
1013 1014 1015 1016 1017 1018
	struct ceph_mds_request *req;
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* rmdir .snap/foo is RMSNAP */
A
Al Viro 已提交
1019
		dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
S
Sage Weil 已提交
1020 1021 1022 1023
		op = CEPH_MDS_OP_RMSNAP;
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("unlink/rmdir dir %p dn %p inode %p\n",
		     dir, dentry, inode);
1024
		op = d_is_dir(dentry) ?
S
Sage Weil 已提交
1025 1026 1027 1028 1029 1030 1031 1032 1033 1034
			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
	} else
		goto out;
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
1035 1036
	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
S
Sage Weil 已提交
1037 1038
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
1039
	req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
S
Sage Weil 已提交
1040 1041 1042 1043 1044 1045 1046 1047 1048
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		d_delete(dentry);
	ceph_mdsc_put_request(req);
out:
	return err;
}

static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
1049 1050
		       struct inode *new_dir, struct dentry *new_dentry,
		       unsigned int flags)
S
Sage Weil 已提交
1051
{
1052 1053
	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
S
Sage Weil 已提交
1054
	struct ceph_mds_request *req;
Y
Yan, Zheng 已提交
1055
	int op = CEPH_MDS_OP_RENAME;
S
Sage Weil 已提交
1056 1057
	int err;

1058 1059 1060
	if (flags)
		return -EINVAL;

S
Sage Weil 已提交
1061 1062
	if (ceph_snap(old_dir) != ceph_snap(new_dir))
		return -EXDEV;
Y
Yan, Zheng 已提交
1063 1064 1065 1066 1067 1068
	if (ceph_snap(old_dir) != CEPH_NOSNAP) {
		if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
			op = CEPH_MDS_OP_RENAMESNAP;
		else
			return -EROFS;
	}
S
Sage Weil 已提交
1069 1070
	dout("rename dir %p dentry %p to dir %p dentry %p\n",
	     old_dir, old_dentry, new_dir, new_dentry);
Y
Yan, Zheng 已提交
1071
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
S
Sage Weil 已提交
1072 1073
	if (IS_ERR(req))
		return PTR_ERR(req);
1074
	ihold(old_dir);
S
Sage Weil 已提交
1075 1076 1077
	req->r_dentry = dget(new_dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
1078
	req->r_old_dentry_dir = old_dir;
1079 1080
	req->r_parent = new_dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
S
Sage Weil 已提交
1081 1082 1083 1084 1085
	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_RDCACHE on source inode (mds will lock it) */
1086
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
1087 1088 1089 1090
	if (d_really_is_positive(new_dentry)) {
		req->r_inode_drop =
			ceph_drop_caps_for_unlink(d_inode(new_dentry));
	}
S
Sage Weil 已提交
1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103
	err = ceph_mdsc_do_request(mdsc, old_dir, req);
	if (!err && !req->r_reply_info.head->is_dentry) {
		/*
		 * Normally d_move() is done by fill_trace (called by
		 * do_request, above).  If there is no trace, we need
		 * to do it here.
		 */
		d_move(old_dentry, new_dentry);
	}
	ceph_mdsc_put_request(req);
	return err;
}

1104 1105 1106 1107 1108 1109
/*
 * Ensure a dentry lease will no longer revalidate.
 */
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
M
Miklos Szeredi 已提交
1110
	ceph_dentry(dentry)->time = jiffies;
1111 1112 1113
	ceph_dentry(dentry)->lease_shared_gen = 0;
	spin_unlock(&dentry->d_lock);
}
S
Sage Weil 已提交
1114 1115 1116 1117 1118

/*
 * Check if dentry lease is valid.  If not, delete the lease.  Try to
 * renew if the least is more than half up.
 */
1119 1120
static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
				 struct inode *dir)
S
Sage Weil 已提交
1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *s;
	int valid = 0;
	u32 gen;
	unsigned long ttl;
	struct ceph_mds_session *session = NULL;
	u32 seq = 0;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
1132
	if (di && di->lease_session) {
S
Sage Weil 已提交
1133
		s = di->lease_session;
1134
		spin_lock(&s->s_gen_ttl_lock);
S
Sage Weil 已提交
1135 1136
		gen = s->s_cap_gen;
		ttl = s->s_cap_ttl;
1137
		spin_unlock(&s->s_gen_ttl_lock);
S
Sage Weil 已提交
1138 1139

		if (di->lease_gen == gen &&
M
Miklos Szeredi 已提交
1140
		    time_before(jiffies, di->time) &&
S
Sage Weil 已提交
1141 1142 1143 1144
		    time_before(jiffies, ttl)) {
			valid = 1;
			if (di->lease_renew_after &&
			    time_after(jiffies, di->lease_renew_after)) {
1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157
				/*
				 * We should renew. If we're in RCU walk mode
				 * though, we can't do that so just return
				 * -ECHILD.
				 */
				if (flags & LOOKUP_RCU) {
					valid = -ECHILD;
				} else {
					session = ceph_get_mds_session(s);
					seq = di->lease_seq;
					di->lease_renew_after = 0;
					di->lease_renew_from = jiffies;
				}
S
Sage Weil 已提交
1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180
			}
		}
	}
	spin_unlock(&dentry->d_lock);

	if (session) {
		ceph_mdsc_lease_send_msg(session, dir, dentry,
					 CEPH_MDS_LEASE_RENEW, seq);
		ceph_put_mds_session(session);
	}
	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
	return valid;
}

/*
 * Check if directory-wide content lease/cap is valid.
 */
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int valid = 0;

1181
	spin_lock(&ci->i_ceph_lock);
1182
	if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
S
Sage Weil 已提交
1183
		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
1184
	spin_unlock(&ci->i_ceph_lock);
S
Sage Weil 已提交
1185
	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
1186 1187
	     dir, (unsigned)atomic_read(&ci->i_shared_gen),
	     dentry, (unsigned)di->lease_shared_gen, valid);
S
Sage Weil 已提交
1188 1189 1190 1191 1192 1193
	return valid;
}

/*
 * Check if cached dentry can be trusted.
 */
1194
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
S
Sage Weil 已提交
1195
{
1196
	int valid = 0;
1197
	struct dentry *parent;
1198 1199
	struct inode *dir;

1200
	if (flags & LOOKUP_RCU) {
1201
		parent = READ_ONCE(dentry->d_parent);
1202 1203 1204 1205 1206 1207 1208
		dir = d_inode_rcu(parent);
		if (!dir)
			return -ECHILD;
	} else {
		parent = dget_parent(dentry);
		dir = d_inode(parent);
	}
1209

A
Al Viro 已提交
1210
	dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
1211
	     dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
S
Sage Weil 已提交
1212 1213 1214

	/* always trust cached snapped dentries, snapdir dentry */
	if (ceph_snap(dir) != CEPH_NOSNAP) {
A
Al Viro 已提交
1215
		dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
1216
		     dentry, d_inode(dentry));
1217
		valid = 1;
1218 1219
	} else if (d_really_is_positive(dentry) &&
		   ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
1220
		valid = 1;
1221 1222 1223 1224 1225 1226 1227 1228 1229 1230
	} else {
		valid = dentry_lease_is_valid(dentry, flags, dir);
		if (valid == -ECHILD)
			return valid;
		if (valid || dir_lease_is_valid(dir, dentry)) {
			if (d_really_is_positive(dentry))
				valid = ceph_is_any_caps(d_inode(dentry));
			else
				valid = 1;
		}
S
Sage Weil 已提交
1231 1232
	}

1233 1234 1235 1236
	if (!valid) {
		struct ceph_mds_client *mdsc =
			ceph_sb_to_client(dir->i_sb)->mdsc;
		struct ceph_mds_request *req;
1237 1238
		int op, err;
		u32 mask;
1239

1240 1241 1242
		if (flags & LOOKUP_RCU)
			return -ECHILD;

1243
		op = ceph_snap(dir) == CEPH_SNAPDIR ?
1244
			CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
1245 1246 1247
		req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
		if (!IS_ERR(req)) {
			req->r_dentry = dget(dentry);
1248 1249
			req->r_num_caps = 2;
			req->r_parent = dir;
1250 1251 1252 1253

			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
			if (ceph_security_xattr_wanted(dir))
				mask |= CEPH_CAP_XATTR_SHARED;
1254
			req->r_args.getattr.mask = cpu_to_le32(mask);
1255 1256

			err = ceph_mdsc_do_request(mdsc, NULL, req);
1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268
			switch (err) {
			case 0:
				if (d_really_is_positive(dentry) &&
				    d_inode(dentry) == req->r_target_inode)
					valid = 1;
				break;
			case -ENOENT:
				if (d_really_is_negative(dentry))
					valid = 1;
				/* Fallthrough */
			default:
				break;
1269 1270 1271 1272 1273 1274 1275
			}
			ceph_mdsc_put_request(req);
			dout("d_revalidate %p lookup result=%d\n",
			     dentry, err);
		}
	}

1276
	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1277
	if (valid) {
1278
		ceph_dentry_lru_touch(dentry);
1279 1280 1281
	} else {
		ceph_dir_clear_complete(dir);
	}
1282

1283 1284
	if (!(flags & LOOKUP_RCU))
		dput(parent);
1285
	return valid;
S
Sage Weil 已提交
1286 1287 1288
}

/*
1289
 * Release our ceph_dentry_info.
S
Sage Weil 已提交
1290
 */
1291
static void ceph_d_release(struct dentry *dentry)
S
Sage Weil 已提交
1292 1293 1294
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);

1295
	dout("d_release %p\n", dentry);
1296
	ceph_dentry_lru_del(dentry);
1297 1298 1299 1300 1301

	spin_lock(&dentry->d_lock);
	dentry->d_fsdata = NULL;
	spin_unlock(&dentry->d_lock);

1302 1303 1304
	if (di->lease_session)
		ceph_put_mds_session(di->lease_session);
	kmem_cache_free(ceph_dentry_cachep, di);
S
Sage Weil 已提交
1305 1306
}

1307 1308 1309 1310 1311 1312 1313 1314
/*
 * When the VFS prunes a dentry from the cache, we need to clear the
 * complete flag on the parent directory.
 *
 * Called under dentry->d_lock.
 */
static void ceph_d_prune(struct dentry *dentry)
{
1315 1316 1317 1318
	struct ceph_inode_info *dir_ci;
	struct ceph_dentry_info *di;

	dout("ceph_d_prune %pd %p\n", dentry, dentry);
1319 1320

	/* do we have a valid parent? */
1321
	if (IS_ROOT(dentry))
1322 1323
		return;

1324 1325 1326
	/* we hold d_lock, so d_parent is stable */
	dir_ci = ceph_inode(d_inode(dentry->d_parent));
	if (dir_ci->i_vino.snap == CEPH_SNAPDIR)
1327
		return;
S
Sage Weil 已提交
1328

1329 1330
	/* who calls d_delete() should also disable dcache readdir */
	if (d_really_is_negative(dentry))
A
Al Viro 已提交
1331 1332
		return;

1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345
	/* d_fsdata does not get cleared until d_release */
	if (!d_unhashed(dentry)) {
		__ceph_dir_clear_complete(dir_ci);
		return;
	}

	/* Disable dcache readdir just in case that someone called d_drop()
	 * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED
	 * properly (dcache readdir is still enabled) */
	di = ceph_dentry(dentry);
	if (di->offset > 0 &&
	    di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen))
		__ceph_dir_clear_ordered(dir_ci);
1346
}
S
Sage Weil 已提交
1347 1348 1349 1350 1351 1352 1353 1354

/*
 * read() on a dir.  This weird interface hack only works if mounted
 * with '-o dirstat'.
 *
 * On the first read of an open file a text summary of the directory's
 * entry counts, recursive counts, recursive bytes and rctime is
 * formatted into a buffer cached in the file's private data; this and
 * later reads are served from that buffer.
 *
 * Returns the number of bytes copied (0 at or past the end of the
 * summary, or for a zero-length read), -EISDIR if the dirstat mount
 * option is not set, -ENOMEM if the buffer cannot be allocated, or
 * -EFAULT if nothing could be copied to @buf.
 */
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
			     loff_t *ppos)
{
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	const int bufsize = 1024;

	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
		return -EISDIR;

	if (!fi->dir_info) {
		fi->dir_info = kmalloc(bufsize, GFP_KERNEL);
		if (!fi->dir_info)
			return -ENOMEM;
		/*
		 * scnprintf() returns the number of bytes actually stored,
		 * so dir_info_len can never exceed what the buffer holds.
		 */
		fi->dir_info_len =
			scnprintf(fi->dir_info, bufsize,
				"entries:   %20lld\n"
				" files:    %20lld\n"
				" subdirs:  %20lld\n"
				"rentries:  %20lld\n"
				" rfiles:   %20lld\n"
				" rsubdirs: %20lld\n"
				"rbytes:    %20lld\n"
				"rctime:    %10ld.%09ld\n",
				ci->i_files + ci->i_subdirs,
				ci->i_files,
				ci->i_subdirs,
				ci->i_rfiles + ci->i_rsubdirs,
				ci->i_rfiles,
				ci->i_rsubdirs,
				ci->i_rbytes,
				(long)ci->i_rctime.tv_sec,
				(long)ci->i_rctime.tv_nsec);
	}

	/*
	 * Handles the offset/length clamping, partial copies and *ppos
	 * update; unlike an open-coded "left == size" check it returns 0
	 * rather than -EFAULT for a zero-length read.
	 */
	return simple_read_from_buffer(buf, size, ppos, fi->dir_info,
				       fi->dir_info_len);
}

/*
 * We maintain a private dentry LRU.
 *
 * FIXME: this needs to be changed to a per-mds lru to be useful.
 */
void ceph_dentry_lru_add(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

A
Al Viro 已提交
1409
	dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
1410 1411 1412 1413 1414
	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_lru_lock);
	list_add_tail(&di->lru, &mdsc->dentry_lru);
	mdsc->num_dentry++;
	spin_unlock(&mdsc->dentry_lru_lock);
S
Sage Weil 已提交
1415 1416 1417 1418 1419 1420 1421
}

void ceph_dentry_lru_touch(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

A
Al Viro 已提交
1422 1423
	dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
	     di->offset);
1424 1425 1426 1427
	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_lru_lock);
	list_move_tail(&di->lru, &mdsc->dentry_lru);
	spin_unlock(&mdsc->dentry_lru_lock);
S
Sage Weil 已提交
1428 1429 1430 1431 1432 1433 1434
}

void ceph_dentry_lru_del(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

A
Al Viro 已提交
1435
	dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
1436 1437 1438 1439 1440
	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
	spin_lock(&mdsc->dentry_lru_lock);
	list_del_init(&di->lru);
	mdsc->num_dentry--;
	spin_unlock(&mdsc->dentry_lru_lock);
S
Sage Weil 已提交
1441 1442
}

S
Sage Weil 已提交
1443 1444 1445 1446
/*
 * Return name hash for a given dentry.  This is dependent on
 * the parent directory's hash function.
 */
1447
unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
S
Sage Weil 已提交
1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461
{
	struct ceph_inode_info *dci = ceph_inode(dir);

	switch (dci->i_dir_layout.dl_dir_hash) {
	case 0:	/* for backward compat */
	case CEPH_STR_HASH_LINUX:
		return dn->d_name.hash;

	default:
		return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
				     dn->d_name.name, dn->d_name.len);
	}
}

S
Sage Weil 已提交
1462 1463
const struct file_operations ceph_dir_fops = {
	.read = ceph_read_dir,
A
Al Viro 已提交
1464
	.iterate = ceph_readdir,
S
Sage Weil 已提交
1465 1466 1467 1468
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
	.unlocked_ioctl = ceph_ioctl,
Y
Yan, Zheng 已提交
1469
	.fsync = ceph_fsync,
S
Sage Weil 已提交
1470 1471
};

1472 1473 1474 1475 1476 1477 1478
/*
 * Reduced file operations table: readdir, llseek, open and release
 * only (no read, ioctl or fsync).  Presumably installed on snapdir
 * inodes, going by the name -- confirm against the inode setup code.
 */
const struct file_operations ceph_snapdir_fops = {
	.iterate = ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
};

S
Sage Weil 已提交
1479 1480 1481 1482 1483 1484
const struct inode_operations ceph_dir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.setattr = ceph_setattr,
	.listxattr = ceph_listxattr,
G
Guangliang Zhao 已提交
1485
	.get_acl = ceph_get_acl,
S
Sage Weil 已提交
1486
	.set_acl = ceph_set_acl,
S
Sage Weil 已提交
1487 1488 1489 1490 1491 1492 1493 1494
	.mknod = ceph_mknod,
	.symlink = ceph_symlink,
	.mkdir = ceph_mkdir,
	.link = ceph_link,
	.unlink = ceph_unlink,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
	.create = ceph_create,
1495
	.atomic_open = ceph_atomic_open,
S
Sage Weil 已提交
1496 1497
};

1498 1499 1500 1501 1502 1503
const struct inode_operations ceph_snapdir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.mkdir = ceph_mkdir,
	.rmdir = ceph_unlink,
Y
Yan, Zheng 已提交
1504
	.rename = ceph_rename,
1505 1506
};

S
Sage Weil 已提交
1507
const struct dentry_operations ceph_dentry_ops = {
S
Sage Weil 已提交
1508
	.d_revalidate = ceph_d_revalidate,
1509
	.d_release = ceph_d_release,
1510
	.d_prune = ceph_d_prune,
A
Al Viro 已提交
1511
	.d_init = ceph_d_init,
S
Sage Weil 已提交
1512
};