super.c 24.0 KB
Newer Older
S
Sage Weil 已提交
1

2
#include <linux/ceph/ceph_debug.h>
S
Sage Weil 已提交
3 4

#include <linux/backing-dev.h>
S
Sage Weil 已提交
5
#include <linux/ctype.h>
S
Sage Weil 已提交
6 7 8 9 10 11 12 13
#include <linux/fs.h>
#include <linux/inet.h>
#include <linux/in6.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/parser.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
14
#include <linux/slab.h>
S
Sage Weil 已提交
15 16 17 18
#include <linux/statfs.h>
#include <linux/string.h>

#include "super.h"
19 20
#include "mds_client.h"

21
#include <linux/ceph/ceph_features.h>
22 23 24 25
#include <linux/ceph/decode.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
S
Sage Weil 已提交
26 27 28 29 30 31 32 33 34 35 36 37

/*
 * Ceph superblock operations
 *
 * Handle the basics of mounting, unmounting.
 */

/*
 * super ops
 */
static void ceph_put_super(struct super_block *s)
{
38
	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
S
Sage Weil 已提交
39 40

	dout("put_super\n");
41
	ceph_mdsc_close_sessions(fsc->mdsc);
42 43 44 45 46

	/*
	 * ensure we release the bdi before put_anon_super releases
	 * the device name.
	 */
47 48
	if (s->s_bdi == &fsc->backing_dev_info) {
		bdi_unregister(&fsc->backing_dev_info);
49 50 51
		s->s_bdi = NULL;
	}

S
Sage Weil 已提交
52 53 54 55 56
	return;
}

static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
57 58
	struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
	struct ceph_monmap *monmap = fsc->client->monc.monmap;
S
Sage Weil 已提交
59 60 61 62 63
	struct ceph_statfs st;
	u64 fsid;
	int err;

	dout("statfs\n");
64
	err = ceph_monc_do_statfs(&fsc->client->monc, &st);
S
Sage Weil 已提交
65 66 67 68 69 70 71 72 73
	if (err < 0)
		return err;

	/* fill in kstatfs */
	buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */

	/*
	 * express utilization in terms of large blocks to avoid
	 * overflow on 32-bit machines.
S
Sage Weil 已提交
74 75 76 77 78
	 *
	 * NOTE: for the time being, we make bsize == frsize to humor
	 * not-yet-ancient versions of glibc that are broken.
	 * Someday, we will probably want to report a real block
	 * size...  whatever that may mean for a network file system!
S
Sage Weil 已提交
79 80
	 */
	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
S
Sage Weil 已提交
81
	buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
S
Sage Weil 已提交
82
	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
83
	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
S
Sage Weil 已提交
84 85 86 87
	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);

	buf->f_files = le64_to_cpu(st.num_objects);
	buf->f_ffree = -1;
88
	buf->f_namelen = NAME_MAX;
S
Sage Weil 已提交
89 90 91 92 93 94 95 96 97 98

	/* leave fsid little-endian, regardless of host endianness */
	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
	buf->f_fsid.val[0] = fsid & 0xffffffff;
	buf->f_fsid.val[1] = fsid >> 32;

	return 0;
}


99
static int ceph_sync_fs(struct super_block *sb, int wait)
S
Sage Weil 已提交
100
{
101
	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
102 103 104

	if (!wait) {
		dout("sync_fs (non-blocking)\n");
105
		ceph_flush_dirty_caps(fsc->mdsc);
106 107 108 109 110
		dout("sync_fs (non-blocking) done\n");
		return 0;
	}

	dout("sync_fs (blocking)\n");
111 112
	ceph_osdc_sync(&fsc->client->osdc);
	ceph_mdsc_sync(fsc->mdsc);
113
	dout("sync_fs (blocking) done\n");
S
Sage Weil 已提交
114 115 116 117 118 119 120 121 122
	return 0;
}

/*
 * mount options
 */
/*
 * Mount option tokens.  The ordering is significant: parse_fsopt_token()
 * classifies a token by comparing it against Opt_last_int and
 * Opt_last_string, so integer options must stay above Opt_last_int and
 * string options between Opt_last_int and Opt_last_string.
 */
enum {
	/* options taking an int argument */
	Opt_wsize,
	Opt_rsize,
	Opt_rasize,
	Opt_caps_wanted_delay_min,
	Opt_caps_wanted_delay_max,
	Opt_cap_release_safety,
	Opt_readdir_max_entries,
	Opt_readdir_max_bytes,
	Opt_congestion_kb,
	Opt_last_int,		/* marker: int args above */

	/* options taking a string argument */
	Opt_snapdirname,
	Opt_last_string,	/* marker: string args above */

	/* flag options without an argument */
	Opt_dirstat,
	Opt_nodirstat,
	Opt_rbytes,
	Opt_norbytes,
	Opt_asyncreaddir,
	Opt_noasyncreaddir,
	Opt_dcache,
	Opt_nodcache,
	Opt_ino32,
	Opt_noino32,
};

147
static match_table_t fsopt_tokens = {
S
Sage Weil 已提交
148 149
	{Opt_wsize, "wsize=%d"},
	{Opt_rsize, "rsize=%d"},
S
Sage Weil 已提交
150
	{Opt_rasize, "rasize=%d"},
S
Sage Weil 已提交
151 152
	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
153
	{Opt_cap_release_safety, "cap_release_safety=%d"},
S
Sage Weil 已提交
154
	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
155
	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
Y
Yehuda Sadeh 已提交
156
	{Opt_congestion_kb, "write_congestion_kb=%d"},
S
Sage Weil 已提交
157 158 159 160 161 162 163
	/* int args above */
	{Opt_snapdirname, "snapdirname=%s"},
	/* string args above */
	{Opt_dirstat, "dirstat"},
	{Opt_nodirstat, "nodirstat"},
	{Opt_rbytes, "rbytes"},
	{Opt_norbytes, "norbytes"},
164
	{Opt_asyncreaddir, "asyncreaddir"},
S
Sage Weil 已提交
165
	{Opt_noasyncreaddir, "noasyncreaddir"},
166 167
	{Opt_dcache, "dcache"},
	{Opt_nodcache, "nodcache"},
Y
Yehuda Sadeh 已提交
168
	{Opt_ino32, "ino32"},
169
	{Opt_noino32, "noino32"},
S
Sage Weil 已提交
170 171 172
	{-1, NULL}
};

173
static int parse_fsopt_token(char *c, void *private)
S
Sage Weil 已提交
174
{
175 176 177 178 179 180 181 182 183 184 185 186 187 188
	struct ceph_mount_options *fsopt = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token((char *)c, fsopt_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
S
Sage Weil 已提交
189
		}
190 191 192 193 194 195
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else {
		dout("got token %d\n", token);
S
Sage Weil 已提交
196 197
	}

198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
	switch (token) {
	case Opt_snapdirname:
		kfree(fsopt->snapdir_name);
		fsopt->snapdir_name = kstrndup(argstr[0].from,
					       argstr[0].to-argstr[0].from,
					       GFP_KERNEL);
		if (!fsopt->snapdir_name)
			return -ENOMEM;
		break;

		/* misc */
	case Opt_wsize:
		fsopt->wsize = intval;
		break;
	case Opt_rsize:
		fsopt->rsize = intval;
		break;
S
Sage Weil 已提交
215 216 217
	case Opt_rasize:
		fsopt->rasize = intval;
		break;
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
	case Opt_caps_wanted_delay_min:
		fsopt->caps_wanted_delay_min = intval;
		break;
	case Opt_caps_wanted_delay_max:
		fsopt->caps_wanted_delay_max = intval;
		break;
	case Opt_readdir_max_entries:
		fsopt->max_readdir = intval;
		break;
	case Opt_readdir_max_bytes:
		fsopt->max_readdir_bytes = intval;
		break;
	case Opt_congestion_kb:
		fsopt->congestion_kb = intval;
		break;
	case Opt_dirstat:
		fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
		break;
	case Opt_nodirstat:
		fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
		break;
	case Opt_rbytes:
		fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
		break;
	case Opt_norbytes:
		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
		break;
245 246 247
	case Opt_asyncreaddir:
		fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
		break;
248 249 250
	case Opt_noasyncreaddir:
		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
		break;
251 252 253 254 255 256
	case Opt_dcache:
		fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
		break;
	case Opt_nodcache:
		fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
		break;
Y
Yehuda Sadeh 已提交
257 258 259
	case Opt_ino32:
		fsopt->flags |= CEPH_MOUNT_OPT_INO32;
		break;
260 261 262
	case Opt_noino32:
		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
		break;
263 264 265 266
	default:
		BUG_ON(token);
	}
	return 0;
S
Sage Weil 已提交
267
}
S
Sage Weil 已提交
268

269
static void destroy_mount_options(struct ceph_mount_options *args)
S
Sage Weil 已提交
270
{
271 272 273 274
	dout("destroy_mount_options %p\n", args);
	kfree(args->snapdir_name);
	kfree(args);
}
S
Sage Weil 已提交
275

276 277 278 279 280 281 282 283 284 285
/*
 * strcmp() that tolerates NULL arguments.
 *
 * Returns 0 when both strings are NULL, -1 when only @s2 is NULL,
 * 1 when only @s1 is NULL, and strcmp(s1, s2) otherwise.
 */
static int strcmp_null(const char *s1, const char *s2)
{
	if (s1 && s2)
		return strcmp(s1, s2);

	if (s1)
		return -1;	/* only s2 is NULL */

	return s2 ? 1 : 0;	/* s1 is NULL; is s2 NULL as well? */
}
S
Sage Weil 已提交
286

287 288 289 290 291 292 293 294
/*
 * Compare a freshly parsed set of options against those of an existing
 * client @fsc.
 *
 * The part of struct ceph_mount_options that precedes snapdir_name is
 * compared bytewise, snapdir_name is compared as a (possibly NULL)
 * string, and finally the libceph options are compared with
 * ceph_compare_options().
 *
 * Returns 0 if everything matches, non-zero at the first difference.
 */
static int compare_mount_options(struct ceph_mount_options *new_fsopt,
				 struct ceph_options *new_opt,
				 struct ceph_fs_client *fsc)
{
	struct ceph_mount_options *cur_fsopt = fsc->mount_options;
	int prefix_len = offsetof(struct ceph_mount_options, snapdir_name);
	int diff;

	diff = memcmp(new_fsopt, cur_fsopt, prefix_len);
	if (!diff)
		diff = strcmp_null(new_fsopt->snapdir_name,
				   cur_fsopt->snapdir_name);
	if (!diff)
		diff = ceph_compare_options(new_opt, fsc->client);

	return diff;
}

static int parse_mount_options(struct ceph_mount_options **pfsopt,
			       struct ceph_options **popt,
			       int flags, char *options,
			       const char *dev_name,
			       const char **path)
{
	struct ceph_mount_options *fsopt;
	const char *dev_name_end;
315 316 317 318
	int err;

	if (!dev_name || !*dev_name)
		return -EINVAL;
319 320 321 322 323 324 325

	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
	if (!fsopt)
		return -ENOMEM;

	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);

326 327
	fsopt->sb_flags = flags;
	fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
328

329 330 331
	fsopt->rsize = CEPH_RSIZE_DEFAULT;
	fsopt->rasize = CEPH_RASIZE_DEFAULT;
	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
332 333
	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
334 335 336 337 338
	fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
	fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
	fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
	fsopt->congestion_kb = default_congestion_kb();

339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357
	/*
	 * Distinguish the server list from the path in "dev_name".
	 * Internally we do not include the leading '/' in the path.
	 *
	 * "dev_name" will look like:
	 *     <server_spec>[,<server_spec>...]:[<path>]
	 * where
	 *     <server_spec> is <ip>[:<port>]
	 *     <path> is optional, but if present must begin with '/'
	 */
	dev_name_end = strchr(dev_name, '/');
	if (dev_name_end) {
		/* skip over leading '/' for path */
		*path = dev_name_end + 1;
	} else {
		/* path is empty */
		dev_name_end = dev_name + strlen(dev_name);
		*path = dev_name_end;
	}
358
	err = -EINVAL;
359 360 361
	dev_name_end--;		/* back up to ':' separator */
	if (*dev_name_end != ':') {
		pr_err("device name is missing path (no : separator in %s)\n",
362 363 364
				dev_name);
		goto out;
	}
365
	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
S
Sage Weil 已提交
366 367
	dout("server path '%s'\n", *path);

368
	*popt = ceph_parse_options(options, dev_name, dev_name_end,
369
				 parse_fsopt_token, (void *)fsopt);
370 371
	if (IS_ERR(*popt)) {
		err = PTR_ERR(*popt);
372
		goto out;
373
	}
374 375 376 377

	/* success */
	*pfsopt = fsopt;
	return 0;
S
Sage Weil 已提交
378

379
out:
380 381
	destroy_mount_options(fsopt);
	return err;
S
Sage Weil 已提交
382 383
}

384 385 386
/**
 * ceph_show_options - Show mount options in /proc/mounts
 * @m: seq_file to write to
387
 * @root: root of that (sub)tree
388
 */
389
static int ceph_show_options(struct seq_file *m, struct dentry *root)
S
Sage Weil 已提交
390
{
391
	struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
392 393 394 395 396 397 398 399 400 401 402 403
	struct ceph_mount_options *fsopt = fsc->mount_options;
	struct ceph_options *opt = fsc->client->options;

	if (opt->flags & CEPH_OPT_FSID)
		seq_printf(m, ",fsid=%pU", &opt->fsid);
	if (opt->flags & CEPH_OPT_NOSHARE)
		seq_puts(m, ",noshare");
	if (opt->flags & CEPH_OPT_NOCRC)
		seq_puts(m, ",nocrc");

	if (opt->name)
		seq_printf(m, ",name=%s", opt->name);
404
	if (opt->key)
405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420
		seq_puts(m, ",secret=<hidden>");

	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
		seq_printf(m, ",osdkeepalivetimeout=%d",
			   opt->osd_keepalive_timeout);

	if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
		seq_puts(m, ",dirstat");
	if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
		seq_puts(m, ",norbytes");
	if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
		seq_puts(m, ",noasyncreaddir");
421 422 423 424
	if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
		seq_puts(m, ",dcache");
	else
		seq_puts(m, ",nodcache");
425 426 427

	if (fsopt->wsize)
		seq_printf(m, ",wsize=%d", fsopt->wsize);
428
	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
429
		seq_printf(m, ",rsize=%d", fsopt->rsize);
S
Sage Weil 已提交
430
	if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
431
		seq_printf(m, ",rasize=%d", fsopt->rasize);
432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449
	if (fsopt->congestion_kb != default_congestion_kb())
		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
		seq_printf(m, ",caps_wanted_delay_min=%d",
			 fsopt->caps_wanted_delay_min);
	if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
		seq_printf(m, ",caps_wanted_delay_max=%d",
			   fsopt->caps_wanted_delay_max);
	if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
		seq_printf(m, ",cap_release_safety=%d",
			   fsopt->cap_release_safety);
	if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
		seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
	return 0;
S
Sage Weil 已提交
450 451 452
}

/*
453 454
 * handle any mon messages the standard library doesn't understand.
 * return error if we don't either.
S
Sage Weil 已提交
455
 */
456
static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
S
Sage Weil 已提交
457
{
458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473
	struct ceph_fs_client *fsc = client->private;
	int type = le16_to_cpu(msg->hdr.type);

	switch (type) {
	case CEPH_MSG_MDS_MAP:
		ceph_mdsc_handle_map(fsc->mdsc, msg);
		return 0;

	default:
		return -1;
	}
}

/*
 * create a new fs client
 */
474
static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
475 476 477
					struct ceph_options *opt)
{
	struct ceph_fs_client *fsc;
S
Sage Weil 已提交
478 479 480 481
	const unsigned supported_features =
		CEPH_FEATURE_FLOCK |
		CEPH_FEATURE_DIRLAYOUTHASH;
	const unsigned required_features = 0;
S
Sage Weil 已提交
482 483
	int err = -ENOMEM;

484 485
	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
	if (!fsc)
S
Sage Weil 已提交
486 487
		return ERR_PTR(-ENOMEM);

S
Sage Weil 已提交
488 489
	fsc->client = ceph_create_client(opt, fsc, supported_features,
					 required_features);
490 491 492 493 494 495
	if (IS_ERR(fsc->client)) {
		err = PTR_ERR(fsc->client);
		goto fail;
	}
	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
	fsc->client->monc.want_mdsmap = 1;
S
Sage Weil 已提交
496

497
	fsc->mount_options = fsopt;
S
Sage Weil 已提交
498

499 500
	fsc->sb = NULL;
	fsc->mount_state = CEPH_MOUNT_MOUNTING;
S
Sage Weil 已提交
501

502
	atomic_long_set(&fsc->writeback_count, 0);
S
Sage Weil 已提交
503

504
	err = bdi_init(&fsc->backing_dev_info);
505
	if (err < 0)
506
		goto fail_client;
507

S
Sage Weil 已提交
508
	err = -ENOMEM;
509 510 511 512 513
	/*
	 * The number of concurrent works can be high but they don't need
	 * to be processed in parallel, limit concurrency.
	 */
	fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
514
	if (fsc->wb_wq == NULL)
515
		goto fail_bdi;
516
	fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
517
	if (fsc->pg_inv_wq == NULL)
S
Sage Weil 已提交
518
		goto fail_wb_wq;
519
	fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
520
	if (fsc->trunc_wq == NULL)
S
Sage Weil 已提交
521 522
		goto fail_pg_inv_wq;

523 524
	/* set up mempools */
	err = -ENOMEM;
525 526 527
	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
			      fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
	if (!fsc->wb_pagevec_pool)
528 529
		goto fail_trunc_wq;

530
	/* caps */
531 532 533
	fsc->min_caps = fsopt->max_readdir;

	return fsc;
534

S
Sage Weil 已提交
535
fail_trunc_wq:
536
	destroy_workqueue(fsc->trunc_wq);
S
Sage Weil 已提交
537
fail_pg_inv_wq:
538
	destroy_workqueue(fsc->pg_inv_wq);
S
Sage Weil 已提交
539
fail_wb_wq:
540
	destroy_workqueue(fsc->wb_wq);
541
fail_bdi:
542 543 544
	bdi_destroy(&fsc->backing_dev_info);
fail_client:
	ceph_destroy_client(fsc->client);
S
Sage Weil 已提交
545
fail:
546
	kfree(fsc);
S
Sage Weil 已提交
547 548 549
	return ERR_PTR(err);
}

550
static void destroy_fs_client(struct ceph_fs_client *fsc)
S
Sage Weil 已提交
551
{
552
	dout("destroy_fs_client %p\n", fsc);
S
Sage Weil 已提交
553

554 555 556
	destroy_workqueue(fsc->wb_wq);
	destroy_workqueue(fsc->pg_inv_wq);
	destroy_workqueue(fsc->trunc_wq);
S
Sage Weil 已提交
557

558
	bdi_destroy(&fsc->backing_dev_info);
559

560
	mempool_destroy(fsc->wb_pagevec_pool);
S
Sage Weil 已提交
561

562
	destroy_mount_options(fsc->mount_options);
563

564
	ceph_fs_debugfs_cleanup(fsc);
S
Sage Weil 已提交
565

566
	ceph_destroy_client(fsc->client);
S
Sage Weil 已提交
567

568 569
	kfree(fsc);
	dout("destroy_fs_client %p done\n", fsc);
S
Sage Weil 已提交
570 571
}

572
/*
573
 * caches
574
 */
575 576 577 578 579 580
/* slab cache for struct ceph_inode_info; created in init_caches() */
struct kmem_cache *ceph_inode_cachep;
/* slab cache for struct ceph_cap; created in init_caches() */
struct kmem_cache *ceph_cap_cachep;
/* slab cache for struct ceph_dentry_info; created in init_caches() */
struct kmem_cache *ceph_dentry_cachep;
/* slab cache for struct ceph_file_info; created in init_caches() */
struct kmem_cache *ceph_file_cachep;

static void ceph_inode_init_once(void *foo)
581
{
582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610
	struct ceph_inode_info *ci = foo;
	inode_init_once(&ci->vfs_inode);
}

/*
 * Create the four slab caches used by this file system (inode, cap,
 * dentry and file info).
 *
 * Returns 0 on success or -ENOMEM, in which case every cache created so
 * far has been destroyed again.
 */
static int __init init_caches(void)
{
	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
				      sizeof(struct ceph_inode_info),
				      __alignof__(struct ceph_inode_info),
				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
				      ceph_inode_init_once);
	if (!ceph_inode_cachep)
		return -ENOMEM;

	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
	if (!ceph_cap_cachep)
		goto err_destroy_inode;

	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
	if (!ceph_dentry_cachep)
		goto err_destroy_cap;

	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
	if (!ceph_file_cachep)
		goto err_destroy_dentry;

	return 0;

	/* unwind in reverse order of creation */
err_destroy_dentry:
	kmem_cache_destroy(ceph_dentry_cachep);
err_destroy_cap:
	kmem_cache_destroy(ceph_cap_cachep);
err_destroy_inode:
	kmem_cache_destroy(ceph_inode_cachep);
	return -ENOMEM;
}

622 623
static void destroy_caches(void)
{
624 625 626 627 628
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
629 630 631 632 633 634 635
	kmem_cache_destroy(ceph_inode_cachep);
	kmem_cache_destroy(ceph_cap_cachep);
	kmem_cache_destroy(ceph_dentry_cachep);
	kmem_cache_destroy(ceph_file_cachep);
}


S
Sage Weil 已提交
636
/*
637 638
 * ceph_umount_begin - initiate forced umount.  Tear down down the
 * mount, skipping steps that may hang while waiting for server(s).
S
Sage Weil 已提交
639
 */
640
static void ceph_umount_begin(struct super_block *sb)
S
Sage Weil 已提交
641
{
642 643 644 645 646 647 648
	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);

	dout("ceph_umount_begin - starting forced umount\n");
	if (!fsc)
		return;
	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
	return;
S
Sage Weil 已提交
649 650
}

651 652 653 654 655 656 657 658 659 660 661
/*
 * Superblock operations; installed as s_op by ceph_set_super().
 * The sync_fs, put_super, show_options, statfs and umount_begin handlers
 * are defined in this file; the inode handlers are defined elsewhere.
 */
static const struct super_operations ceph_super_ops = {
	.alloc_inode	= ceph_alloc_inode,
	.destroy_inode	= ceph_destroy_inode,
	.write_inode    = ceph_write_inode,
	.sync_fs        = ceph_sync_fs,
	.put_super	= ceph_put_super,
	.show_options   = ceph_show_options,
	.statfs		= ceph_statfs,
	.umount_begin   = ceph_umount_begin,
};

S
Sage Weil 已提交
662 663 664 665
/*
 * Bootstrap mount by opening the root directory.  Note the mount
 * @started time from caller, and time out if this takes too long.
 */
666
static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
S
Sage Weil 已提交
667 668 669
				       const char *path,
				       unsigned long started)
{
670
	struct ceph_mds_client *mdsc = fsc->mdsc;
S
Sage Weil 已提交
671 672 673 674 675 676 677 678
	struct ceph_mds_request *req = NULL;
	int err;
	struct dentry *root;

	/* open dir */
	dout("open_root_inode opening '%s'\n", path);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
	if (IS_ERR(req))
J
Julia Lawall 已提交
679
		return ERR_CAST(req);
S
Sage Weil 已提交
680 681 682 683
	req->r_path1 = kstrdup(path, GFP_NOFS);
	req->r_ino1.ino = CEPH_INO_ROOT;
	req->r_ino1.snap = CEPH_NOSNAP;
	req->r_started = started;
684
	req->r_timeout = fsc->client->options->mount_timeout * HZ;
S
Sage Weil 已提交
685 686 687 688
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
	req->r_num_caps = 2;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err == 0) {
A
Al Viro 已提交
689 690
		struct inode *inode = req->r_target_inode;
		req->r_target_inode = NULL;
S
Sage Weil 已提交
691
		dout("open_root_inode success\n");
A
Al Viro 已提交
692
		if (ceph_ino(inode) == CEPH_INO_ROOT &&
S
Sage Weil 已提交
693
		    fsc->sb->s_root == NULL) {
694
			root = d_make_root(inode);
A
Al Viro 已提交
695 696 697 698
			if (!root) {
				root = ERR_PTR(-ENOMEM);
				goto out;
			}
S
Sage Weil 已提交
699
		} else {
A
Al Viro 已提交
700
			root = d_obtain_alias(inode);
S
Sage Weil 已提交
701
		}
702
		ceph_init_dentry(root);
S
Sage Weil 已提交
703 704 705 706
		dout("open_root_inode success, root dentry is %p\n", root);
	} else {
		root = ERR_PTR(err);
	}
A
Al Viro 已提交
707
out:
S
Sage Weil 已提交
708 709 710 711
	ceph_mdsc_put_request(req);
	return root;
}

712 713 714



S
Sage Weil 已提交
715 716 717
/*
 * mount: join the ceph cluster, and open root directory.
 */
A
Al Viro 已提交
718
static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
S
Sage Weil 已提交
719 720 721 722 723
		      const char *path)
{
	int err;
	unsigned long started = jiffies;  /* note the start time */
	struct dentry *root;
724
	int first = 0;   /* first vfsmount for this super_block */
S
Sage Weil 已提交
725 726

	dout("mount start\n");
727
	mutex_lock(&fsc->client->mount_mutex);
S
Sage Weil 已提交
728

729
	err = __ceph_open_session(fsc->client, started);
S
Sage Weil 已提交
730 731 732 733
	if (err < 0)
		goto out;

	dout("mount opening root\n");
734
	root = open_root_dentry(fsc, "", started);
S
Sage Weil 已提交
735 736 737 738
	if (IS_ERR(root)) {
		err = PTR_ERR(root);
		goto out;
	}
739
	if (fsc->sb->s_root) {
S
Sage Weil 已提交
740
		dput(root);
741 742 743 744 745 746 747 748
	} else {
		fsc->sb->s_root = root;
		first = 1;

		err = ceph_fs_debugfs_init(fsc);
		if (err < 0)
			goto fail;
	}
S
Sage Weil 已提交
749 750 751 752 753

	if (path[0] == 0) {
		dget(root);
	} else {
		dout("mount opening base mountpoint\n");
754
		root = open_root_dentry(fsc, path, started);
S
Sage Weil 已提交
755 756
		if (IS_ERR(root)) {
			err = PTR_ERR(root);
757
			goto fail;
S
Sage Weil 已提交
758 759 760
		}
	}

761
	fsc->mount_state = CEPH_MOUNT_MOUNTED;
S
Sage Weil 已提交
762
	dout("mount success\n");
A
Al Viro 已提交
763 764
	mutex_unlock(&fsc->client->mount_mutex);
	return root;
S
Sage Weil 已提交
765 766

out:
767
	mutex_unlock(&fsc->client->mount_mutex);
A
Al Viro 已提交
768
	return ERR_PTR(err);
769 770 771 772 773 774 775

fail:
	if (first) {
		dput(fsc->sb->s_root);
		fsc->sb->s_root = NULL;
	}
	goto out;
S
Sage Weil 已提交
776 777 778 779
}

static int ceph_set_super(struct super_block *s, void *data)
{
780
	struct ceph_fs_client *fsc = data;
S
Sage Weil 已提交
781 782 783 784
	int ret;

	dout("set_super %p data %p\n", s, data);

785
	s->s_flags = fsc->mount_options->sb_flags;
S
Sage Weil 已提交
786 787
	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */

788 789
	s->s_fs_info = fsc;
	fsc->sb = s;
S
Sage Weil 已提交
790 791 792 793 794 795 796 797 798 799 800 801 802 803

	s->s_op = &ceph_super_ops;
	s->s_export_op = &ceph_export_ops;

	s->s_time_gran = 1000;  /* 1000 ns == 1 us */

	ret = set_anon_super(s, NULL);  /* what is that second arg for? */
	if (ret != 0)
		goto fail;

	return ret;

fail:
	s->s_fs_info = NULL;
804
	fsc->sb = NULL;
S
Sage Weil 已提交
805 806 807 808 809 810 811 812
	return ret;
}

/*
 * share superblock if same fs AND options
 */
static int ceph_compare_super(struct super_block *sb, void *data)
{
813 814 815 816
	struct ceph_fs_client *new = data;
	struct ceph_mount_options *fsopt = new->mount_options;
	struct ceph_options *opt = new->client->options;
	struct ceph_fs_client *other = ceph_sb_to_client(sb);
S
Sage Weil 已提交
817 818

	dout("ceph_compare_super %p\n", sb);
819 820 821 822

	if (compare_mount_options(fsopt, opt, other)) {
		dout("monitor(s)/mount options don't match\n");
		return 0;
S
Sage Weil 已提交
823
	}
824 825 826 827 828 829
	if ((opt->flags & CEPH_OPT_FSID) &&
	    ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
		dout("fsid doesn't match\n");
		return 0;
	}
	if (fsopt->sb_flags != other->mount_options->sb_flags) {
S
Sage Weil 已提交
830 831 832 833 834 835 836 837 838
		dout("flags differ\n");
		return 0;
	}
	return 1;
}

/*
 * construct our own bdi so we can control readahead, etc.
 */
839
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
840

841 842
static int ceph_register_bdi(struct super_block *sb,
			     struct ceph_fs_client *fsc)
S
Sage Weil 已提交
843 844 845
{
	int err;

S
Sage Weil 已提交
846 847
	/* set ra_pages based on rasize mount option? */
	if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
848
		fsc->backing_dev_info.ra_pages =
S
Sage Weil 已提交
849
			(fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
S
Sage Weil 已提交
850
			>> PAGE_SHIFT;
851 852 853 854
	else
		fsc->backing_dev_info.ra_pages =
			default_backing_dev_info.ra_pages;

855
	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
856
			   atomic_long_inc_return(&bdi_seq));
857
	if (!err)
858
		sb->s_bdi = &fsc->backing_dev_info;
S
Sage Weil 已提交
859 860 861
	return err;
}

A
Al Viro 已提交
862 863
static struct dentry *ceph_mount(struct file_system_type *fs_type,
		       int flags, const char *dev_name, void *data)
S
Sage Weil 已提交
864 865
{
	struct super_block *sb;
866
	struct ceph_fs_client *fsc;
A
Al Viro 已提交
867
	struct dentry *res;
S
Sage Weil 已提交
868 869
	int err;
	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
S
Sage Weil 已提交
870
	const char *path = NULL;
871 872
	struct ceph_mount_options *fsopt = NULL;
	struct ceph_options *opt = NULL;
S
Sage Weil 已提交
873

A
Al Viro 已提交
874
	dout("ceph_mount\n");
875
	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
A
Al Viro 已提交
876 877
	if (err < 0) {
		res = ERR_PTR(err);
878
		goto out_final;
A
Al Viro 已提交
879
	}
S
Sage Weil 已提交
880 881

	/* create client (which we may/may not use) */
882 883
	fsc = create_fs_client(fsopt, opt);
	if (IS_ERR(fsc)) {
A
Al Viro 已提交
884
		res = ERR_CAST(fsc);
N
Noah Watkins 已提交
885 886
		destroy_mount_options(fsopt);
		ceph_destroy_options(opt);
887 888
		goto out_final;
	}
S
Sage Weil 已提交
889

890
	err = ceph_mdsc_init(fsc);
A
Al Viro 已提交
891 892
	if (err < 0) {
		res = ERR_PTR(err);
893
		goto out;
A
Al Viro 已提交
894
	}
895 896

	if (ceph_test_opt(fsc->client, NOSHARE))
S
Sage Weil 已提交
897
		compare_super = NULL;
D
David Howells 已提交
898
	sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
S
Sage Weil 已提交
899
	if (IS_ERR(sb)) {
A
Al Viro 已提交
900
		res = ERR_CAST(sb);
S
Sage Weil 已提交
901 902 903
		goto out;
	}

904 905 906 907 908
	if (ceph_sb_to_client(sb) != fsc) {
		ceph_mdsc_destroy(fsc);
		destroy_fs_client(fsc);
		fsc = ceph_sb_to_client(sb);
		dout("get_sb got existing client %p\n", fsc);
S
Sage Weil 已提交
909
	} else {
910 911
		dout("get_sb using new client %p\n", fsc);
		err = ceph_register_bdi(sb, fsc);
A
Al Viro 已提交
912 913
		if (err < 0) {
			res = ERR_PTR(err);
S
Sage Weil 已提交
914
			goto out_splat;
A
Al Viro 已提交
915
		}
S
Sage Weil 已提交
916 917
	}

A
Al Viro 已提交
918 919
	res = ceph_real_mount(fsc, path);
	if (IS_ERR(res))
S
Sage Weil 已提交
920
		goto out_splat;
A
Al Viro 已提交
921 922 923
	dout("root %p inode %p ino %llx.%llx\n", res,
	     res->d_inode, ceph_vinop(res->d_inode));
	return res;
S
Sage Weil 已提交
924 925

out_splat:
926
	ceph_mdsc_close_sessions(fsc->mdsc);
927
	deactivate_locked_super(sb);
S
Sage Weil 已提交
928 929 930
	goto out_final;

out:
931 932
	ceph_mdsc_destroy(fsc);
	destroy_fs_client(fsc);
S
Sage Weil 已提交
933
out_final:
A
Al Viro 已提交
934 935
	dout("ceph_mount fail %ld\n", PTR_ERR(res));
	return res;
S
Sage Weil 已提交
936 937 938 939
}

static void ceph_kill_sb(struct super_block *s)
{
940
	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
S
Sage Weil 已提交
941
	dout("kill_sb %p\n", s);
942
	ceph_mdsc_pre_umount(fsc->mdsc);
S
Sage Weil 已提交
943
	kill_anon_super(s);    /* will call put_super after sb is r/o */
944 945
	ceph_mdsc_destroy(fsc);
	destroy_fs_client(fsc);
S
Sage Weil 已提交
946 947 948 949 950
}

static struct file_system_type ceph_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ceph",
A
Al Viro 已提交
951
	.mount		= ceph_mount,
S
Sage Weil 已提交
952 953 954
	.kill_sb	= ceph_kill_sb,
	.fs_flags	= FS_RENAME_DOES_D_MOVE,
};
955
MODULE_ALIAS_FS("ceph");
S
Sage Weil 已提交
956 957 958 959 960 961

#define _STRINGIFY(x) #x
#define STRINGIFY(x) _STRINGIFY(x)

/*
 * Module init: create the slab caches, initialise xattr support and
 * register the file system type.
 *
 * Returns 0 on success, or the error from init_caches() or
 * register_filesystem(); in the latter case the earlier steps are undone.
 */
static int __init init_ceph(void)
{
	int ret;

	ret = init_caches();
	if (ret)
		return ret;

	ceph_xattr_init();

	ret = register_filesystem(&ceph_fs_type);
	if (ret) {
		/* undo the two steps above, in reverse order */
		ceph_xattr_exit();
		destroy_caches();
		return ret;
	}

	pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);

	return 0;
}

/*
 * Module exit: undo init_ceph() in reverse order -- unregister the file
 * system type, shut down xattr support and destroy the slab caches.
 */
static void __exit exit_ceph(void)
{
	dout("exit_ceph\n");

	unregister_filesystem(&ceph_fs_type);
	ceph_xattr_exit();
	destroy_caches();
}

module_init(init_ceph);
module_exit(exit_ceph);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
MODULE_DESCRIPTION("Ceph filesystem for Linux");
MODULE_LICENSE("GPL");