super.c 23.6 KB
Newer Older
S
Sage Weil 已提交
1

2
#include <linux/ceph/ceph_debug.h>
S
Sage Weil 已提交
3 4

#include <linux/backing-dev.h>
S
Sage Weil 已提交
5
#include <linux/ctype.h>
S
Sage Weil 已提交
6 7 8 9 10 11 12 13
#include <linux/fs.h>
#include <linux/inet.h>
#include <linux/in6.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/parser.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
14
#include <linux/slab.h>
S
Sage Weil 已提交
15 16 17 18
#include <linux/statfs.h>
#include <linux/string.h>

#include "super.h"
19 20
#include "mds_client.h"

21
#include <linux/ceph/ceph_features.h>
22 23 24 25
#include <linux/ceph/decode.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
S
Sage Weil 已提交
26 27 28 29 30 31 32 33 34 35 36 37

/*
 * Ceph superblock operations
 *
 * Handle the basics of mounting, unmounting.
 */

/*
 * super ops
 */
static void ceph_put_super(struct super_block *s)
{
38
	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
S
Sage Weil 已提交
39 40

	dout("put_super\n");
41
	ceph_mdsc_close_sessions(fsc->mdsc);
42 43 44 45 46

	/*
	 * ensure we release the bdi before put_anon_super releases
	 * the device name.
	 */
47 48
	if (s->s_bdi == &fsc->backing_dev_info) {
		bdi_unregister(&fsc->backing_dev_info);
49 50 51
		s->s_bdi = NULL;
	}

S
Sage Weil 已提交
52 53 54 55 56
	return;
}

static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
57 58
	struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
	struct ceph_monmap *monmap = fsc->client->monc.monmap;
S
Sage Weil 已提交
59 60 61 62 63
	struct ceph_statfs st;
	u64 fsid;
	int err;

	dout("statfs\n");
64
	err = ceph_monc_do_statfs(&fsc->client->monc, &st);
S
Sage Weil 已提交
65 66 67 68 69 70 71 72 73 74 75 76
	if (err < 0)
		return err;

	/* fill in kstatfs */
	buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */

	/*
	 * express utilization in terms of large blocks to avoid
	 * overflow on 32-bit machines.
	 */
	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
77
	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
S
Sage Weil 已提交
78 79 80 81
	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);

	buf->f_files = le64_to_cpu(st.num_objects);
	buf->f_ffree = -1;
82
	buf->f_namelen = NAME_MAX;
S
Sage Weil 已提交
83 84 85 86 87 88 89 90 91 92 93
	buf->f_frsize = PAGE_CACHE_SIZE;

	/* leave fsid little-endian, regardless of host endianness */
	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
	buf->f_fsid.val[0] = fsid & 0xffffffff;
	buf->f_fsid.val[1] = fsid >> 32;

	return 0;
}


94
static int ceph_sync_fs(struct super_block *sb, int wait)
S
Sage Weil 已提交
95
{
96
	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
97 98 99

	if (!wait) {
		dout("sync_fs (non-blocking)\n");
100
		ceph_flush_dirty_caps(fsc->mdsc);
101 102 103 104 105
		dout("sync_fs (non-blocking) done\n");
		return 0;
	}

	dout("sync_fs (blocking)\n");
106 107
	ceph_osdc_sync(&fsc->client->osdc);
	ceph_mdsc_sync(fsc->mdsc);
108
	dout("sync_fs (blocking) done\n");
S
Sage Weil 已提交
109 110 111 112 113 114 115 116 117
	return 0;
}

/*
 * mount options
 */
/*
 * Token ids for fsopt_tokens.  The order matters: parse_fsopt_token()
 * treats everything before Opt_last_int as taking an integer argument,
 * everything between Opt_last_int and Opt_last_string as taking a
 * string argument, and the remainder as plain flags.
 */
enum {
	/* int args */
	Opt_wsize,
	Opt_rsize,
	Opt_rasize,
	Opt_caps_wanted_delay_min,
	Opt_caps_wanted_delay_max,
	Opt_cap_release_safety,
	Opt_readdir_max_entries,
	Opt_readdir_max_bytes,
	Opt_congestion_kb,
	Opt_last_int,
	/* int args above */

	/* string args */
	Opt_snapdirname,
	Opt_last_string,
	/* string args above */

	/* flags, no argument */
	Opt_dirstat,
	Opt_nodirstat,
	Opt_rbytes,
	Opt_norbytes,
	Opt_asyncreaddir,
	Opt_noasyncreaddir,
	Opt_dcache,
	Opt_nodcache,
	Opt_ino32,
	Opt_noino32,
};

142
static match_table_t fsopt_tokens = {
S
Sage Weil 已提交
143 144
	{Opt_wsize, "wsize=%d"},
	{Opt_rsize, "rsize=%d"},
S
Sage Weil 已提交
145
	{Opt_rasize, "rasize=%d"},
S
Sage Weil 已提交
146 147
	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
148
	{Opt_cap_release_safety, "cap_release_safety=%d"},
S
Sage Weil 已提交
149
	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
150
	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
Y
Yehuda Sadeh 已提交
151
	{Opt_congestion_kb, "write_congestion_kb=%d"},
S
Sage Weil 已提交
152 153 154 155 156 157 158
	/* int args above */
	{Opt_snapdirname, "snapdirname=%s"},
	/* string args above */
	{Opt_dirstat, "dirstat"},
	{Opt_nodirstat, "nodirstat"},
	{Opt_rbytes, "rbytes"},
	{Opt_norbytes, "norbytes"},
159
	{Opt_asyncreaddir, "asyncreaddir"},
S
Sage Weil 已提交
160
	{Opt_noasyncreaddir, "noasyncreaddir"},
161 162
	{Opt_dcache, "dcache"},
	{Opt_nodcache, "nodcache"},
Y
Yehuda Sadeh 已提交
163
	{Opt_ino32, "ino32"},
164
	{Opt_noino32, "noino32"},
S
Sage Weil 已提交
165 166 167
	{-1, NULL}
};

168
static int parse_fsopt_token(char *c, void *private)
S
Sage Weil 已提交
169
{
170 171 172 173 174 175 176 177 178 179 180 181 182 183
	struct ceph_mount_options *fsopt = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token((char *)c, fsopt_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
S
Sage Weil 已提交
184
		}
185 186 187 188 189 190
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else {
		dout("got token %d\n", token);
S
Sage Weil 已提交
191 192
	}

193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
	switch (token) {
	case Opt_snapdirname:
		kfree(fsopt->snapdir_name);
		fsopt->snapdir_name = kstrndup(argstr[0].from,
					       argstr[0].to-argstr[0].from,
					       GFP_KERNEL);
		if (!fsopt->snapdir_name)
			return -ENOMEM;
		break;

		/* misc */
	case Opt_wsize:
		fsopt->wsize = intval;
		break;
	case Opt_rsize:
		fsopt->rsize = intval;
		break;
S
Sage Weil 已提交
210 211 212
	case Opt_rasize:
		fsopt->rasize = intval;
		break;
213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
	case Opt_caps_wanted_delay_min:
		fsopt->caps_wanted_delay_min = intval;
		break;
	case Opt_caps_wanted_delay_max:
		fsopt->caps_wanted_delay_max = intval;
		break;
	case Opt_readdir_max_entries:
		fsopt->max_readdir = intval;
		break;
	case Opt_readdir_max_bytes:
		fsopt->max_readdir_bytes = intval;
		break;
	case Opt_congestion_kb:
		fsopt->congestion_kb = intval;
		break;
	case Opt_dirstat:
		fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
		break;
	case Opt_nodirstat:
		fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
		break;
	case Opt_rbytes:
		fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
		break;
	case Opt_norbytes:
		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
		break;
240 241 242
	case Opt_asyncreaddir:
		fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
		break;
243 244 245
	case Opt_noasyncreaddir:
		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
		break;
246 247 248 249 250 251
	case Opt_dcache:
		fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
		break;
	case Opt_nodcache:
		fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
		break;
Y
Yehuda Sadeh 已提交
252 253 254
	case Opt_ino32:
		fsopt->flags |= CEPH_MOUNT_OPT_INO32;
		break;
255 256 257
	case Opt_noino32:
		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
		break;
258 259 260 261
	default:
		BUG_ON(token);
	}
	return 0;
S
Sage Weil 已提交
262
}
S
Sage Weil 已提交
263

264
static void destroy_mount_options(struct ceph_mount_options *args)
S
Sage Weil 已提交
265
{
266 267 268 269
	dout("destroy_mount_options %p\n", args);
	kfree(args->snapdir_name);
	kfree(args);
}
S
Sage Weil 已提交
270

271 272 273 274 275 276 277 278 279 280
/*
 * strcmp() that tolerates NULL arguments.
 *
 * Two NULLs compare equal (0).  If only @s2 is NULL the result is -1,
 * if only @s1 is NULL the result is 1.  Otherwise defer to strcmp().
 */
static int strcmp_null(const char *s1, const char *s2)
{
	if (s1 && s2)
		return strcmp(s1, s2);
	if (s1)
		return -1;	/* only s2 is NULL */
	return s2 ? 1 : 0;	/* only s1 is NULL, or both are */
}
S
Sage Weil 已提交
281

282 283 284 285 286 287 288 289
static int compare_mount_options(struct ceph_mount_options *new_fsopt,
				 struct ceph_options *new_opt,
				 struct ceph_fs_client *fsc)
{
	struct ceph_mount_options *fsopt1 = new_fsopt;
	struct ceph_mount_options *fsopt2 = fsc->mount_options;
	int ofs = offsetof(struct ceph_mount_options, snapdir_name);
	int ret;
S
Sage Weil 已提交
290

291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
	ret = memcmp(fsopt1, fsopt2, ofs);
	if (ret)
		return ret;

	ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
	if (ret)
		return ret;

	return ceph_compare_options(new_opt, fsc->client);
}

static int parse_mount_options(struct ceph_mount_options **pfsopt,
			       struct ceph_options **popt,
			       int flags, char *options,
			       const char *dev_name,
			       const char **path)
{
	struct ceph_mount_options *fsopt;
	const char *dev_name_end;
310 311 312 313
	int err;

	if (!dev_name || !*dev_name)
		return -EINVAL;
314 315 316 317 318 319 320

	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
	if (!fsopt)
		return -ENOMEM;

	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);

321 322
	fsopt->sb_flags = flags;
	fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
323

324 325 326
	fsopt->rsize = CEPH_RSIZE_DEFAULT;
	fsopt->rasize = CEPH_RASIZE_DEFAULT;
	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
327 328
	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
329 330 331 332 333
	fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
	fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
	fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
	fsopt->congestion_kb = default_congestion_kb();

334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
	/*
	 * Distinguish the server list from the path in "dev_name".
	 * Internally we do not include the leading '/' in the path.
	 *
	 * "dev_name" will look like:
	 *     <server_spec>[,<server_spec>...]:[<path>]
	 * where
	 *     <server_spec> is <ip>[:<port>]
	 *     <path> is optional, but if present must begin with '/'
	 */
	dev_name_end = strchr(dev_name, '/');
	if (dev_name_end) {
		/* skip over leading '/' for path */
		*path = dev_name_end + 1;
	} else {
		/* path is empty */
		dev_name_end = dev_name + strlen(dev_name);
		*path = dev_name_end;
	}
353
	err = -EINVAL;
354 355 356
	dev_name_end--;		/* back up to ':' separator */
	if (*dev_name_end != ':') {
		pr_err("device name is missing path (no : separator in %s)\n",
357 358 359
				dev_name);
		goto out;
	}
360
	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
S
Sage Weil 已提交
361 362
	dout("server path '%s'\n", *path);

363
	*popt = ceph_parse_options(options, dev_name, dev_name_end,
364
				 parse_fsopt_token, (void *)fsopt);
365 366
	if (IS_ERR(*popt)) {
		err = PTR_ERR(*popt);
367
		goto out;
368
	}
369 370 371 372

	/* success */
	*pfsopt = fsopt;
	return 0;
S
Sage Weil 已提交
373

374
out:
375 376
	destroy_mount_options(fsopt);
	return err;
S
Sage Weil 已提交
377 378
}

379 380 381
/**
 * ceph_show_options - Show mount options in /proc/mounts
 * @m: seq_file to write to
382
 * @root: root of that (sub)tree
383
 */
384
static int ceph_show_options(struct seq_file *m, struct dentry *root)
S
Sage Weil 已提交
385
{
386
	struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
387 388 389 390 391 392 393 394 395 396 397 398
	struct ceph_mount_options *fsopt = fsc->mount_options;
	struct ceph_options *opt = fsc->client->options;

	if (opt->flags & CEPH_OPT_FSID)
		seq_printf(m, ",fsid=%pU", &opt->fsid);
	if (opt->flags & CEPH_OPT_NOSHARE)
		seq_puts(m, ",noshare");
	if (opt->flags & CEPH_OPT_NOCRC)
		seq_puts(m, ",nocrc");

	if (opt->name)
		seq_printf(m, ",name=%s", opt->name);
399
	if (opt->key)
400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415
		seq_puts(m, ",secret=<hidden>");

	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
		seq_printf(m, ",osdkeepalivetimeout=%d",
			   opt->osd_keepalive_timeout);

	if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
		seq_puts(m, ",dirstat");
	if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
		seq_puts(m, ",norbytes");
	if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
		seq_puts(m, ",noasyncreaddir");
416 417 418 419
	if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
		seq_puts(m, ",dcache");
	else
		seq_puts(m, ",nodcache");
420 421 422

	if (fsopt->wsize)
		seq_printf(m, ",wsize=%d", fsopt->wsize);
423
	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
424
		seq_printf(m, ",rsize=%d", fsopt->rsize);
S
Sage Weil 已提交
425
	if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
426
		seq_printf(m, ",rasize=%d", fsopt->rasize);
427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444
	if (fsopt->congestion_kb != default_congestion_kb())
		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
		seq_printf(m, ",caps_wanted_delay_min=%d",
			 fsopt->caps_wanted_delay_min);
	if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
		seq_printf(m, ",caps_wanted_delay_max=%d",
			   fsopt->caps_wanted_delay_max);
	if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
		seq_printf(m, ",cap_release_safety=%d",
			   fsopt->cap_release_safety);
	if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
		seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
	return 0;
S
Sage Weil 已提交
445 446 447
}

/*
448 449
 * handle any mon messages the standard library doesn't understand.
 * return error if we don't either.
S
Sage Weil 已提交
450
 */
451
static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
S
Sage Weil 已提交
452
{
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
	struct ceph_fs_client *fsc = client->private;
	int type = le16_to_cpu(msg->hdr.type);

	switch (type) {
	case CEPH_MSG_MDS_MAP:
		ceph_mdsc_handle_map(fsc->mdsc, msg);
		return 0;

	default:
		return -1;
	}
}

/*
 * create a new fs client
 */
469
static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
470 471 472
					struct ceph_options *opt)
{
	struct ceph_fs_client *fsc;
S
Sage Weil 已提交
473 474 475 476
	const unsigned supported_features =
		CEPH_FEATURE_FLOCK |
		CEPH_FEATURE_DIRLAYOUTHASH;
	const unsigned required_features = 0;
S
Sage Weil 已提交
477 478
	int err = -ENOMEM;

479 480
	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
	if (!fsc)
S
Sage Weil 已提交
481 482
		return ERR_PTR(-ENOMEM);

S
Sage Weil 已提交
483 484
	fsc->client = ceph_create_client(opt, fsc, supported_features,
					 required_features);
485 486 487 488 489 490
	if (IS_ERR(fsc->client)) {
		err = PTR_ERR(fsc->client);
		goto fail;
	}
	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
	fsc->client->monc.want_mdsmap = 1;
S
Sage Weil 已提交
491

492
	fsc->mount_options = fsopt;
S
Sage Weil 已提交
493

494 495
	fsc->sb = NULL;
	fsc->mount_state = CEPH_MOUNT_MOUNTING;
S
Sage Weil 已提交
496

497
	atomic_long_set(&fsc->writeback_count, 0);
S
Sage Weil 已提交
498

499
	err = bdi_init(&fsc->backing_dev_info);
500
	if (err < 0)
501
		goto fail_client;
502

S
Sage Weil 已提交
503
	err = -ENOMEM;
504 505 506 507 508
	/*
	 * The number of concurrent works can be high but they don't need
	 * to be processed in parallel, limit concurrency.
	 */
	fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
509
	if (fsc->wb_wq == NULL)
510
		goto fail_bdi;
511
	fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
512
	if (fsc->pg_inv_wq == NULL)
S
Sage Weil 已提交
513
		goto fail_wb_wq;
514
	fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
515
	if (fsc->trunc_wq == NULL)
S
Sage Weil 已提交
516 517
		goto fail_pg_inv_wq;

518 519
	/* set up mempools */
	err = -ENOMEM;
520 521 522
	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
			      fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
	if (!fsc->wb_pagevec_pool)
523 524
		goto fail_trunc_wq;

525
	/* caps */
526 527 528
	fsc->min_caps = fsopt->max_readdir;

	return fsc;
529

S
Sage Weil 已提交
530
fail_trunc_wq:
531
	destroy_workqueue(fsc->trunc_wq);
S
Sage Weil 已提交
532
fail_pg_inv_wq:
533
	destroy_workqueue(fsc->pg_inv_wq);
S
Sage Weil 已提交
534
fail_wb_wq:
535
	destroy_workqueue(fsc->wb_wq);
536
fail_bdi:
537 538 539
	bdi_destroy(&fsc->backing_dev_info);
fail_client:
	ceph_destroy_client(fsc->client);
S
Sage Weil 已提交
540
fail:
541
	kfree(fsc);
S
Sage Weil 已提交
542 543 544
	return ERR_PTR(err);
}

545
static void destroy_fs_client(struct ceph_fs_client *fsc)
S
Sage Weil 已提交
546
{
547
	dout("destroy_fs_client %p\n", fsc);
S
Sage Weil 已提交
548

549 550 551
	destroy_workqueue(fsc->wb_wq);
	destroy_workqueue(fsc->pg_inv_wq);
	destroy_workqueue(fsc->trunc_wq);
S
Sage Weil 已提交
552

553
	bdi_destroy(&fsc->backing_dev_info);
554

555
	mempool_destroy(fsc->wb_pagevec_pool);
S
Sage Weil 已提交
556

557
	destroy_mount_options(fsc->mount_options);
558

559
	ceph_fs_debugfs_cleanup(fsc);
S
Sage Weil 已提交
560

561
	ceph_destroy_client(fsc->client);
S
Sage Weil 已提交
562

563 564
	kfree(fsc);
	dout("destroy_fs_client %p done\n", fsc);
S
Sage Weil 已提交
565 566
}

567
/*
568
 * caches
569
 */
570 571 572 573 574 575
/*
 * Slab caches for ceph objects.  They are created by init_caches() and
 * destroyed by destroy_caches(); they are not static, so other files
 * can reference them.
 */
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;

static void ceph_inode_init_once(void *foo)
576
{
577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605
	struct ceph_inode_info *ci = foo;
	inode_init_once(&ci->vfs_inode);
}

/*
 * Create the four ceph slab caches.
 *
 * Returns 0 on success.  If any cache cannot be created, the ones
 * already created are destroyed and -ENOMEM is returned.
 */
static int __init init_caches(void)
{
	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
				      sizeof(struct ceph_inode_info),
				      __alignof__(struct ceph_inode_info),
				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
				      ceph_inode_init_once);
	if (!ceph_inode_cachep)
		return -ENOMEM;

	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
	if (!ceph_cap_cachep)
		goto destroy_inode_cache;

	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
	if (!ceph_dentry_cachep)
		goto destroy_cap_cache;

	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
	if (!ceph_file_cachep)
		goto destroy_dentry_cache;

	return 0;

	/* unwind in reverse order of creation */
destroy_dentry_cache:
	kmem_cache_destroy(ceph_dentry_cachep);
destroy_cap_cache:
	kmem_cache_destroy(ceph_cap_cachep);
destroy_inode_cache:
	kmem_cache_destroy(ceph_inode_cachep);
	return -ENOMEM;
}

617 618 619 620 621 622 623 624 625
/*
 * Destroy the slab caches created by init_caches().
 */
static void destroy_caches(void)
{
	struct kmem_cache *caches[] = {
		ceph_inode_cachep,
		ceph_cap_cachep,
		ceph_dentry_cachep,
		ceph_file_cachep,
	};
	size_t i;

	for (i = 0; i < sizeof(caches) / sizeof(caches[0]); i++)
		kmem_cache_destroy(caches[i]);
}


S
Sage Weil 已提交
626
/*
627 628
 * ceph_umount_begin - initiate forced umount.  Tear down the
 * mount, skipping steps that may hang while waiting for server(s).
S
Sage Weil 已提交
629
 */
630
static void ceph_umount_begin(struct super_block *sb)
S
Sage Weil 已提交
631
{
632 633 634 635 636 637 638
	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);

	dout("ceph_umount_begin - starting forced umount\n");
	if (!fsc)
		return;
	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
	return;
S
Sage Weil 已提交
639 640
}

641 642 643 644 645 646 647 648 649 650 651
/*
 * Superblock operations; installed on each superblock by ceph_set_super().
 */
static const struct super_operations ceph_super_ops = {
	/* inode lifetime */
	.alloc_inode	= ceph_alloc_inode,
	.destroy_inode	= ceph_destroy_inode,
	.write_inode	= ceph_write_inode,

	/* superblock-wide operations */
	.statfs		= ceph_statfs,
	.sync_fs	= ceph_sync_fs,
	.show_options	= ceph_show_options,

	/* teardown */
	.umount_begin	= ceph_umount_begin,
	.put_super	= ceph_put_super,
};

S
Sage Weil 已提交
652 653 654 655
/*
 * Bootstrap mount by opening the root directory.  Note the mount
 * @started time from caller, and time out if this takes too long.
 */
656
static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
S
Sage Weil 已提交
657 658 659
				       const char *path,
				       unsigned long started)
{
660
	struct ceph_mds_client *mdsc = fsc->mdsc;
S
Sage Weil 已提交
661 662 663 664 665 666 667 668
	struct ceph_mds_request *req = NULL;
	int err;
	struct dentry *root;

	/* open dir */
	dout("open_root_inode opening '%s'\n", path);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
	if (IS_ERR(req))
J
Julia Lawall 已提交
669
		return ERR_CAST(req);
S
Sage Weil 已提交
670 671 672 673
	req->r_path1 = kstrdup(path, GFP_NOFS);
	req->r_ino1.ino = CEPH_INO_ROOT;
	req->r_ino1.snap = CEPH_NOSNAP;
	req->r_started = started;
674
	req->r_timeout = fsc->client->options->mount_timeout * HZ;
S
Sage Weil 已提交
675 676 677 678
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
	req->r_num_caps = 2;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err == 0) {
A
Al Viro 已提交
679 680
		struct inode *inode = req->r_target_inode;
		req->r_target_inode = NULL;
S
Sage Weil 已提交
681
		dout("open_root_inode success\n");
A
Al Viro 已提交
682
		if (ceph_ino(inode) == CEPH_INO_ROOT &&
S
Sage Weil 已提交
683
		    fsc->sb->s_root == NULL) {
684
			root = d_make_root(inode);
A
Al Viro 已提交
685 686 687 688
			if (!root) {
				root = ERR_PTR(-ENOMEM);
				goto out;
			}
S
Sage Weil 已提交
689
		} else {
A
Al Viro 已提交
690
			root = d_obtain_alias(inode);
S
Sage Weil 已提交
691
		}
692
		ceph_init_dentry(root);
S
Sage Weil 已提交
693 694 695 696
		dout("open_root_inode success, root dentry is %p\n", root);
	} else {
		root = ERR_PTR(err);
	}
A
Al Viro 已提交
697
out:
S
Sage Weil 已提交
698 699 700 701
	ceph_mdsc_put_request(req);
	return root;
}

702 703 704



S
Sage Weil 已提交
705 706 707
/*
 * mount: join the ceph cluster, and open root directory.
 */
A
Al Viro 已提交
708
static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
S
Sage Weil 已提交
709 710 711 712 713
		      const char *path)
{
	int err;
	unsigned long started = jiffies;  /* note the start time */
	struct dentry *root;
714
	int first = 0;   /* first vfsmount for this super_block */
S
Sage Weil 已提交
715 716

	dout("mount start\n");
717
	mutex_lock(&fsc->client->mount_mutex);
S
Sage Weil 已提交
718

719
	err = __ceph_open_session(fsc->client, started);
S
Sage Weil 已提交
720 721 722 723
	if (err < 0)
		goto out;

	dout("mount opening root\n");
724
	root = open_root_dentry(fsc, "", started);
S
Sage Weil 已提交
725 726 727 728
	if (IS_ERR(root)) {
		err = PTR_ERR(root);
		goto out;
	}
729
	if (fsc->sb->s_root) {
S
Sage Weil 已提交
730
		dput(root);
731 732 733 734 735 736 737 738
	} else {
		fsc->sb->s_root = root;
		first = 1;

		err = ceph_fs_debugfs_init(fsc);
		if (err < 0)
			goto fail;
	}
S
Sage Weil 已提交
739 740 741 742 743

	if (path[0] == 0) {
		dget(root);
	} else {
		dout("mount opening base mountpoint\n");
744
		root = open_root_dentry(fsc, path, started);
S
Sage Weil 已提交
745 746
		if (IS_ERR(root)) {
			err = PTR_ERR(root);
747
			goto fail;
S
Sage Weil 已提交
748 749 750
		}
	}

751
	fsc->mount_state = CEPH_MOUNT_MOUNTED;
S
Sage Weil 已提交
752
	dout("mount success\n");
A
Al Viro 已提交
753 754
	mutex_unlock(&fsc->client->mount_mutex);
	return root;
S
Sage Weil 已提交
755 756

out:
757
	mutex_unlock(&fsc->client->mount_mutex);
A
Al Viro 已提交
758
	return ERR_PTR(err);
759 760 761 762 763 764 765

fail:
	if (first) {
		dput(fsc->sb->s_root);
		fsc->sb->s_root = NULL;
	}
	goto out;
S
Sage Weil 已提交
766 767 768 769
}

static int ceph_set_super(struct super_block *s, void *data)
{
770
	struct ceph_fs_client *fsc = data;
S
Sage Weil 已提交
771 772 773 774
	int ret;

	dout("set_super %p data %p\n", s, data);

775
	s->s_flags = fsc->mount_options->sb_flags;
S
Sage Weil 已提交
776 777
	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */

778 779
	s->s_fs_info = fsc;
	fsc->sb = s;
S
Sage Weil 已提交
780 781 782 783 784 785 786 787 788 789 790 791 792 793

	s->s_op = &ceph_super_ops;
	s->s_export_op = &ceph_export_ops;

	s->s_time_gran = 1000;  /* 1000 ns == 1 us */

	ret = set_anon_super(s, NULL);  /* what is that second arg for? */
	if (ret != 0)
		goto fail;

	return ret;

fail:
	s->s_fs_info = NULL;
794
	fsc->sb = NULL;
S
Sage Weil 已提交
795 796 797 798 799 800 801 802
	return ret;
}

/*
 * share superblock if same fs AND options
 */
static int ceph_compare_super(struct super_block *sb, void *data)
{
803 804 805 806
	struct ceph_fs_client *new = data;
	struct ceph_mount_options *fsopt = new->mount_options;
	struct ceph_options *opt = new->client->options;
	struct ceph_fs_client *other = ceph_sb_to_client(sb);
S
Sage Weil 已提交
807 808

	dout("ceph_compare_super %p\n", sb);
809 810 811 812

	if (compare_mount_options(fsopt, opt, other)) {
		dout("monitor(s)/mount options don't match\n");
		return 0;
S
Sage Weil 已提交
813
	}
814 815 816 817 818 819
	if ((opt->flags & CEPH_OPT_FSID) &&
	    ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
		dout("fsid doesn't match\n");
		return 0;
	}
	if (fsopt->sb_flags != other->mount_options->sb_flags) {
S
Sage Weil 已提交
820 821 822 823 824 825 826 827 828
		dout("flags differ\n");
		return 0;
	}
	return 1;
}

/*
 * construct our own bdi so we can control readahead, etc.
 */
829
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
830

831 832
static int ceph_register_bdi(struct super_block *sb,
			     struct ceph_fs_client *fsc)
S
Sage Weil 已提交
833 834 835
{
	int err;

S
Sage Weil 已提交
836 837
	/* set ra_pages based on rasize mount option? */
	if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
838
		fsc->backing_dev_info.ra_pages =
S
Sage Weil 已提交
839
			(fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
S
Sage Weil 已提交
840
			>> PAGE_SHIFT;
841 842 843 844
	else
		fsc->backing_dev_info.ra_pages =
			default_backing_dev_info.ra_pages;

845
	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
846
			   atomic_long_inc_return(&bdi_seq));
847
	if (!err)
848
		sb->s_bdi = &fsc->backing_dev_info;
S
Sage Weil 已提交
849 850 851
	return err;
}

A
Al Viro 已提交
852 853
static struct dentry *ceph_mount(struct file_system_type *fs_type,
		       int flags, const char *dev_name, void *data)
S
Sage Weil 已提交
854 855
{
	struct super_block *sb;
856
	struct ceph_fs_client *fsc;
A
Al Viro 已提交
857
	struct dentry *res;
S
Sage Weil 已提交
858 859
	int err;
	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
S
Sage Weil 已提交
860
	const char *path = NULL;
861 862
	struct ceph_mount_options *fsopt = NULL;
	struct ceph_options *opt = NULL;
S
Sage Weil 已提交
863

A
Al Viro 已提交
864
	dout("ceph_mount\n");
865
	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
A
Al Viro 已提交
866 867
	if (err < 0) {
		res = ERR_PTR(err);
868
		goto out_final;
A
Al Viro 已提交
869
	}
S
Sage Weil 已提交
870 871

	/* create client (which we may/may not use) */
872 873
	fsc = create_fs_client(fsopt, opt);
	if (IS_ERR(fsc)) {
A
Al Viro 已提交
874
		res = ERR_CAST(fsc);
N
Noah Watkins 已提交
875 876
		destroy_mount_options(fsopt);
		ceph_destroy_options(opt);
877 878
		goto out_final;
	}
S
Sage Weil 已提交
879

880
	err = ceph_mdsc_init(fsc);
A
Al Viro 已提交
881 882
	if (err < 0) {
		res = ERR_PTR(err);
883
		goto out;
A
Al Viro 已提交
884
	}
885 886

	if (ceph_test_opt(fsc->client, NOSHARE))
S
Sage Weil 已提交
887
		compare_super = NULL;
D
David Howells 已提交
888
	sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
S
Sage Weil 已提交
889
	if (IS_ERR(sb)) {
A
Al Viro 已提交
890
		res = ERR_CAST(sb);
S
Sage Weil 已提交
891 892 893
		goto out;
	}

894 895 896 897 898
	if (ceph_sb_to_client(sb) != fsc) {
		ceph_mdsc_destroy(fsc);
		destroy_fs_client(fsc);
		fsc = ceph_sb_to_client(sb);
		dout("get_sb got existing client %p\n", fsc);
S
Sage Weil 已提交
899
	} else {
900 901
		dout("get_sb using new client %p\n", fsc);
		err = ceph_register_bdi(sb, fsc);
A
Al Viro 已提交
902 903
		if (err < 0) {
			res = ERR_PTR(err);
S
Sage Weil 已提交
904
			goto out_splat;
A
Al Viro 已提交
905
		}
S
Sage Weil 已提交
906 907
	}

A
Al Viro 已提交
908 909
	res = ceph_real_mount(fsc, path);
	if (IS_ERR(res))
S
Sage Weil 已提交
910
		goto out_splat;
A
Al Viro 已提交
911 912 913
	dout("root %p inode %p ino %llx.%llx\n", res,
	     res->d_inode, ceph_vinop(res->d_inode));
	return res;
S
Sage Weil 已提交
914 915

out_splat:
916
	ceph_mdsc_close_sessions(fsc->mdsc);
917
	deactivate_locked_super(sb);
S
Sage Weil 已提交
918 919 920
	goto out_final;

out:
921 922
	ceph_mdsc_destroy(fsc);
	destroy_fs_client(fsc);
S
Sage Weil 已提交
923
out_final:
A
Al Viro 已提交
924 925
	dout("ceph_mount fail %ld\n", PTR_ERR(res));
	return res;
S
Sage Weil 已提交
926 927 928 929
}

static void ceph_kill_sb(struct super_block *s)
{
930
	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
S
Sage Weil 已提交
931
	dout("kill_sb %p\n", s);
932
	ceph_mdsc_pre_umount(fsc->mdsc);
S
Sage Weil 已提交
933
	kill_anon_super(s);    /* will call put_super after sb is r/o */
934 935
	ceph_mdsc_destroy(fsc);
	destroy_fs_client(fsc);
S
Sage Weil 已提交
936 937 938 939 940
}

static struct file_system_type ceph_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ceph",
A
Al Viro 已提交
941
	.mount		= ceph_mount,
S
Sage Weil 已提交
942 943 944 945 946 947 948 949 950
	.kill_sb	= ceph_kill_sb,
	.fs_flags	= FS_RENAME_DOES_D_MOVE,
};

/*
 * Two-level stringification: STRINGIFY(x) expands x first (if it is a
 * macro) and then turns the result into a string literal.
 */
#define _STRINGIFY(x) #x
#define STRINGIFY(x) _STRINGIFY(x)

/*
 * Module init: create the slab caches, initialize xattr support and
 * register the filesystem type.
 *
 * Returns 0 on success.  If registration fails, the xattr and cache
 * setup is undone and the error is returned.
 */
static int __init init_ceph(void)
{
	int ret;

	ret = init_caches();
	if (ret)
		return ret;

	ceph_xattr_init();
	ret = register_filesystem(&ceph_fs_type);
	if (ret) {
		ceph_xattr_exit();
		destroy_caches();
		return ret;
	}

	pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);

	return 0;
}

/*
 * Module exit: unregister the filesystem type, then undo the xattr and
 * slab cache setup done by init_ceph().
 */
static void __exit exit_ceph(void)
{
	dout("exit_ceph\n");

	unregister_filesystem(&ceph_fs_type);
	ceph_xattr_exit();
	destroy_caches();
}

/* module entry and exit points */
module_init(init_ceph);
module_exit(exit_ceph);

/* module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
MODULE_DESCRIPTION("Ceph filesystem for Linux");
MODULE_LICENSE("GPL");