// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/file.h>
#include <linux/namei.h>
#include <linux/random.h>

#include "super.h"
#include "mds_client.h"
#include <linux/ceph/pagelist.h>

static u64 lock_secret;
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
                                         struct ceph_mds_request *req);

static inline u64 secure_addr(void *addr)
{
	u64 v = lock_secret ^ (u64)(unsigned long)addr;
	/*
	 * Set the most significant bit, so that MDS knows the 'owner'
	 * is sufficient to identify the owner of lock. (old code uses
	 * both 'owner' and 'pid')
	 */
	v |= (1ULL << 63);
	return v;
}

void __init ceph_flock_init(void)
{
	get_random_bytes(&lock_secret, sizeof(lock_secret));
}

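/*
 * These fl_ops keep ceph_inode_info::i_filelock_ref in step with the VFS:
 * the counter is bumped whenever a ceph file_lock is copied and dropped
 * when one is released, balancing the references taken below before lock
 * requests are sent to the MDS.
 */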
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
	struct inode *inode = file_inode(src->fl_file);
	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
}

static void ceph_fl_release_lock(struct file_lock *fl)
{
	struct inode *inode = file_inode(fl->fl_file);
	atomic_dec(&ceph_inode(inode)->i_filelock_ref);
}

static const struct file_lock_operations ceph_fl_lock_ops = {
	.fl_copy_lock = ceph_fl_copy_lock,
	.fl_release_private = ceph_fl_release_lock,
};

/**
 * Implement fcntl and flock locking functions.
 */
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
			     int cmd, u8 wait, struct file_lock *fl)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_mds_request *req;
	int err;
	u64 length = 0;
	u64 owner;

	if (operation == CEPH_MDS_OP_SETFILELOCK) {
		/*
		 * increasing i_filelock_ref closes race window between
		 * handling request reply and adding file_lock struct to
		 * inode. Otherwise, auth caps may get trimmed in the
		 * window. Caller function will decrease the counter.
		 */
		fl->fl_ops = &ceph_fl_lock_ops;
		atomic_inc(&ceph_inode(inode)->i_filelock_ref);
	}

	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
		wait = 0;

	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	/* mds requires start and length rather than start and end */
	if (LLONG_MAX == fl->fl_end)
		length = 0;
	else
		length = fl->fl_end - fl->fl_start + 1;

	owner = secure_addr(fl->fl_owner);

	dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
	     "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
	     (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
	     wait, fl->fl_type);

	req->r_args.filelock_change.rule = lock_type;
	req->r_args.filelock_change.type = cmd;
	req->r_args.filelock_change.owner = cpu_to_le64(owner);
	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
	req->r_args.filelock_change.length = cpu_to_le64(length);
	req->r_args.filelock_change.wait = wait;

	if (wait)
		req->r_wait_for_completion = ceph_lock_wait_for_completion;

	err = ceph_mdsc_do_request(mdsc, inode, req);

	if (operation == CEPH_MDS_OP_GETFILELOCK) {
		fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
			fl->fl_type = F_RDLCK;
		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
			fl->fl_type = F_WRLCK;
		else
			fl->fl_type = F_UNLCK;

		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
						 le64_to_cpu(req->r_reply_info.filelock_reply->length);
		if (length >= 1)
			fl->fl_end = length - 1;
		else
			fl->fl_end = 0;

	}
	ceph_mdsc_put_request(req);
	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
	     "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
	     (int)operation, (u64)fl->fl_pid, fl->fl_start,
	     length, wait, fl->fl_type, err);
	return err;
}

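/*
 * Called while waiting on a blocking SETFILELOCK request. If the wait is
 * interrupted, abort the request locally and tell the MDS to drop the
 * pending lock by sending an unlock on the matching *_INTR lock type,
 * then wait for the original request to become safe.
 */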
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
                                         struct ceph_mds_request *req)
{
	struct ceph_mds_request *intr_req;
	struct inode *inode = req->r_inode;
	int err, lock_type;

	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
		lock_type = CEPH_LOCK_FCNTL_INTR;
	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
		lock_type = CEPH_LOCK_FLOCK_INTR;
	else
		BUG_ON(1);
	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);

	err = wait_for_completion_interruptible(&req->r_completion);
	if (!err)
		return 0;

	dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
	     req->r_tid);

	mutex_lock(&mdsc->mutex);
	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		err = 0;
	} else {
		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
		mutex_unlock(&req->r_fill_mutex);

		if (!req->r_session) {
			// haven't sent the request
			err = 0;
		}
	}
	mutex_unlock(&mdsc->mutex);
	if (!err)
		return 0;

	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
					    USE_AUTH_MDS);
	if (IS_ERR(intr_req))
		return PTR_ERR(intr_req);

	intr_req->r_inode = inode;
	ihold(inode);
	intr_req->r_num_caps = 1;

	intr_req->r_args.filelock_change = req->r_args.filelock_change;
	intr_req->r_args.filelock_change.rule = lock_type;
	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;

	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
	ceph_mdsc_put_request(intr_req);

	if (err && err != -ERESTARTSYS)
		return err;

	wait_for_completion_killable(&req->r_safe_completion);
	return 0;
}

/**
 * Attempt to set an fcntl lock.
 * For now, this just goes away to the server. Later it may be more awesome.
 */
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	int err;
	u16 op = CEPH_MDS_OP_SETFILELOCK;
	u8 lock_cmd;
	u8 wait = 0;

	if (!(fl->fl_flags & FL_POSIX))
		return -ENOLCK;
	/* No mandatory locks */
	if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
		return -ENOLCK;

	dout("ceph_lock, fl_owner: %p", fl->fl_owner);

	/* set wait bit as appropriate, then make command as Ceph expects it */
	if (IS_GETLK(cmd))
		op = CEPH_MDS_OP_GETFILELOCK;
	else if (IS_SETLKW(cmd))
		wait = 1;

	if (op == CEPH_MDS_OP_SETFILELOCK) {
		/*
		 * increasing i_filelock_ref closes race window between
		 * handling request reply and adding file_lock struct to
		 * inode. Otherwise, i_auth_cap may get trimmed in the
		 * window. Caller function will decrease the counter.
		 */
		fl->fl_ops = &ceph_fl_lock_ops;
		atomic_inc(&ceph_inode(inode)->i_filelock_ref);
	}

	if (F_RDLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_SHARED;
	else if (F_WRLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
	if (!err) {
		if (op != CEPH_MDS_OP_GETFILELOCK) {
			dout("mds locked, locking locally");
			err = posix_lock_file(file, fl, NULL);
			if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
				/* undo! This should only happen if
				 * the kernel detects local
				 * deadlock. */
				ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
						  CEPH_LOCK_UNLOCK, 0, fl);
				dout("got %d on posix_lock_file, undid lock",
				     err);
			}
		}
	}
	return err;
}

int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	int err;
	u8 lock_cmd;
	u8 wait = 0;

	if (!(fl->fl_flags & FL_FLOCK))
		return -ENOLCK;
	/* No mandatory locks */
	if (fl->fl_type & LOCK_MAND)
		return -EOPNOTSUPP;

	dout("ceph_flock, fl_file: %p", fl->fl_file);

	/* see comment in ceph_lock */
	fl->fl_ops = &ceph_fl_lock_ops;
	atomic_inc(&ceph_inode(inode)->i_filelock_ref);

	if (IS_SETLKW(cmd))
		wait = 1;

	if (F_RDLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_SHARED;
	else if (F_WRLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
				inode, lock_cmd, wait, fl);
	if (!err) {
		err = locks_lock_file_wait(file, fl);
		if (err) {
			ceph_lock_message(CEPH_LOCK_FLOCK,
					  CEPH_MDS_OP_SETFILELOCK,
					  inode, CEPH_LOCK_UNLOCK, 0, fl);
			dout("got %d on locks_lock_file_wait, undid lock", err);
		}
	}
	return err;
}

/*
 * Fills in the passed counter variables, so you can prepare pagelist metadata
 * before calling ceph_encode_locks.
 */
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
	struct file_lock *lock;
	struct file_lock_context *ctx;

	*fcntl_count = 0;
	*flock_count = 0;

	ctx = inode->i_flctx;
	if (ctx) {
		spin_lock(&ctx->flc_lock);
		list_for_each_entry(lock, &ctx->flc_posix, fl_list)
			++(*fcntl_count);
		list_for_each_entry(lock, &ctx->flc_flock, fl_list)
			++(*flock_count);
		spin_unlock(&ctx->flc_lock);
	}
	dout("counted %d flock locks and %d fcntl locks",
	     *flock_count, *fcntl_count);
}

/*
 * Given a pointer to a lock, convert it to a ceph filelock
 */
static int lock_to_ceph_filelock(struct file_lock *lock,
				 struct ceph_filelock *cephlock)
{
	int err = 0;
	cephlock->start = cpu_to_le64(lock->fl_start);
	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
	cephlock->client = cpu_to_le64(0);
	cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
	cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));

	switch (lock->fl_type) {
	case F_RDLCK:
		cephlock->type = CEPH_LOCK_SHARED;
		break;
	case F_WRLCK:
		cephlock->type = CEPH_LOCK_EXCL;
		break;
	case F_UNLCK:
		cephlock->type = CEPH_LOCK_UNLOCK;
		break;
	default:
		dout("Have unknown lock type %d", lock->fl_type);
		err = -EINVAL;
	}

	return err;
}

/**
 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 * array. Must be called with inode->i_lock already held.
 * If we encounter more of a specific lock type than expected, return -ENOSPC.
 */
int ceph_encode_locks_to_buffer(struct inode *inode,
				struct ceph_filelock *flocks,
				int num_fcntl_locks, int num_flock_locks)
{
	struct file_lock *lock;
	struct file_lock_context *ctx = inode->i_flctx;
	int err = 0;
	int seen_fcntl = 0;
	int seen_flock = 0;
	int l = 0;

	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
	     num_fcntl_locks);

	if (!ctx)
		return 0;

	spin_lock(&ctx->flc_lock);
	list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
		++seen_fcntl;
		if (seen_fcntl > num_fcntl_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
	list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
		++seen_flock;
		if (seen_flock > num_flock_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
fail:
	spin_unlock(&ctx->flc_lock);
	return err;
}

/**
 * Copy the encoded flock and fcntl locks into the pagelist.
 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 * sequential flock locks.
 * Returns zero on success.
 */
int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
			   struct ceph_pagelist *pagelist,
			   int num_fcntl_locks, int num_flock_locks)
{
	int err = 0;
	__le32 nlocks;

	nlocks = cpu_to_le32(num_fcntl_locks);
	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
	if (err)
		goto out_fail;

	err = ceph_pagelist_append(pagelist, flocks,
				   num_fcntl_locks * sizeof(*flocks));
	if (err)
		goto out_fail;

	nlocks = cpu_to_le32(num_flock_locks);
	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
	if (err)
		goto out_fail;

	err = ceph_pagelist_append(pagelist,
				   &flocks[num_fcntl_locks],
				   num_flock_locks * sizeof(*flocks));
out_fail:
	return err;
}
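
/*
 * Illustrative sketch only (this helper is not part of the kernel): one way
 * a caller could combine ceph_count_locks(), ceph_encode_locks_to_buffer()
 * and ceph_locks_to_pagelist() when packing lock state for the MDS, loosely
 * modeled on the reconnect path in mds_client.c. Locks may be added between
 * the count and the encode, hence the retry on -ENOSPC. Assumes
 * <linux/slab.h> for kmalloc_array(); the locking and error handling of the
 * real caller are simplified.
 */
static int __maybe_unused ceph_pack_locks_example(struct inode *inode,
						  struct ceph_pagelist *pagelist)
{
	struct ceph_filelock *flocks;
	int num_fcntl_locks, num_flock_locks;
	int err;

encode_again:
	ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);

	flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,
			       sizeof(*flocks), GFP_NOFS);
	if (!flocks)
		return -ENOMEM;

	err = ceph_encode_locks_to_buffer(inode, flocks,
					  num_fcntl_locks, num_flock_locks);
	if (err) {
		kfree(flocks);
		if (err == -ENOSPC)
			goto encode_again;	/* a lock raced in; recount */
		return err;
	}

	/* format: __le32 count + fcntl locks, then __le32 count + flock locks */
	err = ceph_locks_to_pagelist(flocks, pagelist,
				     num_fcntl_locks, num_flock_locks);
	kfree(flocks);
	return err;
}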