nbd.c 40.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6
/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 * 
P
Pavel Machek 已提交
7
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
L
Linus Torvalds 已提交
8 9
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
10
 * This file is released under GPLv2 or later.
L
Linus Torvalds 已提交
11
 *
12
 * (part of code stolen from loop.c)
L
Linus Torvalds 已提交
13 14 15 16 17 18 19 20 21 22 23 24 25 26
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
27
#include <linux/mutex.h>
28 29 30
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
31
#include <linux/slab.h>
L
Linus Torvalds 已提交
32
#include <net/sock.h>
33
#include <linux/net.h>
34
#include <linux/kthread.h>
M
Markus Pargmann 已提交
35
#include <linux/types.h>
M
Markus Pargmann 已提交
36
#include <linux/debugfs.h>
J
Josef Bacik 已提交
37
#include <linux/blk-mq.h>
L
Linus Torvalds 已提交
38

39
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
40 41 42
#include <asm/types.h>

#include <linux/nbd.h>
J
Josef Bacik 已提交
43 44
#include <linux/nbd-netlink.h>
#include <net/genetlink.h>
L
Linus Torvalds 已提交
45

46 47 48
static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);

J
Josef Bacik 已提交
49 50 51
struct nbd_sock {
	struct socket *sock;
	struct mutex tx_lock;
J
Josef Bacik 已提交
52 53
	struct request *pending;
	int sent;
54 55
	bool dead;
	int fallback_index;
J
Josef Bacik 已提交
56 57
};

58 59 60 61 62 63
struct recv_thread_args {
	struct work_struct work;
	struct nbd_device *nbd;
	int index;
};

J
Josef Bacik 已提交
64 65
#define NBD_TIMEDOUT			0
#define NBD_DISCONNECT_REQUESTED	1
J
Josef Bacik 已提交
66
#define NBD_DISCONNECTED		2
67
#define NBD_HAS_PID_FILE		3
J
Josef Bacik 已提交
68 69
#define NBD_HAS_CONFIG_REF		4
#define NBD_BOUND			5
J
Josef Bacik 已提交
70

71
struct nbd_config {
M
Markus Pargmann 已提交
72
	u32 flags;
J
Josef Bacik 已提交
73
	unsigned long runtime_flags;
74

75
	struct nbd_sock **socks;
J
Josef Bacik 已提交
76
	int num_connections;
77

J
Josef Bacik 已提交
78 79
	atomic_t recv_threads;
	wait_queue_head_t recv_wq;
80
	loff_t blksize;
M
Markus Pargmann 已提交
81
	loff_t bytesize;
M
Markus Pargmann 已提交
82 83 84
#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;
#endif
85 86
};

87 88 89
struct nbd_device {
	struct blk_mq_tag_set tag_set;

J
Josef Bacik 已提交
90
	int index;
91 92 93 94 95 96 97 98 99
	refcount_t config_refs;
	struct nbd_config *config;
	struct mutex config_lock;
	struct gendisk *disk;

	struct task_struct *task_recv;
	struct task_struct *task_setup;
};

J
Josef Bacik 已提交
100 101
struct nbd_cmd {
	struct nbd_device *nbd;
102
	int index;
J
Josef Bacik 已提交
103
	struct completion send_complete;
J
Josef Bacik 已提交
104 105
};

M
Markus Pargmann 已提交
106 107 108 109 110 111
#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif

#define nbd_name(nbd) ((nbd)->disk->disk_name)

112
#define NBD_MAGIC 0x68797548
L
Linus Torvalds 已提交
113

114
static unsigned int nbds_max = 16;
L
Laurent Vivier 已提交
115
static int max_part;
116
static struct workqueue_struct *recv_workqueue;
117
static int part_shift;
L
Linus Torvalds 已提交
118

J
Josef Bacik 已提交
119 120
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);
121
static void nbd_config_put(struct nbd_device *nbd);
J
Josef Bacik 已提交
122
static void nbd_connect_reply(struct genl_info *info, int index);
J
Josef Bacik 已提交
123

124
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
L
Linus Torvalds 已提交
125
{
126
	return disk_to_dev(nbd->disk);
L
Linus Torvalds 已提交
127 128 129 130 131 132 133 134
}

static const char *nbdcmd_to_ascii(int cmd)
{
	switch (cmd) {
	case  NBD_CMD_READ: return "read";
	case NBD_CMD_WRITE: return "write";
	case  NBD_CMD_DISC: return "disconnect";
A
Alex Bligh 已提交
135
	case NBD_CMD_FLUSH: return "flush";
P
Paul Clements 已提交
136
	case  NBD_CMD_TRIM: return "trim/discard";
L
Linus Torvalds 已提交
137 138 139 140
	}
	return "invalid";
}

141 142 143 144 145 146 147 148 149 150 151 152 153 154
static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;

	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
}

static struct device_attribute pid_attr = {
	.attr = { .name = "pid", .mode = S_IRUGO},
	.show = pid_show,
};

155 156 157 158 159 160 161 162 163
static void nbd_mark_nsock_dead(struct nbd_sock *nsock)
{
	if (!nsock->dead)
		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
	nsock->dead = true;
	nsock->pending = NULL;
	nsock->sent = 0;
}

164
static void nbd_size_clear(struct nbd_device *nbd)
165
{
166 167 168 169
	if (nbd->config->bytesize) {
		set_capacity(nbd->disk, 0);
		kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
	}
170 171
}

172
static void nbd_size_update(struct nbd_device *nbd)
173
{
174 175 176 177
	struct nbd_config *config = nbd->config;
	blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
	blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
	set_capacity(nbd->disk, config->bytesize >> 9);
178 179 180
	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
}

181 182
static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
			 loff_t nr_blocks)
183
{
184 185 186
	struct nbd_config *config = nbd->config;
	config->blksize = blocksize;
	config->bytesize = blocksize * nr_blocks;
187
	nbd_size_update(nbd);
188 189
}

J
Josef Bacik 已提交
190
static void nbd_end_request(struct nbd_cmd *cmd)
L
Linus Torvalds 已提交
191
{
J
Josef Bacik 已提交
192 193
	struct nbd_device *nbd = cmd->nbd;
	struct request *req = blk_mq_rq_from_pdu(cmd);
194
	int error = req->errors ? -EIO : 0;
L
Linus Torvalds 已提交
195

J
Josef Bacik 已提交
196
	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
197
		error ? "failed" : "done");
L
Linus Torvalds 已提交
198

J
Josef Bacik 已提交
199
	blk_mq_complete_request(req, error);
L
Linus Torvalds 已提交
200 201
}

202 203 204
/*
 * Forcibly shutdown the socket causing all listeners to error
 */
205
static void sock_shutdown(struct nbd_device *nbd)
206
{
207
	struct nbd_config *config = nbd->config;
J
Josef Bacik 已提交
208
	int i;
M
Markus Pargmann 已提交
209

210
	if (config->num_connections == 0)
J
Josef Bacik 已提交
211
		return;
212
	if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
M
Markus Pargmann 已提交
213
		return;
M
Markus Pargmann 已提交
214

215 216
	for (i = 0; i < config->num_connections; i++) {
		struct nbd_sock *nsock = config->socks[i];
J
Josef Bacik 已提交
217 218
		mutex_lock(&nsock->tx_lock);
		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
219
		nbd_mark_nsock_dead(nsock);
J
Josef Bacik 已提交
220 221 222
		mutex_unlock(&nsock->tx_lock);
	}
	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
223 224
}

225 226
static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
						 bool reserved)
227
{
228 229
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
	struct nbd_device *nbd = cmd->nbd;
230 231 232 233 234 235 236 237
	struct nbd_config *config;

	if (!refcount_inc_not_zero(&nbd->config_refs)) {
		req->errors = -EIO;
		return BLK_EH_HANDLED;
	}

	config = nbd->config;
M
Markus Pargmann 已提交
238

239
	if (config->num_connections > 1) {
240 241 242 243 244 245
		dev_err_ratelimited(nbd_to_dev(nbd),
				    "Connection timed out, retrying\n");
		/*
		 * Hooray we have more connections, requeue this IO, the submit
		 * path will put it on a real connection.
		 */
246 247
		if (config->socks && config->num_connections > 1) {
			if (cmd->index < config->num_connections) {
248
				struct nbd_sock *nsock =
249
					config->socks[cmd->index];
250 251 252 253 254
				mutex_lock(&nsock->tx_lock);
				nbd_mark_nsock_dead(nsock);
				mutex_unlock(&nsock->tx_lock);
			}
			blk_mq_requeue_request(req, true);
255
			nbd_config_put(nbd);
256 257 258 259 260 261
			return BLK_EH_NOT_HANDLED;
		}
	} else {
		dev_err_ratelimited(nbd_to_dev(nbd),
				    "Connection timed out\n");
	}
262
	set_bit(NBD_TIMEDOUT, &config->runtime_flags);
263
	req->errors = -EIO;
J
Josef Bacik 已提交
264
	sock_shutdown(nbd);
265 266
	nbd_config_put(nbd);

267
	return BLK_EH_HANDLED;
268 269
}

L
Linus Torvalds 已提交
270 271 272
/*
 *  Send or receive packet.
 */
A
Al Viro 已提交
273
static int sock_xmit(struct nbd_device *nbd, int index, int send,
J
Josef Bacik 已提交
274
		     struct iov_iter *iter, int msg_flags, int *sent)
L
Linus Torvalds 已提交
275
{
276 277
	struct nbd_config *config = nbd->config;
	struct socket *sock = config->socks[index]->sock;
L
Linus Torvalds 已提交
278 279
	int result;
	struct msghdr msg;
280
	unsigned long pflags = current->flags;
L
Linus Torvalds 已提交
281

282
	if (unlikely(!sock)) {
283
		dev_err_ratelimited(disk_to_dev(nbd->disk),
284 285
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
286 287 288
		return -EINVAL;
	}

A
Al Viro 已提交
289
	msg.msg_iter = *iter;
290

291
	current->flags |= PF_MEMALLOC;
L
Linus Torvalds 已提交
292
	do {
293
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
L
Linus Torvalds 已提交
294 295 296 297 298 299
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

M
Markus Pargmann 已提交
300
		if (send)
301
			result = sock_sendmsg(sock, &msg);
M
Markus Pargmann 已提交
302
		else
303
			result = sock_recvmsg(sock, &msg, msg.msg_flags);
L
Linus Torvalds 已提交
304 305 306 307 308 309

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
J
Josef Bacik 已提交
310 311
		if (sent)
			*sent += result;
312
	} while (msg_data_left(&msg));
L
Linus Torvalds 已提交
313

314
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
L
Linus Torvalds 已提交
315 316 317 318

	return result;
}

319
/* always call with the tx_lock held */
J
Josef Bacik 已提交
320
static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
L
Linus Torvalds 已提交
321
{
J
Josef Bacik 已提交
322
	struct request *req = blk_mq_rq_from_pdu(cmd);
323 324
	struct nbd_config *config = nbd->config;
	struct nbd_sock *nsock = config->socks[index];
325
	int result;
A
Al Viro 已提交
326 327 328
	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
	struct iov_iter from;
329
	unsigned long size = blk_rq_bytes(req);
330
	struct bio *bio;
C
Christoph Hellwig 已提交
331
	u32 type;
J
Josef Bacik 已提交
332
	u32 tag = blk_mq_unique_tag(req);
J
Josef Bacik 已提交
333
	int sent = nsock->sent, skip = 0;
C
Christoph Hellwig 已提交
334

A
Al Viro 已提交
335 336
	iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));

337 338
	switch (req_op(req)) {
	case REQ_OP_DISCARD:
C
Christoph Hellwig 已提交
339
		type = NBD_CMD_TRIM;
340 341
		break;
	case REQ_OP_FLUSH:
C
Christoph Hellwig 已提交
342
		type = NBD_CMD_FLUSH;
343 344
		break;
	case REQ_OP_WRITE:
C
Christoph Hellwig 已提交
345
		type = NBD_CMD_WRITE;
346 347
		break;
	case REQ_OP_READ:
C
Christoph Hellwig 已提交
348
		type = NBD_CMD_READ;
349 350 351 352
		break;
	default:
		return -EIO;
	}
L
Linus Torvalds 已提交
353

354
	if (rq_data_dir(req) == WRITE &&
355
	    (config->flags & NBD_FLAG_READ_ONLY)) {
356 357 358 359 360
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Write on read-only\n");
		return -EIO;
	}

J
Josef Bacik 已提交
361 362 363 364 365 366 367 368 369 370 371
	/* We did a partial send previously, and we at least sent the whole
	 * request struct, so just go and send the rest of the pages in the
	 * request.
	 */
	if (sent) {
		if (sent >= sizeof(request)) {
			skip = sent - sizeof(request);
			goto send_pages;
		}
		iov_iter_advance(&from, sent);
	}
372
	cmd->index = index;
C
Christoph Hellwig 已提交
373
	request.type = htonl(type);
J
Josef Bacik 已提交
374
	if (type != NBD_CMD_FLUSH) {
A
Alex Bligh 已提交
375 376 377
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(size);
	}
J
Josef Bacik 已提交
378
	memcpy(request.handle, &tag, sizeof(tag));
L
Linus Torvalds 已提交
379

380
	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
J
Josef Bacik 已提交
381
		cmd, nbdcmd_to_ascii(type),
382
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
A
Al Viro 已提交
383
	result = sock_xmit(nbd, index, 1, &from,
J
Josef Bacik 已提交
384
			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
L
Linus Torvalds 已提交
385
	if (result <= 0) {
J
Josef Bacik 已提交
386 387 388 389 390 391 392 393 394 395 396 397
		if (result == -ERESTARTSYS) {
			/* If we havne't sent anything we can just return BUSY,
			 * however if we have sent something we need to make
			 * sure we only allow this req to be sent until we are
			 * completely done.
			 */
			if (sent) {
				nsock->pending = req;
				nsock->sent = sent;
			}
			return BLK_MQ_RQ_QUEUE_BUSY;
		}
398
		dev_err_ratelimited(disk_to_dev(nbd->disk),
399
			"Send control failed (result %d)\n", result);
400
		return -EAGAIN;
L
Linus Torvalds 已提交
401
	}
J
Josef Bacik 已提交
402
send_pages:
403
	if (type != NBD_CMD_WRITE)
J
Josef Bacik 已提交
404
		goto out;
405 406 407 408 409

	bio = req->bio;
	while (bio) {
		struct bio *next = bio->bi_next;
		struct bvec_iter iter;
410
		struct bio_vec bvec;
411 412 413

		bio_for_each_segment(bvec, bio, iter) {
			bool is_last = !next && bio_iter_last(bvec, iter);
414
			int flags = is_last ? 0 : MSG_MORE;
415

416
			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
J
Josef Bacik 已提交
417
				cmd, bvec.bv_len);
A
Al Viro 已提交
418 419
			iov_iter_bvec(&from, ITER_BVEC | WRITE,
				      &bvec, 1, bvec.bv_len);
J
Josef Bacik 已提交
420 421 422 423 424 425 426 427 428
			if (skip) {
				if (skip >= iov_iter_count(&from)) {
					skip -= iov_iter_count(&from);
					continue;
				}
				iov_iter_advance(&from, skip);
				skip = 0;
			}
			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
429
			if (result <= 0) {
J
Josef Bacik 已提交
430 431 432 433 434 435 436 437 438
				if (result == -ERESTARTSYS) {
					/* We've already sent the header, we
					 * have no choice but to set pending and
					 * return BUSY.
					 */
					nsock->pending = req;
					nsock->sent = sent;
					return BLK_MQ_RQ_QUEUE_BUSY;
				}
439
				dev_err(disk_to_dev(nbd->disk),
440 441
					"Send data failed (result %d)\n",
					result);
442
				return -EAGAIN;
443
			}
444 445 446 447 448 449 450 451
			/*
			 * The completion might already have come in,
			 * so break for the last one instead of letting
			 * the iterator do it. This prevents use-after-free
			 * of the bio.
			 */
			if (is_last)
				break;
L
Linus Torvalds 已提交
452
		}
453
		bio = next;
L
Linus Torvalds 已提交
454
	}
J
Josef Bacik 已提交
455 456 457
out:
	nsock->pending = NULL;
	nsock->sent = 0;
L
Linus Torvalds 已提交
458 459 460
	return 0;
}

461
static int nbd_disconnected(struct nbd_config *config)
462
{
463 464
	return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
		test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
465 466
}

L
Linus Torvalds 已提交
467
/* NULL returned = something went wrong, inform userspace */
J
Josef Bacik 已提交
468
static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
L
Linus Torvalds 已提交
469
{
470
	struct nbd_config *config = nbd->config;
L
Linus Torvalds 已提交
471 472
	int result;
	struct nbd_reply reply;
J
Josef Bacik 已提交
473 474 475
	struct nbd_cmd *cmd;
	struct request *req = NULL;
	u16 hwq;
J
Josef Bacik 已提交
476
	u32 tag;
A
Al Viro 已提交
477 478
	struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
	struct iov_iter to;
L
Linus Torvalds 已提交
479 480

	reply.magic = 0;
A
Al Viro 已提交
481
	iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
J
Josef Bacik 已提交
482
	result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
L
Linus Torvalds 已提交
483
	if (result <= 0) {
484
		if (!nbd_disconnected(config))
J
Josef Bacik 已提交
485 486
			dev_err(disk_to_dev(nbd->disk),
				"Receive control failed (result %d)\n", result);
487
		return ERR_PTR(result);
L
Linus Torvalds 已提交
488
	}
489 490

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
491
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
492
				(unsigned long)ntohl(reply.magic));
493
		return ERR_PTR(-EPROTO);
494 495
	}

J
Josef Bacik 已提交
496
	memcpy(&tag, reply.handle, sizeof(u32));
497

J
Josef Bacik 已提交
498 499 500 501 502 503 504 505
	hwq = blk_mq_unique_tag_to_hwq(tag);
	if (hwq < nbd->tag_set.nr_hw_queues)
		req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
				       blk_mq_unique_tag_to_tag(tag));
	if (!req || !blk_mq_request_started(req)) {
		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
			tag, req);
		return ERR_PTR(-ENOENT);
L
Linus Torvalds 已提交
506
	}
J
Josef Bacik 已提交
507
	cmd = blk_mq_rq_to_pdu(req);
L
Linus Torvalds 已提交
508
	if (ntohl(reply.error)) {
509
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
510
			ntohl(reply.error));
511
		req->errors = -EIO;
J
Josef Bacik 已提交
512
		return cmd;
L
Linus Torvalds 已提交
513 514
	}

J
Josef Bacik 已提交
515
	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd);
C
Christoph Hellwig 已提交
516
	if (rq_data_dir(req) != WRITE) {
517
		struct req_iterator iter;
518
		struct bio_vec bvec;
519 520

		rq_for_each_segment(bvec, req, iter) {
A
Al Viro 已提交
521 522
			iov_iter_bvec(&to, ITER_BVEC | READ,
				      &bvec, 1, bvec.bv_len);
J
Josef Bacik 已提交
523
			result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
524
			if (result <= 0) {
525
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
526
					result);
527 528 529 530 531 532 533
				/*
				 * If we've disconnected or we only have 1
				 * connection then we need to make sure we
				 * complete this request, otherwise error out
				 * and let the timeout stuff handle resubmitting
				 * this request onto another connection.
				 */
534 535
				if (nbd_disconnected(config) ||
				    config->num_connections <= 1) {
536 537 538 539
					req->errors = -EIO;
					return cmd;
				}
				return ERR_PTR(-EIO);
540
			}
541
			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
J
Josef Bacik 已提交
542
				cmd, bvec.bv_len);
L
Linus Torvalds 已提交
543
		}
J
Josef Bacik 已提交
544 545 546
	} else {
		/* See the comment in nbd_queue_rq. */
		wait_for_completion(&cmd->send_complete);
L
Linus Torvalds 已提交
547
	}
J
Josef Bacik 已提交
548
	return cmd;
L
Linus Torvalds 已提交
549 550
}

J
Josef Bacik 已提交
551
static void recv_work(struct work_struct *work)
L
Linus Torvalds 已提交
552
{
J
Josef Bacik 已提交
553 554 555 556
	struct recv_thread_args *args = container_of(work,
						     struct recv_thread_args,
						     work);
	struct nbd_device *nbd = args->nbd;
557
	struct nbd_config *config = nbd->config;
J
Josef Bacik 已提交
558
	struct nbd_cmd *cmd;
J
Josef Bacik 已提交
559
	int ret = 0;
L
Linus Torvalds 已提交
560

561
	while (1) {
J
Josef Bacik 已提交
562
		cmd = nbd_read_stat(nbd, args->index);
J
Josef Bacik 已提交
563
		if (IS_ERR(cmd)) {
564
			struct nbd_sock *nsock = config->socks[args->index];
565 566 567 568

			mutex_lock(&nsock->tx_lock);
			nbd_mark_nsock_dead(nsock);
			mutex_unlock(&nsock->tx_lock);
J
Josef Bacik 已提交
569
			ret = PTR_ERR(cmd);
570 571 572
			break;
		}

J
Josef Bacik 已提交
573
		nbd_end_request(cmd);
574
	}
575 576 577 578
	atomic_dec(&config->recv_threads);
	wake_up(&config->recv_wq);
	nbd_config_put(nbd);
	kfree(args);
L
Linus Torvalds 已提交
579 580
}

J
Josef Bacik 已提交
581
static void nbd_clear_req(struct request *req, void *data, bool reserved)
L
Linus Torvalds 已提交
582
{
J
Josef Bacik 已提交
583
	struct nbd_cmd *cmd;
L
Linus Torvalds 已提交
584

J
Josef Bacik 已提交
585 586 587
	if (!blk_mq_request_started(req))
		return;
	cmd = blk_mq_rq_to_pdu(req);
588
	req->errors = -EIO;
J
Josef Bacik 已提交
589 590 591 592 593 594
	nbd_end_request(cmd);
}

static void nbd_clear_que(struct nbd_device *nbd)
{
	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
595
	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
L
Linus Torvalds 已提交
596 597
}

598 599
static int find_fallback(struct nbd_device *nbd, int index)
{
600
	struct nbd_config *config = nbd->config;
601
	int new_index = -1;
602
	struct nbd_sock *nsock = config->socks[index];
603 604
	int fallback = nsock->fallback_index;

605
	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
606 607
		return new_index;

608
	if (config->num_connections <= 1) {
609 610 611 612 613
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Attempted send on invalid socket\n");
		return new_index;
	}

614 615
	if (fallback >= 0 && fallback < config->num_connections &&
	    !config->socks[fallback]->dead)
616 617 618
		return fallback;

	if (nsock->fallback_index < 0 ||
619 620
	    nsock->fallback_index >= config->num_connections ||
	    config->socks[nsock->fallback_index]->dead) {
621
		int i;
622
		for (i = 0; i < config->num_connections; i++) {
623 624
			if (i == index)
				continue;
625
			if (!config->socks[i]->dead) {
626 627 628 629 630 631 632 633 634 635 636 637 638 639
				new_index = i;
				break;
			}
		}
		nsock->fallback_index = new_index;
		if (new_index < 0) {
			dev_err_ratelimited(disk_to_dev(nbd->disk),
					    "Dead connection, failed to find a fallback\n");
			return new_index;
		}
	}
	new_index = nsock->fallback_index;
	return new_index;
}
640

J
Josef Bacik 已提交
641
static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
642
{
J
Josef Bacik 已提交
643 644
	struct request *req = blk_mq_rq_from_pdu(cmd);
	struct nbd_device *nbd = cmd->nbd;
645
	struct nbd_config *config;
J
Josef Bacik 已提交
646
	struct nbd_sock *nsock;
J
Josef Bacik 已提交
647
	int ret;
J
Josef Bacik 已提交
648

649 650 651 652 653 654 655 656
	if (!refcount_inc_not_zero(&nbd->config_refs)) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Socks array is empty\n");
		return -EINVAL;
	}
	config = nbd->config;

	if (index >= config->num_connections) {
657 658
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Attempted send on invalid socket\n");
659
		nbd_config_put(nbd);
J
Josef Bacik 已提交
660
		return -EINVAL;
J
Josef Bacik 已提交
661
	}
662
	req->errors = 0;
663
again:
664
	nsock = config->socks[index];
J
Josef Bacik 已提交
665
	mutex_lock(&nsock->tx_lock);
666 667
	if (nsock->dead) {
		index = find_fallback(nbd, index);
668 669 670 671
		if (index < 0) {
			ret = -EIO;
			goto out;
		}
J
Josef Bacik 已提交
672
		mutex_unlock(&nsock->tx_lock);
673
		goto again;
674 675
	}

J
Josef Bacik 已提交
676 677 678 679 680 681 682 683 684
	/* Handle the case that we have a pending request that was partially
	 * transmitted that _has_ to be serviced first.  We need to call requeue
	 * here so that it gets put _after_ the request that is already on the
	 * dispatch list.
	 */
	if (unlikely(nsock->pending && nsock->pending != req)) {
		blk_mq_requeue_request(req, true);
		ret = 0;
		goto out;
685
	}
686 687 688 689
	/*
	 * Some failures are related to the link going down, so anything that
	 * returns EAGAIN can be retried on a different socket.
	 */
J
Josef Bacik 已提交
690
	ret = nbd_send_cmd(nbd, cmd, index);
691 692 693 694 695 696 697
	if (ret == -EAGAIN) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Request send failed trying another connection\n");
		nbd_mark_nsock_dead(nsock);
		mutex_unlock(&nsock->tx_lock);
		goto again;
	}
J
Josef Bacik 已提交
698
out:
J
Josef Bacik 已提交
699
	mutex_unlock(&nsock->tx_lock);
700
	nbd_config_put(nbd);
J
Josef Bacik 已提交
701
	return ret;
702 703
}

J
Josef Bacik 已提交
704 705
static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
			const struct blk_mq_queue_data *bd)
L
Linus Torvalds 已提交
706
{
J
Josef Bacik 已提交
707
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
J
Josef Bacik 已提交
708
	int ret;
L
Linus Torvalds 已提交
709

J
Josef Bacik 已提交
710 711 712 713 714 715 716 717 718 719
	/*
	 * Since we look at the bio's to send the request over the network we
	 * need to make sure the completion work doesn't mark this request done
	 * before we are done doing our send.  This keeps us from dereferencing
	 * freed data if we have particularly fast completions (ie we get the
	 * completion before we exit sock_xmit on the last bvec) or in the case
	 * that the server is misbehaving (or there was an error) before we're
	 * done sending everything over the wire.
	 */
	init_completion(&cmd->send_complete);
J
Josef Bacik 已提交
720
	blk_mq_start_request(bd->rq);
J
Josef Bacik 已提交
721 722 723 724 725 726 727 728 729 730 731

	/* We can be called directly from the user space process, which means we
	 * could possibly have signals pending so our sendmsg will fail.  In
	 * this case we need to return that we are busy, otherwise error out as
	 * appropriate.
	 */
	ret = nbd_handle_cmd(cmd, hctx->queue_num);
	if (ret < 0)
		ret = BLK_MQ_RQ_QUEUE_ERROR;
	if (!ret)
		ret = BLK_MQ_RQ_QUEUE_OK;
J
Josef Bacik 已提交
732 733
	complete(&cmd->send_complete);

J
Josef Bacik 已提交
734
	return ret;
L
Linus Torvalds 已提交
735 736
}

J
Josef Bacik 已提交
737 738
static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
			  bool netlink)
M
Markus Pargmann 已提交
739
{
740
	struct nbd_config *config = nbd->config;
J
Josef Bacik 已提交
741
	struct socket *sock;
J
Josef Bacik 已提交
742 743
	struct nbd_sock **socks;
	struct nbd_sock *nsock;
J
Josef Bacik 已提交
744 745 746 747 748
	int err;

	sock = sockfd_lookup(arg, &err);
	if (!sock)
		return err;
M
Markus Pargmann 已提交
749

J
Josef Bacik 已提交
750 751
	if (!netlink && !nbd->task_setup &&
	    !test_bit(NBD_BOUND, &config->runtime_flags))
J
Josef Bacik 已提交
752
		nbd->task_setup = current;
J
Josef Bacik 已提交
753 754 755 756

	if (!netlink &&
	    (nbd->task_setup != current ||
	     test_bit(NBD_BOUND, &config->runtime_flags))) {
J
Josef Bacik 已提交
757 758
		dev_err(disk_to_dev(nbd->disk),
			"Device being setup by another task");
J
Josef Bacik 已提交
759
		sockfd_put(sock);
J
Josef Bacik 已提交
760
		return -EBUSY;
M
Markus Pargmann 已提交
761 762
	}

763
	socks = krealloc(config->socks, (config->num_connections + 1) *
J
Josef Bacik 已提交
764
			 sizeof(struct nbd_sock *), GFP_KERNEL);
J
Josef Bacik 已提交
765 766
	if (!socks) {
		sockfd_put(sock);
J
Josef Bacik 已提交
767
		return -ENOMEM;
J
Josef Bacik 已提交
768
	}
J
Josef Bacik 已提交
769
	nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
J
Josef Bacik 已提交
770 771
	if (!nsock) {
		sockfd_put(sock);
J
Josef Bacik 已提交
772
		return -ENOMEM;
J
Josef Bacik 已提交
773
	}
J
Josef Bacik 已提交
774

775
	config->socks = socks;
M
Markus Pargmann 已提交
776

777 778
	nsock->fallback_index = -1;
	nsock->dead = false;
J
Josef Bacik 已提交
779 780
	mutex_init(&nsock->tx_lock);
	nsock->sock = sock;
J
Josef Bacik 已提交
781 782
	nsock->pending = NULL;
	nsock->sent = 0;
783
	socks[config->num_connections++] = nsock;
M
Markus Pargmann 已提交
784

J
Josef Bacik 已提交
785
	return 0;
M
Markus Pargmann 已提交
786 787
}

788 789 790
/* Reset all properties of an NBD device */
static void nbd_reset(struct nbd_device *nbd)
{
791
	nbd->config = NULL;
792
	nbd->tag_set.timeout = 0;
793 794 795 796 797
	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
}

static void nbd_bdev_reset(struct block_device *bdev)
{
798 799
	if (bdev->bd_openers > 1)
		return;
800
	bd_set_size(bdev, 0);
801 802 803 804 805 806
	if (max_part > 0) {
		blkdev_reread_part(bdev);
		bdev->bd_invalidated = 1;
	}
}

807
static void nbd_parse_flags(struct nbd_device *nbd)
808
{
809 810
	struct nbd_config *config = nbd->config;
	if (config->flags & NBD_FLAG_READ_ONLY)
811 812 813
		set_disk_ro(nbd->disk, true);
	else
		set_disk_ro(nbd->disk, false);
814
	if (config->flags & NBD_FLAG_SEND_TRIM)
815
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
816
	if (config->flags & NBD_FLAG_SEND_FLUSH)
817
		blk_queue_write_cache(nbd->disk->queue, true, false);
818
	else
819
		blk_queue_write_cache(nbd->disk->queue, false, false);
820 821
}

J
Josef Bacik 已提交
822 823
static void send_disconnects(struct nbd_device *nbd)
{
824
	struct nbd_config *config = nbd->config;
A
Al Viro 已提交
825 826 827 828 829 830
	struct nbd_request request = {
		.magic = htonl(NBD_REQUEST_MAGIC),
		.type = htonl(NBD_CMD_DISC),
	};
	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
	struct iov_iter from;
J
Josef Bacik 已提交
831 832
	int i, ret;

833
	for (i = 0; i < config->num_connections; i++) {
A
Al Viro 已提交
834
		iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
J
Josef Bacik 已提交
835
		ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
J
Josef Bacik 已提交
836 837 838 839 840 841
		if (ret <= 0)
			dev_err(disk_to_dev(nbd->disk),
				"Send disconnect failed %d\n", ret);
	}
}

842
static int nbd_disconnect(struct nbd_device *nbd)
J
Josef Bacik 已提交
843
{
844
	struct nbd_config *config = nbd->config;
M
Markus Pargmann 已提交
845

846
	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
J
Josef Bacik 已提交
847
	if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
848
			      &config->runtime_flags))
J
Josef Bacik 已提交
849 850 851 852
		send_disconnects(nbd);
	return 0;
}

853
static void nbd_clear_sock(struct nbd_device *nbd)
P
Pavel Machek 已提交
854
{
J
Josef Bacik 已提交
855 856
	sock_shutdown(nbd);
	nbd_clear_que(nbd);
857 858 859 860 861 862 863 864 865
	nbd->task_setup = NULL;
}

static void nbd_config_put(struct nbd_device *nbd)
{
	if (refcount_dec_and_mutex_lock(&nbd->config_refs,
					&nbd->config_lock)) {
		struct nbd_config *config = nbd->config;
		nbd_dev_dbg_close(nbd);
866
		nbd_size_clear(nbd);
867 868 869 870
		if (test_and_clear_bit(NBD_HAS_PID_FILE,
				       &config->runtime_flags))
			device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
		nbd->task_recv = NULL;
871
		nbd_clear_sock(nbd);
872 873 874 875 876 877 878 879 880 881 882 883
		if (config->num_connections) {
			int i;
			for (i = 0; i < config->num_connections; i++) {
				sockfd_put(config->socks[i]->sock);
				kfree(config->socks[i]);
			}
			kfree(config->socks);
		}
		nbd_reset(nbd);
		mutex_unlock(&nbd->config_lock);
		module_put(THIS_MODULE);
	}
J
Josef Bacik 已提交
884 885
}

J
Josef Bacik 已提交
886
static int nbd_start_device(struct nbd_device *nbd)
J
Josef Bacik 已提交
887
{
888 889
	struct nbd_config *config = nbd->config;
	int num_connections = config->num_connections;
J
Josef Bacik 已提交
890
	int error = 0, i;
P
Pavel Machek 已提交
891

J
Josef Bacik 已提交
892 893
	if (nbd->task_recv)
		return -EBUSY;
894
	if (!config->socks)
J
Josef Bacik 已提交
895 896
		return -EINVAL;
	if (num_connections > 1 &&
897
	    !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
J
Josef Bacik 已提交
898
		dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
899
		return -EINVAL;
J
Josef Bacik 已提交
900
	}
M
Markus Pargmann 已提交
901

902
	blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
J
Josef Bacik 已提交
903
	nbd->task_recv = current;
M
Markus Pargmann 已提交
904

905
	nbd_parse_flags(nbd);
M
Markus Pargmann 已提交
906

J
Josef Bacik 已提交
907 908 909
	error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
	if (error) {
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
910
		return error;
P
Pavel Machek 已提交
911
	}
912
	set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
913

J
Josef Bacik 已提交
914 915
	nbd_dev_dbg_init(nbd);
	for (i = 0; i < num_connections; i++) {
916 917 918 919 920 921 922 923 924 925 926 927 928 929
		struct recv_thread_args *args;

		args = kzalloc(sizeof(*args), GFP_KERNEL);
		if (!args) {
			sock_shutdown(nbd);
			return -ENOMEM;
		}
		sk_set_memalloc(config->socks[i]->sock->sk);
		atomic_inc(&config->recv_threads);
		refcount_inc(&nbd->config_refs);
		INIT_WORK(&args->work, recv_work);
		args->nbd = nbd;
		args->index = i;
		queue_work(recv_workqueue, &args->work);
930
	}
J
Josef Bacik 已提交
931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947
	return error;
}

static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
{
	struct nbd_config *config = nbd->config;
	int ret;

	ret = nbd_start_device(nbd);
	if (ret)
		return ret;

	bd_set_size(bdev, config->bytesize);
	if (max_part)
		bdev->bd_invalidated = 1;
	mutex_unlock(&nbd->config_lock);
	ret = wait_event_interruptible(config->recv_wq,
948
					 atomic_read(&config->recv_threads) == 0);
J
Josef Bacik 已提交
949
	if (ret)
950
		sock_shutdown(nbd);
J
Josef Bacik 已提交
951
	mutex_lock(&nbd->config_lock);
J
Josef Bacik 已提交
952
	bd_set_size(bdev, 0);
J
Josef Bacik 已提交
953
	/* user requested, ignore socket errors */
954
	if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
J
Josef Bacik 已提交
955
		ret = 0;
956
	if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
J
Josef Bacik 已提交
957 958
		ret = -ETIMEDOUT;
	return ret;
J
Josef Bacik 已提交
959 960
}

961 962 963 964 965 966
static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
				 struct block_device *bdev)
{
	nbd_clear_sock(nbd);
	kill_bdev(bdev);
	nbd_bdev_reset(bdev);
J
Josef Bacik 已提交
967 968 969
	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
			       &nbd->config->runtime_flags))
		nbd_config_put(nbd);
970 971
}

J
Josef Bacik 已提交
972 973 974 975
/* Must be called with config_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
		       unsigned int cmd, unsigned long arg)
{
976 977
	struct nbd_config *config = nbd->config;

J
Josef Bacik 已提交
978 979
	switch (cmd) {
	case NBD_DISCONNECT:
980
		return nbd_disconnect(nbd);
J
Josef Bacik 已提交
981
	case NBD_CLEAR_SOCK:
982 983
		nbd_clear_sock_ioctl(nbd, bdev);
		return 0;
J
Josef Bacik 已提交
984
	case NBD_SET_SOCK:
J
Josef Bacik 已提交
985
		return nbd_add_socket(nbd, arg, false);
J
Josef Bacik 已提交
986
	case NBD_SET_BLKSIZE:
987
		nbd_size_set(nbd, arg,
988
			     div_s64(config->bytesize, arg));
989
		return 0;
L
Linus Torvalds 已提交
990
	case NBD_SET_SIZE:
991
		nbd_size_set(nbd, config->blksize,
992
			     div_s64(arg, config->blksize));
993
		return 0;
994
	case NBD_SET_SIZE_BLOCKS:
995
		nbd_size_set(nbd, config->blksize, arg);
996
		return 0;
997
	case NBD_SET_TIMEOUT:
J
Josef Bacik 已提交
998 999 1000 1001
		if (arg) {
			nbd->tag_set.timeout = arg * HZ;
			blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
		}
1002
		return 0;
P
Pavel Machek 已提交
1003

P
Paul Clements 已提交
1004
	case NBD_SET_FLAGS:
1005
		config->flags = arg;
P
Paul Clements 已提交
1006
		return 0;
J
Josef Bacik 已提交
1007
	case NBD_DO_IT:
J
Josef Bacik 已提交
1008
		return nbd_start_device_ioctl(nbd, bdev);
L
Linus Torvalds 已提交
1009
	case NBD_CLEAR_QUE:
1010 1011 1012 1013
		/*
		 * This is for compatibility only.  The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
L
Linus Torvalds 已提交
1014 1015
		return 0;
	case NBD_PRINT_DEBUG:
J
Josef Bacik 已提交
1016 1017 1018 1019
		/*
		 * For compatibility only, we no longer keep a list of
		 * outstanding requests.
		 */
L
Linus Torvalds 已提交
1020 1021
		return 0;
	}
P
Pavel Machek 已提交
1022 1023 1024 1025 1026 1027
	return -ENOTTY;
}

static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
1028
	struct nbd_device *nbd = bdev->bd_disk->private_data;
J
Josef Bacik 已提交
1029 1030
	struct nbd_config *config = nbd->config;
	int error = -EINVAL;
P
Pavel Machek 已提交
1031 1032 1033 1034

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

J
Josef Bacik 已提交
1035
	mutex_lock(&nbd->config_lock);
J
Josef Bacik 已提交
1036 1037 1038 1039 1040 1041 1042 1043 1044

	/* Don't allow ioctl operations on a nbd device that was created with
	 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
	 */
	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
	    (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
		error = __nbd_ioctl(bdev, nbd, cmd, arg);
	else
		dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
J
Josef Bacik 已提交
1045
	mutex_unlock(&nbd->config_lock);
P
Pavel Machek 已提交
1046
	return error;
L
Linus Torvalds 已提交
1047 1048
}

1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101
static struct nbd_config *nbd_alloc_config(void)
{
	struct nbd_config *config;

	config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
	if (!config)
		return NULL;
	atomic_set(&config->recv_threads, 0);
	init_waitqueue_head(&config->recv_wq);
	config->blksize = 1024;
	try_module_get(THIS_MODULE);
	return config;
}

static int nbd_open(struct block_device *bdev, fmode_t mode)
{
	struct nbd_device *nbd;
	int ret = 0;

	mutex_lock(&nbd_index_mutex);
	nbd = bdev->bd_disk->private_data;
	if (!nbd) {
		ret = -ENXIO;
		goto out;
	}
	if (!refcount_inc_not_zero(&nbd->config_refs)) {
		struct nbd_config *config;

		mutex_lock(&nbd->config_lock);
		if (refcount_inc_not_zero(&nbd->config_refs)) {
			mutex_unlock(&nbd->config_lock);
			goto out;
		}
		config = nbd->config = nbd_alloc_config();
		if (!config) {
			ret = -ENOMEM;
			mutex_unlock(&nbd->config_lock);
			goto out;
		}
		refcount_set(&nbd->config_refs, 1);
		mutex_unlock(&nbd->config_lock);
	}
out:
	mutex_unlock(&nbd_index_mutex);
	return ret;
}

static void nbd_release(struct gendisk *disk, fmode_t mode)
{
	struct nbd_device *nbd = disk->private_data;
	nbd_config_put(nbd);
}

1102
static const struct block_device_operations nbd_fops =
L
Linus Torvalds 已提交
1103 1104
{
	.owner =	THIS_MODULE,
1105 1106
	.open =		nbd_open,
	.release =	nbd_release,
1107
	.ioctl =	nbd_ioctl,
A
Al Viro 已提交
1108
	.compat_ioctl =	nbd_ioctl,
L
Linus Torvalds 已提交
1109 1110
};

M
Markus Pargmann 已提交
1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137
#if IS_ENABLED(CONFIG_DEBUG_FS)

static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;

	if (nbd->task_recv)
		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));

	return 0;
}

static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
}

static const struct file_operations nbd_dbg_tasks_ops = {
	.open = nbd_dbg_tasks_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;
1138
	u32 flags = nbd->config->flags;
M
Markus Pargmann 已提交
1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170

	seq_printf(s, "Hex: 0x%08x\n\n", flags);

	seq_puts(s, "Known flags:\n");

	if (flags & NBD_FLAG_HAS_FLAGS)
		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
	if (flags & NBD_FLAG_READ_ONLY)
		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
	if (flags & NBD_FLAG_SEND_FLUSH)
		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
	if (flags & NBD_FLAG_SEND_TRIM)
		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");

	return 0;
}

static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_flags_show, inode->i_private);
}

static const struct file_operations nbd_dbg_flags_ops = {
	.open = nbd_dbg_flags_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	struct dentry *dir;
1171
	struct nbd_config *config = nbd->config;
1172 1173 1174

	if (!nbd_dbg_dir)
		return -EIO;
M
Markus Pargmann 已提交
1175 1176

	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1177 1178 1179 1180
	if (!dir) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
			nbd_name(nbd));
		return -EIO;
M
Markus Pargmann 已提交
1181
	}
1182
	config->dbg_dir = dir;
M
Markus Pargmann 已提交
1183

1184
	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
1185
	debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1186
	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1187
	debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
1188
	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
M
Markus Pargmann 已提交
1189 1190 1191 1192 1193 1194

	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
1195
	debugfs_remove_recursive(nbd->config->dbg_dir);
M
Markus Pargmann 已提交
1196 1197 1198 1199 1200 1201 1202
}

static int nbd_dbg_init(void)
{
	struct dentry *dbg_dir;

	dbg_dir = debugfs_create_dir("nbd", NULL);
1203 1204
	if (!dbg_dir)
		return -EIO;
M
Markus Pargmann 已提交
1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237

	nbd_dbg_dir = dbg_dir;

	return 0;
}

static void nbd_dbg_close(void)
{
	debugfs_remove_recursive(nbd_dbg_dir);
}

#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

static int nbd_dbg_init(void)
{
	return 0;
}

static void nbd_dbg_close(void)
{
}

#endif

J
Josef Bacik 已提交
1238 1239 1240 1241 1242 1243 1244 1245 1246
static int nbd_init_request(void *data, struct request *rq,
			    unsigned int hctx_idx, unsigned int request_idx,
			    unsigned int numa_node)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
	cmd->nbd = data;
	return 0;
}

1247
static const struct blk_mq_ops nbd_mq_ops = {
J
Josef Bacik 已提交
1248 1249
	.queue_rq	= nbd_queue_rq,
	.init_request	= nbd_init_request,
1250
	.timeout	= nbd_xmit_timeout,
J
Josef Bacik 已提交
1251 1252
};

1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292
static void nbd_dev_remove(struct nbd_device *nbd)
{
	struct gendisk *disk = nbd->disk;
	if (disk) {
		del_gendisk(disk);
		blk_cleanup_queue(disk->queue);
		blk_mq_free_tag_set(&nbd->tag_set);
		put_disk(disk);
	}
	kfree(nbd);
}

static int nbd_dev_add(int index)
{
	struct nbd_device *nbd;
	struct gendisk *disk;
	struct request_queue *q;
	int err = -ENOMEM;

	nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
	if (!nbd)
		goto out;

	disk = alloc_disk(1 << part_shift);
	if (!disk)
		goto out_free_nbd;

	if (index >= 0) {
		err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
				GFP_KERNEL);
		if (err == -ENOSPC)
			err = -EEXIST;
	} else {
		err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
		if (err >= 0)
			index = err;
	}
	if (err < 0)
		goto out_free_disk;

J
Josef Bacik 已提交
1293
	nbd->index = index;
1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325
	nbd->disk = disk;
	nbd->tag_set.ops = &nbd_mq_ops;
	nbd->tag_set.nr_hw_queues = 1;
	nbd->tag_set.queue_depth = 128;
	nbd->tag_set.numa_node = NUMA_NO_NODE;
	nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
		BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
	nbd->tag_set.driver_data = nbd;

	err = blk_mq_alloc_tag_set(&nbd->tag_set);
	if (err)
		goto out_free_idr;

	q = blk_mq_init_queue(&nbd->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_free_tags;
	}
	disk->queue = q;

	/*
	 * Tell the block layer that we are not a rotational device
	 */
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
	disk->queue->limits.discard_granularity = 512;
	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
	blk_queue_max_hw_sectors(disk->queue, 65536);
	disk->queue->limits.max_sectors = 256;

	mutex_init(&nbd->config_lock);
1326
	refcount_set(&nbd->config_refs, 0);
1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347
	disk->major = NBD_MAJOR;
	disk->first_minor = index << part_shift;
	disk->fops = &nbd_fops;
	disk->private_data = nbd;
	sprintf(disk->disk_name, "nbd%d", index);
	nbd_reset(nbd);
	add_disk(disk);
	return index;

out_free_tags:
	blk_mq_free_tag_set(&nbd->tag_set);
out_free_idr:
	idr_remove(&nbd_index_idr, index);
out_free_disk:
	put_disk(disk);
out_free_nbd:
	kfree(nbd);
out:
	return err;
}

J
Josef Bacik 已提交
1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576
static int find_free_cb(int id, void *ptr, void *data)
{
	struct nbd_device *nbd = ptr;
	struct nbd_device **found = data;

	if (!refcount_read(&nbd->config_refs)) {
		*found = nbd;
		return 1;
	}
	return 0;
}

/* Netlink interface. */
static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
	[NBD_ATTR_INDEX]		=	{ .type = NLA_U32 },
	[NBD_ATTR_SIZE_BYTES]		=	{ .type = NLA_U64 },
	[NBD_ATTR_BLOCK_SIZE_BYTES]	=	{ .type = NLA_U64 },
	[NBD_ATTR_TIMEOUT]		=	{ .type = NLA_U64 },
	[NBD_ATTR_SERVER_FLAGS]		=	{ .type = NLA_U64 },
	[NBD_ATTR_CLIENT_FLAGS]		=	{ .type = NLA_U64 },
	[NBD_ATTR_SOCKETS]		=	{ .type = NLA_NESTED},
};

static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
	[NBD_SOCK_FD]			=	{ .type = NLA_U32 },
};

static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
{
	struct nbd_device *nbd = NULL;
	struct nbd_config *config;
	int index = -1;
	int ret;

	if (!netlink_capable(skb, CAP_SYS_ADMIN))
		return -EPERM;

	if (info->attrs[NBD_ATTR_INDEX])
		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
	if (!info->attrs[NBD_ATTR_SOCKETS]) {
		printk(KERN_ERR "nbd: must specify at least one socket\n");
		return -EINVAL;
	}
	if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
		printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
		return -EINVAL;
	}
again:
	mutex_lock(&nbd_index_mutex);
	if (index == -1) {
		ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
		if (ret == 0) {
			int new_index;
			new_index = nbd_dev_add(-1);
			if (new_index < 0) {
				mutex_unlock(&nbd_index_mutex);
				printk(KERN_ERR "nbd: failed to add new device\n");
				return ret;
			}
			nbd = idr_find(&nbd_index_idr, new_index);
		}
	} else {
		nbd = idr_find(&nbd_index_idr, index);
	}
	mutex_unlock(&nbd_index_mutex);
	if (!nbd) {
		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
		       index);
		return -EINVAL;
	}

	mutex_lock(&nbd->config_lock);
	if (refcount_read(&nbd->config_refs)) {
		mutex_unlock(&nbd->config_lock);
		if (index == -1)
			goto again;
		printk(KERN_ERR "nbd: nbd%d already in use\n", index);
		return -EBUSY;
	}
	if (WARN_ON(nbd->config)) {
		mutex_unlock(&nbd->config_lock);
		return -EINVAL;
	}
	config = nbd->config = nbd_alloc_config();
	if (!nbd->config) {
		mutex_unlock(&nbd->config_lock);
		printk(KERN_ERR "nbd: couldn't allocate config\n");
		return -ENOMEM;
	}
	refcount_set(&nbd->config_refs, 1);
	set_bit(NBD_BOUND, &config->runtime_flags);

	if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
		u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
		nbd_size_set(nbd, config->blksize,
			     div64_u64(bytes, config->blksize));
	}
	if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
		u64 bsize =
			nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
		nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
	}
	if (info->attrs[NBD_ATTR_TIMEOUT]) {
		u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
		nbd->tag_set.timeout = timeout * HZ;
		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
	}
	if (info->attrs[NBD_ATTR_SERVER_FLAGS])
		config->flags =
			nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
	if (info->attrs[NBD_ATTR_SOCKETS]) {
		struct nlattr *attr;
		int rem, fd;

		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
				    rem) {
			struct nlattr *socks[NBD_SOCK_MAX+1];

			if (nla_type(attr) != NBD_SOCK_ITEM) {
				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
				ret = -EINVAL;
				goto out;
			}
			ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
					       nbd_sock_policy);
			if (ret != 0) {
				printk(KERN_ERR "nbd: error processing sock list\n");
				ret = -EINVAL;
				goto out;
			}
			if (!socks[NBD_SOCK_FD])
				continue;
			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
			ret = nbd_add_socket(nbd, fd, true);
			if (ret)
				goto out;
		}
	}
	ret = nbd_start_device(nbd);
out:
	mutex_unlock(&nbd->config_lock);
	if (!ret) {
		set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
		refcount_inc(&nbd->config_refs);
		nbd_connect_reply(info, nbd->index);
	}
	nbd_config_put(nbd);
	return ret;
}

static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
{
	struct nbd_device *nbd;
	int index;

	if (!netlink_capable(skb, CAP_SYS_ADMIN))
		return -EPERM;

	if (!info->attrs[NBD_ATTR_INDEX]) {
		printk(KERN_ERR "nbd: must specify an index to disconnect\n");
		return -EINVAL;
	}
	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
	mutex_lock(&nbd_index_mutex);
	nbd = idr_find(&nbd_index_idr, index);
	mutex_unlock(&nbd_index_mutex);
	if (!nbd) {
		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
		       index);
		return -EINVAL;
	}
	if (!refcount_inc_not_zero(&nbd->config_refs))
		return 0;
	mutex_lock(&nbd->config_lock);
	nbd_disconnect(nbd);
	mutex_unlock(&nbd->config_lock);
	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
			       &nbd->config->runtime_flags))
		nbd_config_put(nbd);
	nbd_config_put(nbd);
	return 0;
}

static const struct genl_ops nbd_connect_genl_ops[] = {
	{
		.cmd	= NBD_CMD_CONNECT,
		.policy	= nbd_attr_policy,
		.doit	= nbd_genl_connect,
	},
	{
		.cmd	= NBD_CMD_DISCONNECT,
		.policy	= nbd_attr_policy,
		.doit	= nbd_genl_disconnect,
	},
};

static struct genl_family nbd_genl_family __ro_after_init = {
	.hdrsize	= 0,
	.name		= NBD_GENL_FAMILY_NAME,
	.version	= NBD_GENL_VERSION,
	.module		= THIS_MODULE,
	.ops		= nbd_connect_genl_ops,
	.n_ops		= ARRAY_SIZE(nbd_connect_genl_ops),
	.maxattr	= NBD_ATTR_MAX,
};

static void nbd_connect_reply(struct genl_info *info, int index)
{
	struct sk_buff *skb;
	void *msg_head;
	int ret;

	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
	if (!skb)
		return;
	msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
				     NBD_CMD_CONNECT);
	if (!msg_head) {
		nlmsg_free(skb);
		return;
	}
	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
	if (ret) {
		nlmsg_free(skb);
		return;
	}
	genlmsg_end(skb, msg_head);
	genlmsg_reply(skb, info);
}
L
Linus Torvalds 已提交
1577 1578 1579 1580 1581

static int __init nbd_init(void)
{
	int i;

1582
	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
L
Linus Torvalds 已提交
1583

L
Laurent Vivier 已提交
1584
	if (max_part < 0) {
1585
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
L
Laurent Vivier 已提交
1586 1587 1588 1589
		return -EINVAL;
	}

	part_shift = 0;
1590
	if (max_part > 0) {
L
Laurent Vivier 已提交
1591 1592
		part_shift = fls(max_part);

1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603
		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

1604 1605 1606 1607 1608
	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;
1609 1610 1611 1612
	recv_workqueue = alloc_workqueue("knbd-recv",
					 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!recv_workqueue)
		return -ENOMEM;
1613

1614 1615
	if (register_blkdev(NBD_MAJOR, "nbd")) {
		destroy_workqueue(recv_workqueue);
1616
		return -EIO;
1617
	}
L
Linus Torvalds 已提交
1618

J
Josef Bacik 已提交
1619 1620 1621 1622 1623
	if (genl_register_family(&nbd_genl_family)) {
		unregister_blkdev(NBD_MAJOR, "nbd");
		destroy_workqueue(recv_workqueue);
		return -EINVAL;
	}
M
Markus Pargmann 已提交
1624 1625
	nbd_dbg_init();

1626 1627 1628 1629 1630 1631
	mutex_lock(&nbd_index_mutex);
	for (i = 0; i < nbds_max; i++)
		nbd_dev_add(i);
	mutex_unlock(&nbd_index_mutex);
	return 0;
}
L
Linus Torvalds 已提交
1632

1633 1634 1635 1636
static int nbd_exit_cb(int id, void *ptr, void *data)
{
	struct nbd_device *nbd = ptr;
	nbd_dev_remove(nbd);
L
Linus Torvalds 已提交
1637 1638 1639 1640 1641
	return 0;
}

static void __exit nbd_cleanup(void)
{
M
Markus Pargmann 已提交
1642 1643
	nbd_dbg_close();

1644 1645
	idr_for_each(&nbd_index_idr, &nbd_exit_cb, NULL);
	idr_destroy(&nbd_index_idr);
J
Josef Bacik 已提交
1646
	genl_unregister_family(&nbd_genl_family);
1647
	destroy_workqueue(recv_workqueue);
L
Linus Torvalds 已提交
1648 1649 1650 1651 1652 1653 1654 1655 1656
	unregister_blkdev(NBD_MAJOR, "nbd");
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

1657
module_param(nbds_max, int, 0444);
L
Laurent Vivier 已提交
1658 1659 1660
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");