/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 * 
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * This file is released under GPLv2 or later.
 *
 * (part of code stolen from loop.c)
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
27
#include <linux/mutex.h>
28 29 30
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
31
#include <linux/slab.h>
L
Linus Torvalds 已提交
32
#include <net/sock.h>
33
#include <linux/net.h>
34
#include <linux/kthread.h>
M
Markus Pargmann 已提交
35
#include <linux/types.h>
M
Markus Pargmann 已提交
36
#include <linux/debugfs.h>
L
Linus Torvalds 已提交
37 38 39 40 41 42

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>

43
struct nbd_device {
M
Markus Pargmann 已提交
44
	u32 flags;
45 46 47 48 49 50 51 52 53 54 55 56 57
	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
	int magic;

	spinlock_t queue_lock;
	struct list_head queue_head;	/* Requests waiting result */
	struct request *active_req;
	wait_queue_head_t active_wq;
	struct list_head waiting_queue;	/* Requests to be sent */
	wait_queue_head_t waiting_wq;

	struct mutex tx_lock;
	struct gendisk *disk;
	int blksize;
M
Markus Pargmann 已提交
58
	loff_t bytesize;
59
	int xmit_timeout;
60
	bool timedout;
61
	bool disconnect; /* a disconnect has been requested by user */
M
Markus Pargmann 已提交
62 63

	struct timer_list timeout_timer;
M
Markus Pargmann 已提交
64 65
	/* protects initialization and shutdown of the socket */
	spinlock_t sock_lock;
M
Markus Pargmann 已提交
66 67
	struct task_struct *task_recv;
	struct task_struct *task_send;
M
Markus Pargmann 已提交
68 69 70 71

#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;
#endif
72 73
};

M
Markus Pargmann 已提交
74 75 76 77 78 79
#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif

#define nbd_name(nbd) ((nbd)->disk->disk_name)

80
#define NBD_MAGIC 0x68797548
L
Linus Torvalds 已提交
81

82
static unsigned int nbds_max = 16;
83
static struct nbd_device *nbd_dev;
L
Laurent Vivier 已提交
84
static int max_part;
L
Linus Torvalds 已提交
85 86 87 88 89 90 91 92 93 94 95 96 97

/*
 * Use just one lock (or at most 1 per NIC). Two arguments for this:
 * 1. Each NIC is essentially a synchronization point for all servers
 *    accessed through that NIC so there's no need to have more locks
 *    than NICs anyway.
 * 2. More locks lead to more "Dirty cache line bouncing" which will slow
 *    down each lock to the point where they're actually slower than just
 *    a single lock.
 * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
 */
static DEFINE_SPINLOCK(nbd_lock);

98
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
L
Linus Torvalds 已提交
99
{
100
	return disk_to_dev(nbd->disk);
L
Linus Torvalds 已提交
101 102 103 104 105 106 107 108
}

static const char *nbdcmd_to_ascii(int cmd)
{
	switch (cmd) {
	case  NBD_CMD_READ: return "read";
	case NBD_CMD_WRITE: return "write";
	case  NBD_CMD_DISC: return "disconnect";
A
Alex Bligh 已提交
109
	case NBD_CMD_FLUSH: return "flush";
P
Paul Clements 已提交
110
	case  NBD_CMD_TRIM: return "trim/discard";
L
Linus Torvalds 已提交
111 112 113 114
	}
	return "invalid";
}

115
static void nbd_end_request(struct nbd_device *nbd, struct request *req)
L
Linus Torvalds 已提交
116
{
117
	int error = req->errors ? -EIO : 0;
118
	struct request_queue *q = req->q;
L
Linus Torvalds 已提交
119 120
	unsigned long flags;

121 122
	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req,
		error ? "failed" : "done");
L
Linus Torvalds 已提交
123 124

	spin_lock_irqsave(q->queue_lock, flags);
125
	__blk_end_request_all(req, error);
L
Linus Torvalds 已提交
126 127 128
	spin_unlock_irqrestore(q->queue_lock, flags);
}

129 130 131
/*
 * Forcibly shutdown the socket causing all listeners to error
 */
132
static void sock_shutdown(struct nbd_device *nbd)
133
{
M
Markus Pargmann 已提交
134 135 136 137
	spin_lock_irq(&nbd->sock_lock);

	if (!nbd->sock) {
		spin_unlock_irq(&nbd->sock_lock);
M
Markus Pargmann 已提交
138
		return;
M
Markus Pargmann 已提交
139
	}
M
Markus Pargmann 已提交
140 141 142

	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
	kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
M
Markus Pargmann 已提交
143
	sockfd_put(nbd->sock);
M
Markus Pargmann 已提交
144
	nbd->sock = NULL;
M
Markus Pargmann 已提交
145 146 147
	spin_unlock_irq(&nbd->sock_lock);

	del_timer(&nbd->timeout_timer);
148 149 150 151
}

static void nbd_xmit_timeout(unsigned long arg)
{
M
Markus Pargmann 已提交
152
	struct nbd_device *nbd = (struct nbd_device *)arg;
M
Markus Pargmann 已提交
153
	unsigned long flags;
M
Markus Pargmann 已提交
154 155 156 157

	if (list_empty(&nbd->queue_head))
		return;

M
Markus Pargmann 已提交
158
	spin_lock_irqsave(&nbd->sock_lock, flags);
M
Markus Pargmann 已提交
159

160
	nbd->timedout = true;
161

M
Markus Pargmann 已提交
162 163
	if (nbd->sock)
		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
M
Markus Pargmann 已提交
164

M
Markus Pargmann 已提交
165
	spin_unlock_irqrestore(&nbd->sock_lock, flags);
M
Markus Pargmann 已提交
166

M
Markus Pargmann 已提交
167
	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
168 169
}

L
Linus Torvalds 已提交
170 171 172
/*
 *  Send or receive packet.
 */
173
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
L
Linus Torvalds 已提交
174 175
		int msg_flags)
{
176
	struct socket *sock = nbd->sock;
L
Linus Torvalds 已提交
177 178 179
	int result;
	struct msghdr msg;
	struct kvec iov;
180
	unsigned long pflags = current->flags;
L
Linus Torvalds 已提交
181

182
	if (unlikely(!sock)) {
183
		dev_err(disk_to_dev(nbd->disk),
184 185
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
186 187 188
		return -EINVAL;
	}

189
	current->flags |= PF_MEMALLOC;
L
Linus Torvalds 已提交
190
	do {
191
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
L
Linus Torvalds 已提交
192 193 194 195 196 197 198 199
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

M
Markus Pargmann 已提交
200
		if (send)
L
Linus Torvalds 已提交
201
			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
M
Markus Pargmann 已提交
202
		else
203 204
			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
						msg.msg_flags);
L
Linus Torvalds 已提交
205 206 207 208 209 210 211 212 213 214

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		size -= result;
		buf += result;
	} while (size > 0);

215
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
L
Linus Torvalds 已提交
216

M
Markus Pargmann 已提交
217 218 219
	if (!send && nbd->xmit_timeout)
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

L
Linus Torvalds 已提交
220 221 222
	return result;
}

223
static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
L
Linus Torvalds 已提交
224 225 226 227
		int flags)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
228 229
	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
			   bvec->bv_len, flags);
L
Linus Torvalds 已提交
230 231 232 233
	kunmap(bvec->bv_page);
	return result;
}

234
/* always call with the tx_lock held */
235
static int nbd_send_req(struct nbd_device *nbd, struct request *req)
L
Linus Torvalds 已提交
236
{
237
	int result, flags;
L
Linus Torvalds 已提交
238
	struct nbd_request request;
239
	unsigned long size = blk_rq_bytes(req);
C
Christoph Hellwig 已提交
240 241 242 243 244 245 246 247 248 249 250 251
	u32 type;

	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
		type = NBD_CMD_DISC;
	else if (req->cmd_flags & REQ_DISCARD)
		type = NBD_CMD_TRIM;
	else if (req->cmd_flags & REQ_FLUSH)
		type = NBD_CMD_FLUSH;
	else if (rq_data_dir(req) == WRITE)
		type = NBD_CMD_WRITE;
	else
		type = NBD_CMD_READ;
L
Linus Torvalds 已提交
252

253
	memset(&request, 0, sizeof(request));
L
Linus Torvalds 已提交
254
	request.magic = htonl(NBD_REQUEST_MAGIC);
C
Christoph Hellwig 已提交
255 256
	request.type = htonl(type);
	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
A
Alex Bligh 已提交
257 258 259
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(size);
	}
L
Linus Torvalds 已提交
260 261
	memcpy(request.handle, &req, sizeof(req));

262
	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
C
Christoph Hellwig 已提交
263
		req, nbdcmd_to_ascii(type),
264
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
265
	result = sock_xmit(nbd, 1, &request, sizeof(request),
C
Christoph Hellwig 已提交
266
			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
L
Linus Torvalds 已提交
267
	if (result <= 0) {
268
		dev_err(disk_to_dev(nbd->disk),
269
			"Send control failed (result %d)\n", result);
270
		return -EIO;
L
Linus Torvalds 已提交
271 272
	}

C
Christoph Hellwig 已提交
273
	if (type == NBD_CMD_WRITE) {
274
		struct req_iterator iter;
275
		struct bio_vec bvec;
L
Linus Torvalds 已提交
276 277 278 279
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
280
		rq_for_each_segment(bvec, req, iter) {
281
			flags = 0;
K
Kent Overstreet 已提交
282
			if (!rq_iter_last(bvec, iter))
283
				flags = MSG_MORE;
284 285
			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
				req, bvec.bv_len);
286
			result = sock_send_bvec(nbd, &bvec, flags);
287
			if (result <= 0) {
288
				dev_err(disk_to_dev(nbd->disk),
289 290
					"Send data failed (result %d)\n",
					result);
291
				return -EIO;
292
			}
L
Linus Torvalds 已提交
293 294 295 296 297
		}
	}
	return 0;
}

298
static struct request *nbd_find_request(struct nbd_device *nbd,
299
					struct request *xreq)
L
Linus Torvalds 已提交
300
{
301
	struct request *req, *tmp;
302
	int err;
L
Linus Torvalds 已提交
303

304
	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
305
	if (unlikely(err))
306
		return ERR_PTR(err);
307

308 309
	spin_lock(&nbd->queue_lock);
	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
L
Linus Torvalds 已提交
310 311 312
		if (req != xreq)
			continue;
		list_del_init(&req->queuelist);
313
		spin_unlock(&nbd->queue_lock);
L
Linus Torvalds 已提交
314 315
		return req;
	}
316
	spin_unlock(&nbd->queue_lock);
317

318
	return ERR_PTR(-ENOENT);
L
Linus Torvalds 已提交
319 320
}

321
static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
L
Linus Torvalds 已提交
322 323 324
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
325
	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
L
Linus Torvalds 已提交
326 327 328 329 330 331
			MSG_WAITALL);
	kunmap(bvec->bv_page);
	return result;
}

/* NULL returned = something went wrong, inform userspace */
332
static struct request *nbd_read_stat(struct nbd_device *nbd)
L
Linus Torvalds 已提交
333 334 335 336 337 338
{
	int result;
	struct nbd_reply reply;
	struct request *req;

	reply.magic = 0;
339
	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
L
Linus Torvalds 已提交
340
	if (result <= 0) {
341
		dev_err(disk_to_dev(nbd->disk),
342
			"Receive control failed (result %d)\n", result);
343
		return ERR_PTR(result);
L
Linus Torvalds 已提交
344
	}
345 346

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
347
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
348
				(unsigned long)ntohl(reply.magic));
349
		return ERR_PTR(-EPROTO);
350 351
	}

352
	req = nbd_find_request(nbd, *(struct request **)reply.handle);
353
	if (IS_ERR(req)) {
354 355
		result = PTR_ERR(req);
		if (result != -ENOENT)
356
			return ERR_PTR(result);
357

358
		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
359
			reply.handle);
360
		return ERR_PTR(-EBADR);
L
Linus Torvalds 已提交
361 362 363
	}

	if (ntohl(reply.error)) {
364
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
365
			ntohl(reply.error));
L
Linus Torvalds 已提交
366 367 368 369
		req->errors++;
		return req;
	}

370
	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
C
Christoph Hellwig 已提交
371
	if (rq_data_dir(req) != WRITE) {
372
		struct req_iterator iter;
373
		struct bio_vec bvec;
374 375

		rq_for_each_segment(bvec, req, iter) {
376
			result = sock_recv_bvec(nbd, &bvec);
377
			if (result <= 0) {
378
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
379
					result);
380 381 382
				req->errors++;
				return req;
			}
383 384
			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
				req, bvec.bv_len);
L
Linus Torvalds 已提交
385 386 387 388 389
		}
	}
	return req;
}

390 391
static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
392
{
393
	struct gendisk *disk = dev_to_disk(dev);
M
Markus Pargmann 已提交
394
	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
395

M
Markus Pargmann 已提交
396
	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
397 398
}

399
static struct device_attribute pid_attr = {
400
	.attr = { .name = "pid", .mode = S_IRUGO},
401 402 403
	.show = pid_show,
};

404
static int nbd_thread_recv(struct nbd_device *nbd)
L
Linus Torvalds 已提交
405 406
{
	struct request *req;
407
	int ret;
L
Linus Torvalds 已提交
408

409
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
410

411
	sk_set_memalloc(nbd->sock->sk);
M
Markus Pargmann 已提交
412 413 414

	nbd->task_recv = current;

415
	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
416
	if (ret) {
417
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
M
Markus Pargmann 已提交
418

M
Markus Pargmann 已提交
419
		nbd->task_recv = NULL;
M
Markus Pargmann 已提交
420

421 422
		return ret;
	}
423

424 425 426 427 428 429 430
	while (1) {
		req = nbd_read_stat(nbd);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

431
		nbd_end_request(nbd, req);
432
	}
433

M
Markus Pargmann 已提交
434 435
	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);

M
Markus Pargmann 已提交
436 437 438
	nbd->task_recv = NULL;

	return ret;
L
Linus Torvalds 已提交
439 440
}

441
static void nbd_clear_que(struct nbd_device *nbd)
L
Linus Torvalds 已提交
442 443 444
{
	struct request *req;

445
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
446

447
	/*
448
	 * Because we have set nbd->sock to NULL under the tx_lock, all
449 450 451 452 453 454
	 * modifications to the list must have completed by now.  For
	 * the same reason, the active_req must be NULL.
	 *
	 * As a consequence, we don't need to take the spin lock while
	 * purging the list here.
	 */
455 456
	BUG_ON(nbd->sock);
	BUG_ON(nbd->active_req);
457

458 459
	while (!list_empty(&nbd->queue_head)) {
		req = list_entry(nbd->queue_head.next, struct request,
460 461 462
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
463
		nbd_end_request(nbd, req);
464
	}
465 466 467 468 469 470

	while (!list_empty(&nbd->waiting_queue)) {
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
471
		nbd_end_request(nbd, req);
472
	}
473
	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
L
Linus Torvalds 已提交
474 475
}

476

477
static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
478
{
479
	if (req->cmd_type != REQ_TYPE_FS)
480 481
		goto error_out;

C
Christoph Hellwig 已提交
482 483 484 485 486
	if (rq_data_dir(req) == WRITE &&
	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
		dev_err(disk_to_dev(nbd->disk),
			"Write on read-only\n");
		goto error_out;
A
Alex Bligh 已提交
487 488
	}

489 490
	req->errors = 0;

491 492 493 494
	mutex_lock(&nbd->tx_lock);
	if (unlikely(!nbd->sock)) {
		mutex_unlock(&nbd->tx_lock);
		dev_err(disk_to_dev(nbd->disk),
495
			"Attempted send on closed socket\n");
P
Pavel Machek 已提交
496
		goto error_out;
497 498
	}

499
	nbd->active_req = req;
500

M
Markus Pargmann 已提交
501 502 503
	if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

504 505
	if (nbd_send_req(nbd, req) != 0) {
		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
506
		req->errors++;
507
		nbd_end_request(nbd, req);
508
	} else {
509
		spin_lock(&nbd->queue_lock);
510
		list_add_tail(&req->queuelist, &nbd->queue_head);
511
		spin_unlock(&nbd->queue_lock);
512 513
	}

514 515 516
	nbd->active_req = NULL;
	mutex_unlock(&nbd->tx_lock);
	wake_up_all(&nbd->active_wq);
517 518 519 520 521

	return;

error_out:
	req->errors++;
522
	nbd_end_request(nbd, req);
523 524
}

525
static int nbd_thread_send(void *data)
526
{
527
	struct nbd_device *nbd = data;
528 529
	struct request *req;

M
Markus Pargmann 已提交
530 531
	nbd->task_send = current;

532
	set_user_nice(current, MIN_NICE);
533
	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
534
		/* wait for something to do */
535
		wait_event_interruptible(nbd->waiting_wq,
536
					 kthread_should_stop() ||
537
					 !list_empty(&nbd->waiting_queue));
538 539

		/* extract request */
540
		if (list_empty(&nbd->waiting_queue))
541 542
			continue;

543 544
		spin_lock_irq(&nbd->queue_lock);
		req = list_entry(nbd->waiting_queue.next, struct request,
545 546
				 queuelist);
		list_del_init(&req->queuelist);
547
		spin_unlock_irq(&nbd->queue_lock);
548 549

		/* handle request */
550
		nbd_handle_req(nbd, req);
551
	}
M
Markus Pargmann 已提交
552 553 554

	nbd->task_send = NULL;

555 556 557
	return 0;
}

L
Linus Torvalds 已提交
558 559 560
/*
 * We always wait for result of write, for now. It would be nice to make it optional
 * in future
 * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
 *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
 */

565
static void nbd_request_handler(struct request_queue *q)
A
Alex Elder 已提交
566
		__releases(q->queue_lock) __acquires(q->queue_lock)
L
Linus Torvalds 已提交
567 568 569
{
	struct request *req;
	
570
	while ((req = blk_fetch_request(q)) != NULL) {
571
		struct nbd_device *nbd;
L
Linus Torvalds 已提交
572

573 574
		spin_unlock_irq(q->queue_lock);

575
		nbd = req->rq_disk->private_data;
L
Linus Torvalds 已提交
576

577
		BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
578

579 580 581
		dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n",
			req, req->cmd_type);

582 583
		if (unlikely(!nbd->sock)) {
			dev_err(disk_to_dev(nbd->disk),
584
				"Attempted send on closed socket\n");
585
			req->errors++;
586
			nbd_end_request(nbd, req);
587 588 589 590
			spin_lock_irq(q->queue_lock);
			continue;
		}

591 592 593
		spin_lock_irq(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->waiting_queue);
		spin_unlock_irq(&nbd->queue_lock);
L
Linus Torvalds 已提交
594

595
		wake_up(&nbd->waiting_wq);
596

L
Linus Torvalds 已提交
597 598 599 600
		spin_lock_irq(q->queue_lock);
	}
}

M
Markus Pargmann 已提交
601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619
/*
 * Install @sock as the device's transport socket.
 * Returns 0 on success or -EBUSY if a socket is already attached.
 */
static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
{
	int ret = 0;

	spin_lock_irq(&nbd->sock_lock);
	if (nbd->sock)
		ret = -EBUSY;
	else
		nbd->sock = sock;
	spin_unlock_irq(&nbd->sock_lock);

	return ret;
}

620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643
/* Reset all properties of an NBD device */
static void nbd_reset(struct nbd_device *nbd)
{
	nbd->disconnect = false;
	nbd->timedout = false;
	nbd->blksize = 1024;
	nbd->bytesize = 0;
	set_capacity(nbd->disk, 0);
	nbd->flags = 0;
	nbd->xmit_timeout = 0;
	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
	del_timer_sync(&nbd->timeout_timer);
}

static void nbd_bdev_reset(struct block_device *bdev)
{
	set_device_ro(bdev, false);
	bdev->bd_inode->i_size = 0;
	if (max_part > 0) {
		blkdev_reread_part(bdev);
		bdev->bd_invalidated = 1;
	}
}

644 645 646 647 648 649 650 651 652 653 654 655
/* Apply the server-advertised NBD_FLAG_* bits to the bdev and queue. */
static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
{
	unsigned int flush = (nbd->flags & NBD_FLAG_SEND_FLUSH) ? REQ_FLUSH : 0;

	if (nbd->flags & NBD_FLAG_READ_ONLY)
		set_device_ro(bdev, true);
	if (nbd->flags & NBD_FLAG_SEND_TRIM)
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
	blk_queue_flush(nbd->disk->queue, flush);
}

M
Markus Pargmann 已提交
656 657 658
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);

P
Pavel Machek 已提交
659
/* Must be called with tx_lock held */
L
Linus Torvalds 已提交
660

661
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
P
Pavel Machek 已提交
662 663
		       unsigned int cmd, unsigned long arg)
{
L
Linus Torvalds 已提交
664
	switch (cmd) {
P
Pavel Machek 已提交
665 666 667
	case NBD_DISCONNECT: {
		struct request sreq;

668
		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
669 670
		if (!nbd->sock)
			return -EINVAL;
P
Pavel Machek 已提交
671

672 673 674
		mutex_unlock(&nbd->tx_lock);
		fsync_bdev(bdev);
		mutex_lock(&nbd->tx_lock);
675
		blk_rq_init(NULL, &sreq);
676
		sreq.cmd_type = REQ_TYPE_DRV_PRIV;
677 678

		/* Check again after getting mutex back.  */
679
		if (!nbd->sock)
L
Linus Torvalds 已提交
680
			return -EINVAL;
681

682
		nbd->disconnect = true;
P
Paul Clements 已提交
683

684
		nbd_send_req(nbd, &sreq);
P
Paul Clements 已提交
685
		return 0;
P
Pavel Machek 已提交
686
	}
L
Linus Torvalds 已提交
687
 
M
Markus Pargmann 已提交
688 689
	case NBD_CLEAR_SOCK:
		sock_shutdown(nbd);
690 691
		nbd_clear_que(nbd);
		BUG_ON(!list_empty(&nbd->queue_head));
692
		BUG_ON(!list_empty(&nbd->waiting_queue));
693
		kill_bdev(bdev);
P
Pavel Machek 已提交
694 695 696
		return 0;

	case NBD_SET_SOCK: {
A
Al Viro 已提交
697
		int err;
M
Markus Pargmann 已提交
698 699 700 701 702 703 704 705 706 707
		struct socket *sock = sockfd_lookup(arg, &err);

		if (!sock)
			return err;

		err = nbd_set_socket(nbd, sock);
		if (!err && max_part)
			bdev->bd_invalidated = 1;

		return err;
P
Pavel Machek 已提交
708 709
	}

L
Linus Torvalds 已提交
710
	case NBD_SET_BLKSIZE:
711 712 713 714 715
		nbd->blksize = arg;
		nbd->bytesize &= ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
716
		return 0;
P
Pavel Machek 已提交
717

L
Linus Torvalds 已提交
718
	case NBD_SET_SIZE:
719 720 721 722
		nbd->bytesize = arg & ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
723
		return 0;
P
Pavel Machek 已提交
724

725
	case NBD_SET_TIMEOUT:
726
		nbd->xmit_timeout = arg * HZ;
M
Markus Pargmann 已提交
727 728 729 730 731 732
		if (arg)
			mod_timer(&nbd->timeout_timer,
				  jiffies + nbd->xmit_timeout);
		else
			del_timer_sync(&nbd->timeout_timer);

733
		return 0;
P
Pavel Machek 已提交
734

P
Paul Clements 已提交
735 736 737 738
	case NBD_SET_FLAGS:
		nbd->flags = arg;
		return 0;

L
Linus Torvalds 已提交
739
	case NBD_SET_SIZE_BLOCKS:
740 741 742 743
		nbd->bytesize = ((u64) arg) * nbd->blksize;
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
744
		return 0;
P
Pavel Machek 已提交
745 746 747 748 749

	case NBD_DO_IT: {
		struct task_struct *thread;
		int error;

M
Markus Pargmann 已提交
750
		if (nbd->task_recv)
751
			return -EBUSY;
A
Al Viro 已提交
752
		if (!nbd->sock)
L
Linus Torvalds 已提交
753
			return -EINVAL;
P
Pavel Machek 已提交
754

755
		mutex_unlock(&nbd->tx_lock);
P
Pavel Machek 已提交
756

757
		nbd_parse_flags(nbd, bdev);
P
Paul Clements 已提交
758

759
		thread = kthread_run(nbd_thread_send, nbd, "%s",
M
Markus Pargmann 已提交
760
				     nbd_name(nbd));
P
Pavel Machek 已提交
761
		if (IS_ERR(thread)) {
762
			mutex_lock(&nbd->tx_lock);
763
			return PTR_ERR(thread);
P
Pavel Machek 已提交
764
		}
765

M
Markus Pargmann 已提交
766
		nbd_dev_dbg_init(nbd);
767
		error = nbd_thread_recv(nbd);
M
Markus Pargmann 已提交
768
		nbd_dev_dbg_close(nbd);
769
		kthread_stop(thread);
P
Pavel Machek 已提交
770

771
		mutex_lock(&nbd->tx_lock);
772

773
		sock_shutdown(nbd);
774
		nbd_clear_que(nbd);
775
		kill_bdev(bdev);
776 777
		nbd_bdev_reset(bdev);

P
Paul Clements 已提交
778
		if (nbd->disconnect) /* user requested, ignore socket errors */
779 780 781 782
			error = 0;
		if (nbd->timedout)
			error = -ETIMEDOUT;

783 784
		nbd_reset(nbd);

785
		return error;
P
Pavel Machek 已提交
786 787
	}

L
Linus Torvalds 已提交
788
	case NBD_CLEAR_QUE:
789 790 791 792
		/*
		 * This is for compatibility only.  The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
L
Linus Torvalds 已提交
793
		return 0;
P
Pavel Machek 已提交
794

L
Linus Torvalds 已提交
795
	case NBD_PRINT_DEBUG:
796
		dev_info(disk_to_dev(nbd->disk),
797
			"next = %p, prev = %p, head = %p\n",
798 799
			nbd->queue_head.next, nbd->queue_head.prev,
			&nbd->queue_head);
L
Linus Torvalds 已提交
800 801
		return 0;
	}
P
Pavel Machek 已提交
802 803 804 805 806 807
	return -ENOTTY;
}

static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
808
	struct nbd_device *nbd = bdev->bd_disk->private_data;
P
Pavel Machek 已提交
809 810 811 812 813
	int error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

814
	BUG_ON(nbd->magic != NBD_MAGIC);
P
Pavel Machek 已提交
815

816 817 818
	mutex_lock(&nbd->tx_lock);
	error = __nbd_ioctl(bdev, nbd, cmd, arg);
	mutex_unlock(&nbd->tx_lock);
P
Pavel Machek 已提交
819 820

	return error;
L
Linus Torvalds 已提交
821 822
}

823
static const struct block_device_operations nbd_fops =
L
Linus Torvalds 已提交
824 825
{
	.owner =	THIS_MODULE,
826
	.ioctl =	nbd_ioctl,
A
Al Viro 已提交
827
	.compat_ioctl =	nbd_ioctl,
L
Linus Torvalds 已提交
828 829
};

M
Markus Pargmann 已提交
830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891
#if IS_ENABLED(CONFIG_DEBUG_FS)

static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;

	if (nbd->task_recv)
		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
	if (nbd->task_send)
		seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));

	return 0;
}

static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
}

static const struct file_operations nbd_dbg_tasks_ops = {
	.open = nbd_dbg_tasks_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* debugfs "flags": hex dump of nbd->flags plus the recognized flag names. */
static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
	static const struct {
		u32 bit;
		const char *name;
	} known[] = {
		{ NBD_FLAG_HAS_FLAGS,  "NBD_FLAG_HAS_FLAGS" },
		{ NBD_FLAG_READ_ONLY,  "NBD_FLAG_READ_ONLY" },
		{ NBD_FLAG_SEND_FLUSH, "NBD_FLAG_SEND_FLUSH" },
		{ NBD_FLAG_SEND_TRIM,  "NBD_FLAG_SEND_TRIM" },
	};
	struct nbd_device *nbd = s->private;
	u32 flags = nbd->flags;
	int i;

	seq_printf(s, "Hex: 0x%08x\n\n", flags);

	seq_puts(s, "Known flags:\n");

	for (i = 0; i < ARRAY_SIZE(known); i++) {
		if (flags & known[i].bit)
			seq_printf(s, "%s\n", known[i].name);
	}

	return 0;
}

static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_flags_show, inode->i_private);
}

static const struct file_operations nbd_dbg_flags_ops = {
	.open = nbd_dbg_flags_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	struct dentry *dir;
892 893 894

	if (!nbd_dbg_dir)
		return -EIO;
M
Markus Pargmann 已提交
895 896

	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
897 898 899 900
	if (!dir) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
			nbd_name(nbd));
		return -EIO;
M
Markus Pargmann 已提交
901 902 903
	}
	nbd->dbg_dir = dir;

904 905 906 907 908
	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
	debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
	debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
	debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
	debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops);
M
Markus Pargmann 已提交
909 910 911 912 913 914 915 916 917 918 919 920 921 922

	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
	debugfs_remove_recursive(nbd->dbg_dir);
}

static int nbd_dbg_init(void)
{
	struct dentry *dbg_dir;

	dbg_dir = debugfs_create_dir("nbd", NULL);
923 924
	if (!dbg_dir)
		return -EIO;
M
Markus Pargmann 已提交
925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957

	nbd_dbg_dir = dbg_dir;

	return 0;
}

static void nbd_dbg_close(void)
{
	debugfs_remove_recursive(nbd_dbg_dir);
}

#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

static int nbd_dbg_init(void)
{
	return 0;
}

static void nbd_dbg_close(void)
{
}

#endif

L
Linus Torvalds 已提交
958 959 960 961 962 963 964 965 966
/*
 * And here should be modules and kernel interface 
 *  (Just smiley confuses emacs :-)
 */

static int __init nbd_init(void)
{
	int err = -ENOMEM;
	int i;
L
Laurent Vivier 已提交
967
	int part_shift;
L
Linus Torvalds 已提交
968

969
	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
L
Linus Torvalds 已提交
970

L
Laurent Vivier 已提交
971
	if (max_part < 0) {
972
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
L
Laurent Vivier 已提交
973 974 975 976
		return -EINVAL;
	}

	part_shift = 0;
977
	if (max_part > 0) {
L
Laurent Vivier 已提交
978 979
		part_shift = fls(max_part);

980 981 982 983 984 985 986 987 988 989 990
		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

991 992 993 994 995 996
	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

S
Sudip Mukherjee 已提交
997 998 999 1000
	nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
	if (!nbd_dev)
		return -ENOMEM;

1001
	for (i = 0; i < nbds_max; i++) {
L
Laurent Vivier 已提交
1002
		struct gendisk *disk = alloc_disk(1 << part_shift);
L
Linus Torvalds 已提交
1003 1004 1005 1006 1007 1008 1009 1010
		if (!disk)
			goto out;
		nbd_dev[i].disk = disk;
		/*
		 * The new linux 2.5 block layer implementation requires
		 * every gendisk to have its very own request_queue struct.
		 * These structs are big so we dynamically allocate them.
		 */
1011
		disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock);
L
Linus Torvalds 已提交
1012 1013 1014 1015
		if (!disk->queue) {
			put_disk(disk);
			goto out;
		}
1016 1017 1018 1019
		/*
		 * Tell the block layer that we are not a rotational device
		 */
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
1020
		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
P
Paul Clements 已提交
1021
		disk->queue->limits.discard_granularity = 512;
1022
		blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
P
Paul Clements 已提交
1023
		disk->queue->limits.discard_zeroes_data = 0;
1024 1025
		blk_queue_max_hw_sectors(disk->queue, 65536);
		disk->queue->limits.max_sectors = 256;
L
Linus Torvalds 已提交
1026 1027 1028 1029 1030 1031 1032 1033 1034
	}

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		err = -EIO;
		goto out;
	}

	printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);

M
Markus Pargmann 已提交
1035 1036
	nbd_dbg_init();

1037
	for (i = 0; i < nbds_max; i++) {
L
Linus Torvalds 已提交
1038
		struct gendisk *disk = nbd_dev[i].disk;
1039
		nbd_dev[i].magic = NBD_MAGIC;
1040
		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
L
Linus Torvalds 已提交
1041
		spin_lock_init(&nbd_dev[i].queue_lock);
M
Markus Pargmann 已提交
1042
		spin_lock_init(&nbd_dev[i].sock_lock);
L
Linus Torvalds 已提交
1043
		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
1044
		mutex_init(&nbd_dev[i].tx_lock);
M
Markus Pargmann 已提交
1045 1046 1047
		init_timer(&nbd_dev[i].timeout_timer);
		nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
		nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
1048
		init_waitqueue_head(&nbd_dev[i].active_wq);
1049
		init_waitqueue_head(&nbd_dev[i].waiting_wq);
L
Linus Torvalds 已提交
1050
		disk->major = NBD_MAJOR;
L
Laurent Vivier 已提交
1051
		disk->first_minor = i << part_shift;
L
Linus Torvalds 已提交
1052 1053 1054
		disk->fops = &nbd_fops;
		disk->private_data = &nbd_dev[i];
		sprintf(disk->disk_name, "nbd%d", i);
1055
		nbd_reset(&nbd_dev[i]);
L
Linus Torvalds 已提交
1056 1057 1058 1059 1060 1061 1062 1063 1064
		add_disk(disk);
	}

	return 0;
out:
	while (i--) {
		blk_cleanup_queue(nbd_dev[i].disk->queue);
		put_disk(nbd_dev[i].disk);
	}
1065
	kfree(nbd_dev);
L
Linus Torvalds 已提交
1066 1067 1068 1069 1070 1071
	return err;
}

static void __exit nbd_cleanup(void)
{
	int i;
M
Markus Pargmann 已提交
1072 1073 1074

	nbd_dbg_close();

1075
	for (i = 0; i < nbds_max; i++) {
L
Linus Torvalds 已提交
1076
		struct gendisk *disk = nbd_dev[i].disk;
1077
		nbd_dev[i].magic = 0;
L
Linus Torvalds 已提交
1078 1079 1080 1081 1082 1083 1084
		if (disk) {
			del_gendisk(disk);
			blk_cleanup_queue(disk->queue);
			put_disk(disk);
		}
	}
	unregister_blkdev(NBD_MAJOR, "nbd");
1085
	kfree(nbd_dev);
L
Linus Torvalds 已提交
1086 1087 1088 1089 1090 1091 1092 1093 1094
	printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

1095
module_param(nbds_max, int, 0444);
L
Laurent Vivier 已提交
1096 1097 1098
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");