nbd.c 25.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6
/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * This file is released under GPLv2 or later.
 *
 * (part of code stolen from loop.c)
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
27
#include <linux/mutex.h>
28 29 30
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
31
#include <linux/slab.h>
L
Linus Torvalds 已提交
32
#include <net/sock.h>
33
#include <linux/net.h>
34
#include <linux/kthread.h>
M
Markus Pargmann 已提交
35
#include <linux/types.h>
M
Markus Pargmann 已提交
36
#include <linux/debugfs.h>
L
Linus Torvalds 已提交
37 38 39 40 41 42

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>

43
/*
 * Per-device state for one /dev/nbdX instance.  One instance is allocated
 * per minor in nbd_init(); lifetime spans module load to unload.
 */
struct nbd_device {
	u32 flags;			/* NBD_FLAG_* negotiated via NBD_SET_FLAGS */
	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
	int magic;			/* NBD_MAGIC while live; sanity-checked with BUG_ON */

	spinlock_t queue_lock;		/* protects queue_head and waiting_queue */
	struct list_head queue_head;	/* Requests waiting result */
	struct request *active_req;	/* request currently being transmitted */
	wait_queue_head_t active_wq;	/* woken when active_req changes */
	struct list_head waiting_queue;	/* Requests to be sent */
	wait_queue_head_t waiting_wq;	/* woken when waiting_queue gains work */

	struct mutex tx_lock;		/* serializes transmit side and ioctls */
	struct gendisk *disk;
	int blksize;			/* logical block size in bytes */
	loff_t bytesize;		/* device size in bytes */
	int xmit_timeout;		/* transmit timeout in jiffies; 0 = none */
	bool timedout;			/* set by nbd_xmit_timeout() */
	bool disconnect; /* a disconnect has been requested by user */

	struct timer_list timeout_timer;	/* fires nbd_xmit_timeout() */
	/* protects initialization and shutdown of the socket */
	spinlock_t sock_lock;
	struct task_struct *task_recv;	/* NBD_DO_IT caller; doubles as "connected" flag */
	struct task_struct *task_send;	/* nbd_thread_send kthread */

#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;		/* per-device debugfs directory */
#endif
};

M
Markus Pargmann 已提交
74 75 76 77 78 79
#if IS_ENABLED(CONFIG_DEBUG_FS)
/* Root debugfs directory ("nbd"); per-device dirs are created under it. */
static struct dentry *nbd_dbg_dir;
#endif

/* Shorthand for the device's disk name ("nbd0", "nbd1", ...). */
#define nbd_name(nbd) ((nbd)->disk->disk_name)

/* Magic value stored in nbd_device.magic for corruption checks. */
#define NBD_MAGIC 0x68797548

/* Number of devices to create (module parameter, default 16). */
static unsigned int nbds_max = 16;
/* Array of nbds_max devices, allocated in nbd_init(). */
static struct nbd_device *nbd_dev;
/* Partitions per device (module parameter, default 0 = none). */
static int max_part;

/*
 * Use just one lock (or at most 1 per NIC). Two arguments for this:
 * 1. Each NIC is essentially a synchronization point for all servers
 *    accessed through that NIC so there's no need to have more locks
 *    than NICs anyway.
 * 2. More locks lead to more "Dirty cache line bouncing" which will slow
 *    down each lock to the point where they're actually slower than just
 *    a single lock.
 * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
 */
static DEFINE_SPINLOCK(nbd_lock);

98
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
L
Linus Torvalds 已提交
99
{
100
	return disk_to_dev(nbd->disk);
L
Linus Torvalds 已提交
101 102
}

103 104 105 106 107
static bool nbd_is_connected(struct nbd_device *nbd)
{
	return !!nbd->task_recv;
}

L
Linus Torvalds 已提交
108 109 110 111 112 113
static const char *nbdcmd_to_ascii(int cmd)
{
	switch (cmd) {
	case  NBD_CMD_READ: return "read";
	case NBD_CMD_WRITE: return "write";
	case  NBD_CMD_DISC: return "disconnect";
A
Alex Bligh 已提交
114
	case NBD_CMD_FLUSH: return "flush";
P
Paul Clements 已提交
115
	case  NBD_CMD_TRIM: return "trim/discard";
L
Linus Torvalds 已提交
116 117 118 119
	}
	return "invalid";
}

120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
/*
 * Zero out the visible size of the device (both the bdev inode and the
 * gendisk capacity) and notify userspace via a uevent.  Always returns 0.
 */
static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
{
	set_capacity(nbd->disk, 0);
	bdev->bd_inode->i_size = 0;
	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
	return 0;
}

/*
 * Propagate nbd->bytesize to the bdev inode and gendisk capacity and emit
 * a change uevent.  No-op unless a receiver (NBD_DO_IT) is attached.
 */
static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
{
	if (!nbd_is_connected(nbd))
		return;

	bdev->bd_inode->i_size = nbd->bytesize;
	/* capacity is in 512-byte sectors, hence the >> 9 */
	set_capacity(nbd->disk, nbd->bytesize >> 9);
	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
}

/*
 * Set the device geometry: block size and number of blocks.
 * set_blocksize() validates blocksize (rejects values that are not a
 * power of two in [512, PAGE_SIZE]) before any state is changed.
 * Returns 0 on success or the set_blocksize() error.
 */
static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
			int blocksize, int nr_blocks)
{
	int ret;

	ret = set_blocksize(bdev, blocksize);
	if (ret)
		return ret;

	nbd->blksize = blocksize;
	/* widen before multiplying so large devices don't overflow int */
	nbd->bytesize = (loff_t)blocksize * (loff_t)nr_blocks;

	nbd_size_update(nbd, bdev);

	return 0;
}

156
/*
 * Complete a block-layer request, translating our req->errors count into
 * -EIO.  __blk_end_request_all() must run under the request queue's lock,
 * which is taken with IRQs saved since completion can race with the
 * request_fn context.
 */
static void nbd_end_request(struct nbd_device *nbd, struct request *req)
{
	int error = req->errors ? -EIO : 0;
	struct request_queue *q = req->q;
	unsigned long flags;

	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req,
		error ? "failed" : "done");

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_end_request_all(req, error);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

170 171 172
/*
 * Forcibly shutdown the socket causing all listeners to error
 */
static void sock_shutdown(struct nbd_device *nbd)
{
	/* sock_lock guards socket setup/teardown against the timeout timer */
	spin_lock_irq(&nbd->sock_lock);

	if (!nbd->sock) {
		spin_unlock_irq(&nbd->sock_lock);
		return;
	}

	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
	kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
	/* drop the reference taken by sockfd_lookup() in NBD_SET_SOCK */
	sockfd_put(nbd->sock);
	nbd->sock = NULL;
	spin_unlock_irq(&nbd->sock_lock);

	/* no socket left, so the transmit timeout is meaningless */
	del_timer(&nbd->timeout_timer);
}

/*
 * Timer callback (softirq context) fired when a request has been outstanding
 * for longer than nbd->xmit_timeout.  Marks the device timed out and shuts
 * down the socket so blocked senders/receivers error out; the actual cleanup
 * happens in the NBD_DO_IT path once nbd_thread_recv() returns.
 */
static void nbd_xmit_timeout(unsigned long arg)
{
	struct nbd_device *nbd = (struct nbd_device *)arg;
	unsigned long flags;

	/* nothing in flight: spurious firing, ignore (lockless peek) */
	if (list_empty(&nbd->queue_head))
		return;

	spin_lock_irqsave(&nbd->sock_lock, flags);

	nbd->timedout = true;

	/* shutdown only; the reference is released later via sock_shutdown() */
	if (nbd->sock)
		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);

	spin_unlock_irqrestore(&nbd->sock_lock, flags);

	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
}

L
Linus Torvalds 已提交
211 212 213
/*
 *  Send or receive packet.
 *
 * Transfers exactly @size bytes (looping over partial transfers) to or from
 * nbd->sock.  @send selects direction (non-zero = send).  Returns the last
 * kernel_sendmsg/kernel_recvmsg result: >0 on success, -EPIPE on a short
 * read (peer closed), or a negative errno.
 *
 * Runs with PF_MEMALLOC set and GFP_NOIO|__GFP_MEMALLOC socket allocation
 * so that memory reclaim through this block device cannot deadlock on
 * further allocations.
 */
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
		int msg_flags)
{
	struct socket *sock = nbd->sock;
	int result;
	struct msghdr msg;
	struct kvec iov;
	unsigned long pflags = current->flags;

	if (unlikely(!sock)) {
		dev_err(disk_to_dev(nbd->disk),
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
		return -EINVAL;
	}

	current->flags |= PF_MEMALLOC;
	do {
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		/* MSG_NOSIGNAL: a dead peer must not SIGPIPE a kernel thread */
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

		if (send)
			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
		else
			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
						msg.msg_flags);

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		size -= result;
		buf += result;
	} while (size > 0);

	/* restore only PF_MEMALLOC to whatever it was on entry */
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	/* traffic arrived: push the transmit-timeout deadline forward */
	if (!send && nbd->xmit_timeout)
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

	return result;
}

264
static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
L
Linus Torvalds 已提交
265 266 267 268
		int flags)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
269 270
	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
			   bvec->bv_len, flags);
L
Linus Torvalds 已提交
271 272 273 274
	kunmap(bvec->bv_page);
	return result;
}

275
/* always call with the tx_lock held */
/*
 * Serialize @req into an on-wire struct nbd_request and transmit it,
 * followed by the data payload for writes.  The request pointer itself is
 * used as the 8-byte cookie in request.handle so the reply can be matched
 * back in nbd_read_stat().  Returns 0 on success, -EIO on any send failure.
 */
static int nbd_send_req(struct nbd_device *nbd, struct request *req)
{
	int result, flags;
	struct nbd_request request;
	unsigned long size = blk_rq_bytes(req);
	u32 type;

	/* map the block-layer operation onto the NBD wire command */
	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
		type = NBD_CMD_DISC;
	else if (req_op(req) == REQ_OP_DISCARD)
		type = NBD_CMD_TRIM;
	else if (req_op(req) == REQ_OP_FLUSH)
		type = NBD_CMD_FLUSH;
	else if (rq_data_dir(req) == WRITE)
		type = NBD_CMD_WRITE;
	else
		type = NBD_CMD_READ;

	memset(&request, 0, sizeof(request));
	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(type);
	/* FLUSH and DISC carry no offset/length on the wire */
	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(size);
	}
	memcpy(request.handle, &req, sizeof(req));

	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
		req, nbdcmd_to_ascii(type),
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
	result = sock_xmit(nbd, 1, &request, sizeof(request),
			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
	if (result <= 0) {
		dev_err(disk_to_dev(nbd->disk),
			"Send control failed (result %d)\n", result);
		return -EIO;
	}

	if (type == NBD_CMD_WRITE) {
		struct req_iterator iter;
		struct bio_vec bvec;
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
		rq_for_each_segment(bvec, req, iter) {
			flags = 0;
			/* MSG_MORE on all but the final segment */
			if (!rq_iter_last(bvec, iter))
				flags = MSG_MORE;
			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
				req, bvec.bv_len);
			result = sock_send_bvec(nbd, &bvec, flags);
			if (result <= 0) {
				dev_err(disk_to_dev(nbd->disk),
					"Send data failed (result %d)\n",
					result);
				return -EIO;
			}
		}
	}
	return 0;
}

339
static struct request *nbd_find_request(struct nbd_device *nbd,
340
					struct request *xreq)
L
Linus Torvalds 已提交
341
{
342
	struct request *req, *tmp;
343
	int err;
L
Linus Torvalds 已提交
344

345
	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
346
	if (unlikely(err))
347
		return ERR_PTR(err);
348

349 350
	spin_lock(&nbd->queue_lock);
	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
L
Linus Torvalds 已提交
351 352 353
		if (req != xreq)
			continue;
		list_del_init(&req->queuelist);
354
		spin_unlock(&nbd->queue_lock);
L
Linus Torvalds 已提交
355 356
		return req;
	}
357
	spin_unlock(&nbd->queue_lock);
358

359
	return ERR_PTR(-ENOENT);
L
Linus Torvalds 已提交
360 361
}

362
static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
L
Linus Torvalds 已提交
363 364 365
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
366
	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
L
Linus Torvalds 已提交
367 368 369 370 371 372
			MSG_WAITALL);
	kunmap(bvec->bv_page);
	return result;
}

/* NULL returned = something went wrong, inform userspace */
/*
 * Receive one reply from the server: read the fixed-size nbd_reply header,
 * validate its magic, match it to an in-flight request via the handle
 * cookie, and for reads pull the data payload into the request's pages.
 * Returns the completed request (req->errors bumped on server error or
 * short data), or ERR_PTR on protocol/socket failure.
 */
static struct request *nbd_read_stat(struct nbd_device *nbd)
{
	int result;
	struct nbd_reply reply;
	struct request *req;

	reply.magic = 0;
	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
	if (result <= 0) {
		dev_err(disk_to_dev(nbd->disk),
			"Receive control failed (result %d)\n", result);
		return ERR_PTR(result);
	}

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
				(unsigned long)ntohl(reply.magic));
		return ERR_PTR(-EPROTO);
	}

	/* handle carries the request pointer we stashed in nbd_send_req() */
	req = nbd_find_request(nbd, *(struct request **)reply.handle);
	if (IS_ERR(req)) {
		result = PTR_ERR(req);
		if (result != -ENOENT)
			return ERR_PTR(result);

		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
			reply.handle);
		return ERR_PTR(-EBADR);
	}

	if (ntohl(reply.error)) {
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
			ntohl(reply.error));
		req->errors++;
		return req;
	}

	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
	if (rq_data_dir(req) != WRITE) {
		struct req_iterator iter;
		struct bio_vec bvec;

		rq_for_each_segment(bvec, req, iter) {
			result = sock_recv_bvec(nbd, &bvec);
			if (result <= 0) {
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
					result);
				req->errors++;
				return req;
			}
			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
				req, bvec.bv_len);
		}
	}
	return req;
}

431 432
/*
 * sysfs "pid" attribute: reports the PID of the process sitting in
 * NBD_DO_IT (task_recv).  The file is created only while a receiver is
 * attached (see nbd_thread_recv), so task_recv is expected to be valid.
 */
static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;

	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
}

/* read-only /sys/block/nbdX/pid */
static struct device_attribute pid_attr = {
	.attr = { .name = "pid", .mode = S_IRUGO},
	.show = pid_show,
};

445
/*
 * Receive loop run in the context of the NBD_DO_IT caller.  Publishes the
 * pid via sysfs, pushes the negotiated size to the bdev, then completes
 * replies until the socket dies or returns a protocol error.  The error
 * that broke the loop is returned to the ioctl path.
 */
static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
{
	struct request *req;
	int ret;

	BUG_ON(nbd->magic != NBD_MAGIC);

	/* let this socket dip into reserves so reclaim via nbd can't deadlock */
	sk_set_memalloc(nbd->sock->sk);

	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
	if (ret) {
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
		return ret;
	}

	nbd_size_update(nbd, bdev);

	/* main receive loop: one nbd_read_stat() per server reply */
	while (1) {
		req = nbd_read_stat(nbd);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

		nbd_end_request(nbd, req);
	}

	nbd_size_clear(nbd, bdev);

	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
	return ret;
}

478
/*
 * Fail every request still sitting on queue_head (sent, awaiting reply) or
 * waiting_queue (not yet sent).  Must only run after the socket is gone —
 * the BUG_ONs below enforce that precondition.
 */
static void nbd_clear_que(struct nbd_device *nbd)
{
	struct request *req;

	BUG_ON(nbd->magic != NBD_MAGIC);

	/*
	 * Because we have set nbd->sock to NULL under the tx_lock, all
	 * modifications to the list must have completed by now.  For
	 * the same reason, the active_req must be NULL.
	 *
	 * As a consequence, we don't need to take the spin lock while
	 * purging the list here.
	 */
	BUG_ON(nbd->sock);
	BUG_ON(nbd->active_req);

	while (!list_empty(&nbd->queue_head)) {
		req = list_entry(nbd->queue_head.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
		nbd_end_request(nbd, req);
	}

	while (!list_empty(&nbd->waiting_queue)) {
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
		nbd_end_request(nbd, req);
	}
	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}

513

514
/*
 * Transmit one request pulled off waiting_queue by the sender thread.
 * On success the request moves to queue_head to await its reply; on any
 * failure it is completed immediately with an error.  active_req marks
 * the request during transmission so nbd_find_request() won't race with
 * a reply arriving before the request is on queue_head.
 */
static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
{
	/* only filesystem requests go on the wire (DRV_PRIV is sent directly) */
	if (req->cmd_type != REQ_TYPE_FS)
		goto error_out;

	if (rq_data_dir(req) == WRITE &&
	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
		dev_err(disk_to_dev(nbd->disk),
			"Write on read-only\n");
		goto error_out;
	}

	req->errors = 0;

	mutex_lock(&nbd->tx_lock);
	if (unlikely(!nbd->sock)) {
		mutex_unlock(&nbd->tx_lock);
		dev_err(disk_to_dev(nbd->disk),
			"Attempted send on closed socket\n");
		goto error_out;
	}

	nbd->active_req = req;

	/* (re)arm the timeout only when this is the first in-flight request */
	if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

	if (nbd_send_req(nbd, req) != 0) {
		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
		req->errors++;
		nbd_end_request(nbd, req);
	} else {
		spin_lock(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->queue_head);
		spin_unlock(&nbd->queue_lock);
	}

	nbd->active_req = NULL;
	mutex_unlock(&nbd->tx_lock);
	/* wake nbd_find_request() waiters blocked on active_req == req */
	wake_up_all(&nbd->active_wq);

	return;

error_out:
	req->errors++;
	nbd_end_request(nbd, req);
}

562
/*
 * Sender kthread: drains waiting_queue, transmitting one request at a time
 * via nbd_handle_req().  Keeps running after kthread_should_stop() until
 * the queue is empty so no queued request is silently dropped on shutdown.
 */
static int nbd_thread_send(void *data)
{
	struct nbd_device *nbd = data;
	struct request *req;

	nbd->task_send = current;

	set_user_nice(current, MIN_NICE);
	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
		/* wait for something to do */
		wait_event_interruptible(nbd->waiting_wq,
					 kthread_should_stop() ||
					 !list_empty(&nbd->waiting_queue));

		/* extract request */
		if (list_empty(&nbd->waiting_queue))
			continue;

		spin_lock_irq(&nbd->queue_lock);
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		spin_unlock_irq(&nbd->queue_lock);

		/* handle request */
		nbd_handle_req(nbd, req);
	}

	nbd->task_send = NULL;

	return 0;
}

L
Linus Torvalds 已提交
595 596 597
/*
 * We always wait for result of write, for now. It would be nice to make it optional
 * in future
 * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
 *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
 */

/*
 * Block-layer request_fn.  Entered with q->queue_lock held; the lock is
 * dropped while each request is handed to the sender thread and retaken
 * before the next blk_fetch_request(), as the annotations declare.
 */
static void nbd_request_handler(struct request_queue *q)
		__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct request *req;
	
	while ((req = blk_fetch_request(q)) != NULL) {
		struct nbd_device *nbd;

		spin_unlock_irq(q->queue_lock);

		nbd = req->rq_disk->private_data;

		BUG_ON(nbd->magic != NBD_MAGIC);

		dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n",
			req, req->cmd_type);

		/* no socket: fail fast instead of queueing for the sender */
		if (unlikely(!nbd->sock)) {
			dev_err_ratelimited(disk_to_dev(nbd->disk),
					    "Attempted send on closed socket\n");
			req->errors++;
			nbd_end_request(nbd, req);
			spin_lock_irq(q->queue_lock);
			continue;
		}

		spin_lock_irq(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->waiting_queue);
		spin_unlock_irq(&nbd->queue_lock);

		/* kick nbd_thread_send() */
		wake_up(&nbd->waiting_wq);

		spin_lock_irq(q->queue_lock);
	}
}

M
Markus Pargmann 已提交
638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656
/*
 * Install @sock as the device's socket under sock_lock.
 * Returns 0 on success or -EBUSY if a socket is already attached.
 */
static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
{
	int err = 0;

	spin_lock_irq(&nbd->sock_lock);
	if (nbd->sock)
		err = -EBUSY;
	else
		nbd->sock = sock;
	spin_unlock_irq(&nbd->sock_lock);

	return err;
}

657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680
/* Reset all properties of an NBD device */
static void nbd_reset(struct nbd_device *nbd)
{
	nbd->disconnect = false;
	nbd->timedout = false;
	nbd->blksize = 1024;	/* historical default block size */
	nbd->bytesize = 0;
	set_capacity(nbd->disk, 0);
	nbd->flags = 0;
	nbd->xmit_timeout = 0;
	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
	/* _sync: make sure a concurrently-running timeout has finished */
	del_timer_sync(&nbd->timeout_timer);
}

/*
 * Undo the bdev-side effects of a session: clear read-only, zero the
 * inode size, and (when partitions are enabled) force a partition rescan.
 */
static void nbd_bdev_reset(struct block_device *bdev)
{
	set_device_ro(bdev, false);
	bdev->bd_inode->i_size = 0;
	if (max_part > 0) {
		blkdev_reread_part(bdev);
		bdev->bd_invalidated = 1;
	}
}

681 682 683 684 685 686 687
static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
{
	if (nbd->flags & NBD_FLAG_READ_ONLY)
		set_device_ro(bdev, true);
	if (nbd->flags & NBD_FLAG_SEND_TRIM)
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
	if (nbd->flags & NBD_FLAG_SEND_FLUSH)
688
		blk_queue_write_cache(nbd->disk->queue, true, false);
689
	else
690
		blk_queue_write_cache(nbd->disk->queue, false, false);
691 692
}

M
Markus Pargmann 已提交
693 694 695
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);

P
Pavel Machek 已提交
696
/* Must be called with tx_lock held */
L
Linus Torvalds 已提交
697

698
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
P
Pavel Machek 已提交
699 700
		       unsigned int cmd, unsigned long arg)
{
L
Linus Torvalds 已提交
701
	switch (cmd) {
P
Pavel Machek 已提交
702 703 704
	case NBD_DISCONNECT: {
		struct request sreq;

705
		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
706 707
		if (!nbd->sock)
			return -EINVAL;
P
Pavel Machek 已提交
708

709 710 711
		mutex_unlock(&nbd->tx_lock);
		fsync_bdev(bdev);
		mutex_lock(&nbd->tx_lock);
712
		blk_rq_init(NULL, &sreq);
713
		sreq.cmd_type = REQ_TYPE_DRV_PRIV;
714 715

		/* Check again after getting mutex back.  */
716
		if (!nbd->sock)
L
Linus Torvalds 已提交
717
			return -EINVAL;
718

719
		nbd->disconnect = true;
P
Paul Clements 已提交
720

721
		nbd_send_req(nbd, &sreq);
P
Paul Clements 已提交
722
		return 0;
P
Pavel Machek 已提交
723
	}
L
Linus Torvalds 已提交
724
 
M
Markus Pargmann 已提交
725 726
	case NBD_CLEAR_SOCK:
		sock_shutdown(nbd);
727 728
		nbd_clear_que(nbd);
		BUG_ON(!list_empty(&nbd->queue_head));
729
		BUG_ON(!list_empty(&nbd->waiting_queue));
730
		kill_bdev(bdev);
P
Pavel Machek 已提交
731 732 733
		return 0;

	case NBD_SET_SOCK: {
A
Al Viro 已提交
734
		int err;
M
Markus Pargmann 已提交
735 736 737 738 739 740 741 742 743 744
		struct socket *sock = sockfd_lookup(arg, &err);

		if (!sock)
			return err;

		err = nbd_set_socket(nbd, sock);
		if (!err && max_part)
			bdev->bd_invalidated = 1;

		return err;
P
Pavel Machek 已提交
745 746
	}

747
	case NBD_SET_BLKSIZE: {
A
Arnd Bergmann 已提交
748
		loff_t bsize = div_s64(nbd->bytesize, arg);
749 750 751

		return nbd_size_set(nbd, bdev, arg, bsize);
	}
P
Pavel Machek 已提交
752

L
Linus Torvalds 已提交
753
	case NBD_SET_SIZE:
754 755 756 757 758
		return nbd_size_set(nbd, bdev, nbd->blksize,
				    arg / nbd->blksize);

	case NBD_SET_SIZE_BLOCKS:
		return nbd_size_set(nbd, bdev, nbd->blksize, arg);
P
Pavel Machek 已提交
759

760
	case NBD_SET_TIMEOUT:
761
		nbd->xmit_timeout = arg * HZ;
M
Markus Pargmann 已提交
762 763 764 765 766 767
		if (arg)
			mod_timer(&nbd->timeout_timer,
				  jiffies + nbd->xmit_timeout);
		else
			del_timer_sync(&nbd->timeout_timer);

768
		return 0;
P
Pavel Machek 已提交
769

P
Paul Clements 已提交
770 771 772 773
	case NBD_SET_FLAGS:
		nbd->flags = arg;
		return 0;

P
Pavel Machek 已提交
774 775 776 777
	case NBD_DO_IT: {
		struct task_struct *thread;
		int error;

M
Markus Pargmann 已提交
778
		if (nbd->task_recv)
779
			return -EBUSY;
A
Al Viro 已提交
780
		if (!nbd->sock)
L
Linus Torvalds 已提交
781
			return -EINVAL;
P
Pavel Machek 已提交
782

V
Vegard Nossum 已提交
783 784
		/* We have to claim the device under the lock */
		nbd->task_recv = current;
785
		mutex_unlock(&nbd->tx_lock);
P
Pavel Machek 已提交
786

787
		nbd_parse_flags(nbd, bdev);
P
Paul Clements 已提交
788

789
		thread = kthread_run(nbd_thread_send, nbd, "%s",
M
Markus Pargmann 已提交
790
				     nbd_name(nbd));
P
Pavel Machek 已提交
791
		if (IS_ERR(thread)) {
792
			mutex_lock(&nbd->tx_lock);
V
Vegard Nossum 已提交
793
			nbd->task_recv = NULL;
794
			return PTR_ERR(thread);
P
Pavel Machek 已提交
795
		}
796

M
Markus Pargmann 已提交
797
		nbd_dev_dbg_init(nbd);
798
		error = nbd_thread_recv(nbd, bdev);
M
Markus Pargmann 已提交
799
		nbd_dev_dbg_close(nbd);
800
		kthread_stop(thread);
P
Pavel Machek 已提交
801

802
		mutex_lock(&nbd->tx_lock);
V
Vegard Nossum 已提交
803
		nbd->task_recv = NULL;
804

805
		sock_shutdown(nbd);
806
		nbd_clear_que(nbd);
807
		kill_bdev(bdev);
808 809
		nbd_bdev_reset(bdev);

P
Paul Clements 已提交
810
		if (nbd->disconnect) /* user requested, ignore socket errors */
811 812 813 814
			error = 0;
		if (nbd->timedout)
			error = -ETIMEDOUT;

815 816
		nbd_reset(nbd);

817
		return error;
P
Pavel Machek 已提交
818 819
	}

L
Linus Torvalds 已提交
820
	case NBD_CLEAR_QUE:
821 822 823 824
		/*
		 * This is for compatibility only.  The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
L
Linus Torvalds 已提交
825
		return 0;
P
Pavel Machek 已提交
826

L
Linus Torvalds 已提交
827
	case NBD_PRINT_DEBUG:
828
		dev_info(disk_to_dev(nbd->disk),
829
			"next = %p, prev = %p, head = %p\n",
830 831
			nbd->queue_head.next, nbd->queue_head.prev,
			&nbd->queue_head);
L
Linus Torvalds 已提交
832 833
		return 0;
	}
P
Pavel Machek 已提交
834 835 836 837 838 839
	return -ENOTTY;
}

/*
 * ioctl entry point: privilege check, then run the real handler under
 * tx_lock so commands are serialized against the transmit path.
 */
static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *nbd = bdev->bd_disk->private_data;
	int error;

	/* all NBD ioctls reconfigure the device: admin only */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(nbd->magic != NBD_MAGIC);

	mutex_lock(&nbd->tx_lock);
	error = __nbd_ioctl(bdev, nbd, cmd, arg);
	mutex_unlock(&nbd->tx_lock);

	return error;
}

855
/* Block device operations: only ioctls; same handler serves compat. */
static const struct block_device_operations nbd_fops =
{
	.owner =	THIS_MODULE,
	.ioctl =	nbd_ioctl,
	.compat_ioctl =	nbd_ioctl,
};

M
Markus Pargmann 已提交
862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923
#if IS_ENABLED(CONFIG_DEBUG_FS)

/* debugfs "tasks": PIDs of the receiver and sender, when attached. */
static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;

	if (nbd->task_recv)
		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
	if (nbd->task_send)
		seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));

	return 0;
}

static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
}

static const struct file_operations nbd_dbg_tasks_ops = {
	.open = nbd_dbg_tasks_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* debugfs "flags": raw hex plus a decode of the known NBD_FLAG_* bits. */
static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;
	u32 flags = nbd->flags;

	seq_printf(s, "Hex: 0x%08x\n\n", flags);

	seq_puts(s, "Known flags:\n");

	if (flags & NBD_FLAG_HAS_FLAGS)
		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
	if (flags & NBD_FLAG_READ_ONLY)
		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
	if (flags & NBD_FLAG_SEND_FLUSH)
		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
	if (flags & NBD_FLAG_SEND_TRIM)
		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");

	return 0;
}

static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_flags_show, inode->i_private);
}

static const struct file_operations nbd_dbg_flags_ops = {
	.open = nbd_dbg_flags_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* Create the per-device debugfs directory and its entries. */
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	struct dentry *dir;

	if (!nbd_dbg_dir)
		return -EIO;

	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
	if (!dir) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
			nbd_name(nbd));
		return -EIO;
	}
	nbd->dbg_dir = dir;

	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
	debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
	debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
	debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);

	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
	debugfs_remove_recursive(nbd->dbg_dir);
}

/* Create the top-level "nbd" debugfs directory at module init. */
static int nbd_dbg_init(void)
{
	struct dentry *dbg_dir;

	dbg_dir = debugfs_create_dir("nbd", NULL);
	if (!dbg_dir)
		return -EIO;

	nbd_dbg_dir = dbg_dir;

	return 0;
}

static void nbd_dbg_close(void)
{
	debugfs_remove_recursive(nbd_dbg_dir);
}

#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */

/* debugfs disabled: all hooks become no-ops. */

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

static int nbd_dbg_init(void)
{
	return 0;
}

static void nbd_dbg_close(void)
{
}

#endif

L
Linus Torvalds 已提交
990 991 992 993 994 995 996 997 998
/*
 * And here should be modules and kernel interface 
 *  (Just smiley confuses emacs :-)
 */

/*
 * Module init: validate max_part/nbds_max, allocate nbds_max devices,
 * give each a gendisk plus request queue, register the block major, and
 * publish all disks.  On failure, tears down whatever was built so far.
 */
static int __init nbd_init(void)
{
	int err = -ENOMEM;
	int i;
	int part_shift;

	/* the on-wire request header must stay exactly 28 bytes */
	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

	if (max_part < 0) {
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
		return -EINVAL;
	}

	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	/* every device needs 1 << part_shift minors out of MINORBITS */
	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
	if (!nbd_dev)
		return -ENOMEM;

	/* first pass: allocate disks and queues (can fail -> goto out) */
	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = alloc_disk(1 << part_shift);
		if (!disk)
			goto out;
		nbd_dev[i].disk = disk;
		/*
		 * The new linux 2.5 block layer implementation requires
		 * every gendisk to have its very own request_queue struct.
		 * These structs are big so we dynamically allocate them.
		 */
		disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock);
		if (!disk->queue) {
			put_disk(disk);
			goto out;
		}
		/*
		 * Tell the block layer that we are not a rotational device
		 */
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
		disk->queue->limits.discard_granularity = 512;
		blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
		disk->queue->limits.discard_zeroes_data = 0;
		blk_queue_max_hw_sectors(disk->queue, 65536);
		disk->queue->limits.max_sectors = 256;
	}

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		err = -EIO;
		goto out;
	}

	printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);

	nbd_dbg_init();

	/* second pass: initialize per-device state and expose the disks */
	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		nbd_dev[i].magic = NBD_MAGIC;
		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
		spin_lock_init(&nbd_dev[i].queue_lock);
		spin_lock_init(&nbd_dev[i].sock_lock);
		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
		mutex_init(&nbd_dev[i].tx_lock);
		init_timer(&nbd_dev[i].timeout_timer);
		nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
		nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
		init_waitqueue_head(&nbd_dev[i].active_wq);
		init_waitqueue_head(&nbd_dev[i].waiting_wq);
		disk->major = NBD_MAJOR;
		disk->first_minor = i << part_shift;
		disk->fops = &nbd_fops;
		disk->private_data = &nbd_dev[i];
		sprintf(disk->disk_name, "nbd%d", i);
		nbd_reset(&nbd_dev[i]);
		add_disk(disk);
	}

	return 0;
out:
	/* unwind the disks/queues allocated before the failure */
	while (i--) {
		blk_cleanup_queue(nbd_dev[i].disk->queue);
		put_disk(nbd_dev[i].disk);
	}
	kfree(nbd_dev);
	return err;
}

/*
 * Module exit: remove debugfs, tear down every disk/queue, unregister the
 * block major, and free the device array.
 */
static void __exit nbd_cleanup(void)
{
	int i;

	nbd_dbg_close();

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		/* invalidate the magic so stale pointers trip the BUG_ONs */
		nbd_dev[i].magic = 0;
		if (disk) {
			del_gendisk(disk);
			blk_cleanup_queue(disk->queue);
			put_disk(disk);
		}
	}
	unregister_blkdev(NBD_MAJOR, "nbd");
	kfree(nbd_dev);
	printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

/* Both parameters are read-only after load (mode 0444). */
module_param(nbds_max, int, 0444);
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");