nbd.c 25.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6
/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 * 
P
Pavel Machek 已提交
7
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
L
Linus Torvalds 已提交
8 9
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
10
 * This file is released under GPLv2 or later.
L
Linus Torvalds 已提交
11
 *
12
 * (part of code stolen from loop.c)
L
Linus Torvalds 已提交
13 14 15 16 17 18 19 20 21 22 23 24 25 26
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
27
#include <linux/mutex.h>
28 29 30
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
31
#include <linux/slab.h>
L
Linus Torvalds 已提交
32
#include <net/sock.h>
33
#include <linux/net.h>
34
#include <linux/kthread.h>
M
Markus Pargmann 已提交
35
#include <linux/types.h>
M
Markus Pargmann 已提交
36
#include <linux/debugfs.h>
L
Linus Torvalds 已提交
37 38 39 40 41 42

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>

43
struct nbd_device {
M
Markus Pargmann 已提交
44
	u32 flags;
45 46 47 48 49 50 51 52 53 54 55 56 57
	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
	int magic;

	spinlock_t queue_lock;
	struct list_head queue_head;	/* Requests waiting result */
	struct request *active_req;
	wait_queue_head_t active_wq;
	struct list_head waiting_queue;	/* Requests to be sent */
	wait_queue_head_t waiting_wq;

	struct mutex tx_lock;
	struct gendisk *disk;
	int blksize;
M
Markus Pargmann 已提交
58
	loff_t bytesize;
59
	int xmit_timeout;
60
	bool timedout;
61
	bool disconnect; /* a disconnect has been requested by user */
M
Markus Pargmann 已提交
62 63

	struct timer_list timeout_timer;
M
Markus Pargmann 已提交
64 65
	/* protects initialization and shutdown of the socket */
	spinlock_t sock_lock;
M
Markus Pargmann 已提交
66 67
	struct task_struct *task_recv;
	struct task_struct *task_send;
M
Markus Pargmann 已提交
68 69 70 71

#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;
#endif
72 73
};

M
Markus Pargmann 已提交
74 75 76 77 78 79
#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif

#define nbd_name(nbd) ((nbd)->disk->disk_name)

80
#define NBD_MAGIC 0x68797548
L
Linus Torvalds 已提交
81

82
static unsigned int nbds_max = 16;
83
static struct nbd_device *nbd_dev;
L
Laurent Vivier 已提交
84
static int max_part;
L
Linus Torvalds 已提交
85 86 87 88 89 90 91 92 93 94 95 96 97

/*
 * Use just one lock (or at most 1 per NIC). Two arguments for this:
 * 1. Each NIC is essentially a synchronization point for all servers
 *    accessed through that NIC so there's no need to have more locks
 *    than NICs anyway.
 * 2. More locks lead to more "Dirty cache line bouncing" which will slow
 *    down each lock to the point where they're actually slower than just
 *    a single lock.
 * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
 */
static DEFINE_SPINLOCK(nbd_lock);

98
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
L
Linus Torvalds 已提交
99
{
100
	return disk_to_dev(nbd->disk);
L
Linus Torvalds 已提交
101 102
}

103 104 105 106 107
static bool nbd_is_connected(struct nbd_device *nbd)
{
	return !!nbd->task_recv;
}

L
Linus Torvalds 已提交
108 109 110 111 112 113
static const char *nbdcmd_to_ascii(int cmd)
{
	switch (cmd) {
	case  NBD_CMD_READ: return "read";
	case NBD_CMD_WRITE: return "write";
	case  NBD_CMD_DISC: return "disconnect";
A
Alex Bligh 已提交
114
	case NBD_CMD_FLUSH: return "flush";
P
Paul Clements 已提交
115
	case  NBD_CMD_TRIM: return "trim/discard";
L
Linus Torvalds 已提交
116 117 118 119
	}
	return "invalid";
}

120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
{
	bdev->bd_inode->i_size = 0;
	set_capacity(nbd->disk, 0);
	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);

	return 0;
}

static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
{
	if (!nbd_is_connected(nbd))
		return;

	bdev->bd_inode->i_size = nbd->bytesize;
	set_capacity(nbd->disk, nbd->bytesize >> 9);
	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
}

static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
			int blocksize, int nr_blocks)
{
	int ret;

	ret = set_blocksize(bdev, blocksize);
	if (ret)
		return ret;

	nbd->blksize = blocksize;
	nbd->bytesize = (loff_t)blocksize * (loff_t)nr_blocks;

	nbd_size_update(nbd, bdev);

	return 0;
}

156
static void nbd_end_request(struct nbd_device *nbd, struct request *req)
L
Linus Torvalds 已提交
157
{
158
	int error = req->errors ? -EIO : 0;
159
	struct request_queue *q = req->q;
L
Linus Torvalds 已提交
160 161
	unsigned long flags;

162 163
	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req,
		error ? "failed" : "done");
L
Linus Torvalds 已提交
164 165

	spin_lock_irqsave(q->queue_lock, flags);
166
	__blk_end_request_all(req, error);
L
Linus Torvalds 已提交
167 168 169
	spin_unlock_irqrestore(q->queue_lock, flags);
}

170 171 172
/*
 * Forcibly shutdown the socket causing all listeners to error
 */
173
static void sock_shutdown(struct nbd_device *nbd)
174
{
M
Markus Pargmann 已提交
175 176 177 178
	spin_lock_irq(&nbd->sock_lock);

	if (!nbd->sock) {
		spin_unlock_irq(&nbd->sock_lock);
M
Markus Pargmann 已提交
179
		return;
M
Markus Pargmann 已提交
180
	}
M
Markus Pargmann 已提交
181 182 183

	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
	kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
M
Markus Pargmann 已提交
184
	sockfd_put(nbd->sock);
M
Markus Pargmann 已提交
185
	nbd->sock = NULL;
M
Markus Pargmann 已提交
186 187 188
	spin_unlock_irq(&nbd->sock_lock);

	del_timer(&nbd->timeout_timer);
189 190 191 192
}

static void nbd_xmit_timeout(unsigned long arg)
{
M
Markus Pargmann 已提交
193
	struct nbd_device *nbd = (struct nbd_device *)arg;
M
Markus Pargmann 已提交
194
	unsigned long flags;
M
Markus Pargmann 已提交
195 196 197 198

	if (list_empty(&nbd->queue_head))
		return;

M
Markus Pargmann 已提交
199
	spin_lock_irqsave(&nbd->sock_lock, flags);
M
Markus Pargmann 已提交
200

201
	nbd->timedout = true;
202

M
Markus Pargmann 已提交
203 204
	if (nbd->sock)
		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
M
Markus Pargmann 已提交
205

M
Markus Pargmann 已提交
206
	spin_unlock_irqrestore(&nbd->sock_lock, flags);
M
Markus Pargmann 已提交
207

M
Markus Pargmann 已提交
208
	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
209 210
}

L
Linus Torvalds 已提交
211 212 213
/*
 *  Send or receive packet.
 */
214
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
L
Linus Torvalds 已提交
215 216
		int msg_flags)
{
217
	struct socket *sock = nbd->sock;
L
Linus Torvalds 已提交
218 219 220
	int result;
	struct msghdr msg;
	struct kvec iov;
221
	unsigned long pflags = current->flags;
L
Linus Torvalds 已提交
222

223
	if (unlikely(!sock)) {
224
		dev_err(disk_to_dev(nbd->disk),
225 226
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
227 228 229
		return -EINVAL;
	}

230
	current->flags |= PF_MEMALLOC;
L
Linus Torvalds 已提交
231
	do {
232
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
L
Linus Torvalds 已提交
233 234 235 236 237 238 239 240
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

M
Markus Pargmann 已提交
241
		if (send)
L
Linus Torvalds 已提交
242
			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
M
Markus Pargmann 已提交
243
		else
244 245
			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
						msg.msg_flags);
L
Linus Torvalds 已提交
246 247 248 249 250 251 252 253 254 255

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		size -= result;
		buf += result;
	} while (size > 0);

256
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
L
Linus Torvalds 已提交
257

M
Markus Pargmann 已提交
258 259 260
	if (!send && nbd->xmit_timeout)
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

L
Linus Torvalds 已提交
261 262 263
	return result;
}

264
static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
L
Linus Torvalds 已提交
265 266 267 268
		int flags)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
269 270
	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
			   bvec->bv_len, flags);
L
Linus Torvalds 已提交
271 272 273 274
	kunmap(bvec->bv_page);
	return result;
}

275
/* always call with the tx_lock held */
276
static int nbd_send_req(struct nbd_device *nbd, struct request *req)
L
Linus Torvalds 已提交
277
{
278
	int result, flags;
L
Linus Torvalds 已提交
279
	struct nbd_request request;
280
	unsigned long size = blk_rq_bytes(req);
C
Christoph Hellwig 已提交
281 282 283 284
	u32 type;

	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
		type = NBD_CMD_DISC;
M
Mike Christie 已提交
285
	else if (req_op(req) == REQ_OP_DISCARD)
C
Christoph Hellwig 已提交
286
		type = NBD_CMD_TRIM;
287
	else if (req_op(req) == REQ_OP_FLUSH)
C
Christoph Hellwig 已提交
288 289 290 291 292
		type = NBD_CMD_FLUSH;
	else if (rq_data_dir(req) == WRITE)
		type = NBD_CMD_WRITE;
	else
		type = NBD_CMD_READ;
L
Linus Torvalds 已提交
293

294
	memset(&request, 0, sizeof(request));
L
Linus Torvalds 已提交
295
	request.magic = htonl(NBD_REQUEST_MAGIC);
C
Christoph Hellwig 已提交
296 297
	request.type = htonl(type);
	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
A
Alex Bligh 已提交
298 299 300
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(size);
	}
L
Linus Torvalds 已提交
301 302
	memcpy(request.handle, &req, sizeof(req));

303
	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
C
Christoph Hellwig 已提交
304
		req, nbdcmd_to_ascii(type),
305
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
306
	result = sock_xmit(nbd, 1, &request, sizeof(request),
C
Christoph Hellwig 已提交
307
			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
L
Linus Torvalds 已提交
308
	if (result <= 0) {
309
		dev_err(disk_to_dev(nbd->disk),
310
			"Send control failed (result %d)\n", result);
311
		return -EIO;
L
Linus Torvalds 已提交
312 313
	}

C
Christoph Hellwig 已提交
314
	if (type == NBD_CMD_WRITE) {
315
		struct req_iterator iter;
316
		struct bio_vec bvec;
L
Linus Torvalds 已提交
317 318 319 320
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
321
		rq_for_each_segment(bvec, req, iter) {
322
			flags = 0;
K
Kent Overstreet 已提交
323
			if (!rq_iter_last(bvec, iter))
324
				flags = MSG_MORE;
325 326
			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
				req, bvec.bv_len);
327
			result = sock_send_bvec(nbd, &bvec, flags);
328
			if (result <= 0) {
329
				dev_err(disk_to_dev(nbd->disk),
330 331
					"Send data failed (result %d)\n",
					result);
332
				return -EIO;
333
			}
L
Linus Torvalds 已提交
334 335 336 337 338
		}
	}
	return 0;
}

339
static struct request *nbd_find_request(struct nbd_device *nbd,
340
					struct request *xreq)
L
Linus Torvalds 已提交
341
{
342
	struct request *req, *tmp;
343
	int err;
L
Linus Torvalds 已提交
344

345
	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
346
	if (unlikely(err))
347
		return ERR_PTR(err);
348

349 350
	spin_lock(&nbd->queue_lock);
	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
L
Linus Torvalds 已提交
351 352 353
		if (req != xreq)
			continue;
		list_del_init(&req->queuelist);
354
		spin_unlock(&nbd->queue_lock);
L
Linus Torvalds 已提交
355 356
		return req;
	}
357
	spin_unlock(&nbd->queue_lock);
358

359
	return ERR_PTR(-ENOENT);
L
Linus Torvalds 已提交
360 361
}

362
static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
L
Linus Torvalds 已提交
363 364 365
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
366
	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
L
Linus Torvalds 已提交
367 368 369 370 371 372
			MSG_WAITALL);
	kunmap(bvec->bv_page);
	return result;
}

/* NULL returned = something went wrong, inform userspace */
373
static struct request *nbd_read_stat(struct nbd_device *nbd)
L
Linus Torvalds 已提交
374 375 376 377 378 379
{
	int result;
	struct nbd_reply reply;
	struct request *req;

	reply.magic = 0;
380
	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
L
Linus Torvalds 已提交
381
	if (result <= 0) {
382
		dev_err(disk_to_dev(nbd->disk),
383
			"Receive control failed (result %d)\n", result);
384
		return ERR_PTR(result);
L
Linus Torvalds 已提交
385
	}
386 387

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
388
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
389
				(unsigned long)ntohl(reply.magic));
390
		return ERR_PTR(-EPROTO);
391 392
	}

393
	req = nbd_find_request(nbd, *(struct request **)reply.handle);
394
	if (IS_ERR(req)) {
395 396
		result = PTR_ERR(req);
		if (result != -ENOENT)
397
			return ERR_PTR(result);
398

399
		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
400
			reply.handle);
401
		return ERR_PTR(-EBADR);
L
Linus Torvalds 已提交
402 403 404
	}

	if (ntohl(reply.error)) {
405
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
406
			ntohl(reply.error));
L
Linus Torvalds 已提交
407 408 409 410
		req->errors++;
		return req;
	}

411
	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
C
Christoph Hellwig 已提交
412
	if (rq_data_dir(req) != WRITE) {
413
		struct req_iterator iter;
414
		struct bio_vec bvec;
415 416

		rq_for_each_segment(bvec, req, iter) {
417
			result = sock_recv_bvec(nbd, &bvec);
418
			if (result <= 0) {
419
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
420
					result);
421 422 423
				req->errors++;
				return req;
			}
424 425
			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
				req, bvec.bv_len);
L
Linus Torvalds 已提交
426 427 428 429 430
		}
	}
	return req;
}

431 432
static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
433
{
434
	struct gendisk *disk = dev_to_disk(dev);
M
Markus Pargmann 已提交
435
	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
436

M
Markus Pargmann 已提交
437
	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
438 439
}

440
static struct device_attribute pid_attr = {
441
	.attr = { .name = "pid", .mode = S_IRUGO},
442 443 444
	.show = pid_show,
};

445
static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
L
Linus Torvalds 已提交
446 447
{
	struct request *req;
448
	int ret;
L
Linus Torvalds 已提交
449

450
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
451

452
	sk_set_memalloc(nbd->sock->sk);
M
Markus Pargmann 已提交
453 454 455

	nbd->task_recv = current;

456
	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
457
	if (ret) {
458
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
M
Markus Pargmann 已提交
459

M
Markus Pargmann 已提交
460
		nbd->task_recv = NULL;
M
Markus Pargmann 已提交
461

462 463
		return ret;
	}
464

465 466
	nbd_size_update(nbd, bdev);

467 468 469 470 471 472 473
	while (1) {
		req = nbd_read_stat(nbd);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

474
		nbd_end_request(nbd, req);
475
	}
476

477 478
	nbd_size_clear(nbd, bdev);

M
Markus Pargmann 已提交
479 480
	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);

M
Markus Pargmann 已提交
481 482 483
	nbd->task_recv = NULL;

	return ret;
L
Linus Torvalds 已提交
484 485
}

486
static void nbd_clear_que(struct nbd_device *nbd)
L
Linus Torvalds 已提交
487 488 489
{
	struct request *req;

490
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
491

492
	/*
493
	 * Because we have set nbd->sock to NULL under the tx_lock, all
494 495 496 497 498 499
	 * modifications to the list must have completed by now.  For
	 * the same reason, the active_req must be NULL.
	 *
	 * As a consequence, we don't need to take the spin lock while
	 * purging the list here.
	 */
500 501
	BUG_ON(nbd->sock);
	BUG_ON(nbd->active_req);
502

503 504
	while (!list_empty(&nbd->queue_head)) {
		req = list_entry(nbd->queue_head.next, struct request,
505 506 507
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
508
		nbd_end_request(nbd, req);
509
	}
510 511 512 513 514 515

	while (!list_empty(&nbd->waiting_queue)) {
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
516
		nbd_end_request(nbd, req);
517
	}
518
	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
L
Linus Torvalds 已提交
519 520
}

521

522
static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
523
{
524
	if (req->cmd_type != REQ_TYPE_FS)
525 526
		goto error_out;

C
Christoph Hellwig 已提交
527 528 529 530 531
	if (rq_data_dir(req) == WRITE &&
	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
		dev_err(disk_to_dev(nbd->disk),
			"Write on read-only\n");
		goto error_out;
A
Alex Bligh 已提交
532 533
	}

534 535
	req->errors = 0;

536 537 538 539
	mutex_lock(&nbd->tx_lock);
	if (unlikely(!nbd->sock)) {
		mutex_unlock(&nbd->tx_lock);
		dev_err(disk_to_dev(nbd->disk),
540
			"Attempted send on closed socket\n");
P
Pavel Machek 已提交
541
		goto error_out;
542 543
	}

544
	nbd->active_req = req;
545

M
Markus Pargmann 已提交
546 547 548
	if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

549 550
	if (nbd_send_req(nbd, req) != 0) {
		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
551
		req->errors++;
552
		nbd_end_request(nbd, req);
553
	} else {
554
		spin_lock(&nbd->queue_lock);
555
		list_add_tail(&req->queuelist, &nbd->queue_head);
556
		spin_unlock(&nbd->queue_lock);
557 558
	}

559 560 561
	nbd->active_req = NULL;
	mutex_unlock(&nbd->tx_lock);
	wake_up_all(&nbd->active_wq);
562 563 564 565 566

	return;

error_out:
	req->errors++;
567
	nbd_end_request(nbd, req);
568 569
}

570
static int nbd_thread_send(void *data)
571
{
572
	struct nbd_device *nbd = data;
573 574
	struct request *req;

M
Markus Pargmann 已提交
575 576
	nbd->task_send = current;

577
	set_user_nice(current, MIN_NICE);
578
	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
579
		/* wait for something to do */
580
		wait_event_interruptible(nbd->waiting_wq,
581
					 kthread_should_stop() ||
582
					 !list_empty(&nbd->waiting_queue));
583 584

		/* extract request */
585
		if (list_empty(&nbd->waiting_queue))
586 587
			continue;

588 589
		spin_lock_irq(&nbd->queue_lock);
		req = list_entry(nbd->waiting_queue.next, struct request,
590 591
				 queuelist);
		list_del_init(&req->queuelist);
592
		spin_unlock_irq(&nbd->queue_lock);
593 594

		/* handle request */
595
		nbd_handle_req(nbd, req);
596
	}
M
Markus Pargmann 已提交
597 598 599

	nbd->task_send = NULL;

600 601 602
	return 0;
}

L
Linus Torvalds 已提交
603 604 605
/*
 * We always wait for result of write, for now. It would be nice to make it optional
 * in future
606
 * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
L
Linus Torvalds 已提交
607 608 609
 *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
 */

610
static void nbd_request_handler(struct request_queue *q)
A
Alex Elder 已提交
611
		__releases(q->queue_lock) __acquires(q->queue_lock)
L
Linus Torvalds 已提交
612 613 614
{
	struct request *req;
	
615
	while ((req = blk_fetch_request(q)) != NULL) {
616
		struct nbd_device *nbd;
L
Linus Torvalds 已提交
617

618 619
		spin_unlock_irq(q->queue_lock);

620
		nbd = req->rq_disk->private_data;
L
Linus Torvalds 已提交
621

622
		BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
623

624 625 626
		dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n",
			req, req->cmd_type);

627
		if (unlikely(!nbd->sock)) {
628 629
			dev_err_ratelimited(disk_to_dev(nbd->disk),
					    "Attempted send on closed socket\n");
630
			req->errors++;
631
			nbd_end_request(nbd, req);
632 633 634 635
			spin_lock_irq(q->queue_lock);
			continue;
		}

636 637 638
		spin_lock_irq(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->waiting_queue);
		spin_unlock_irq(&nbd->queue_lock);
L
Linus Torvalds 已提交
639

640
		wake_up(&nbd->waiting_wq);
641

L
Linus Torvalds 已提交
642 643 644 645
		spin_lock_irq(q->queue_lock);
	}
}

M
Markus Pargmann 已提交
646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
/*
 * Attach @sock to the device under sock_lock.
 *
 * Returns 0 on success, or -EBUSY when a socket is already attached (in
 * which case nbd->sock is left untouched).
 */
static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
{
	int ret = -EBUSY;

	spin_lock_irq(&nbd->sock_lock);
	if (!nbd->sock) {
		nbd->sock = sock;
		ret = 0;
	}
	spin_unlock_irq(&nbd->sock_lock);

	return ret;
}

665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688
/* Reset all properties of an NBD device */
static void nbd_reset(struct nbd_device *nbd)
{
	nbd->disconnect = false;
	nbd->timedout = false;
	nbd->blksize = 1024;
	nbd->bytesize = 0;
	set_capacity(nbd->disk, 0);
	nbd->flags = 0;
	nbd->xmit_timeout = 0;
	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
	del_timer_sync(&nbd->timeout_timer);
}

/*
 * Reset the block device: make it writable again, zero the inode size
 * and, when partitions are enabled (max_part > 0), re-read the partition
 * table and mark the device as invalidated.
 */
static void nbd_bdev_reset(struct block_device *bdev)
{
	set_device_ro(bdev, false);
	bdev->bd_inode->i_size = 0;

	if (max_part <= 0)
		return;

	blkdev_reread_part(bdev);
	bdev->bd_invalidated = 1;
}

689 690 691 692 693 694 695
static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
{
	if (nbd->flags & NBD_FLAG_READ_ONLY)
		set_device_ro(bdev, true);
	if (nbd->flags & NBD_FLAG_SEND_TRIM)
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
	if (nbd->flags & NBD_FLAG_SEND_FLUSH)
696
		blk_queue_write_cache(nbd->disk->queue, true, false);
697
	else
698
		blk_queue_write_cache(nbd->disk->queue, false, false);
699 700
}

M
Markus Pargmann 已提交
701 702 703
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);

P
Pavel Machek 已提交
704
/* Must be called with tx_lock held */
L
Linus Torvalds 已提交
705

706
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
P
Pavel Machek 已提交
707 708
		       unsigned int cmd, unsigned long arg)
{
L
Linus Torvalds 已提交
709
	switch (cmd) {
P
Pavel Machek 已提交
710 711 712
	case NBD_DISCONNECT: {
		struct request sreq;

713
		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
714 715
		if (!nbd->sock)
			return -EINVAL;
P
Pavel Machek 已提交
716

717 718 719
		mutex_unlock(&nbd->tx_lock);
		fsync_bdev(bdev);
		mutex_lock(&nbd->tx_lock);
720
		blk_rq_init(NULL, &sreq);
721
		sreq.cmd_type = REQ_TYPE_DRV_PRIV;
722 723

		/* Check again after getting mutex back.  */
724
		if (!nbd->sock)
L
Linus Torvalds 已提交
725
			return -EINVAL;
726

727
		nbd->disconnect = true;
P
Paul Clements 已提交
728

729
		nbd_send_req(nbd, &sreq);
P
Paul Clements 已提交
730
		return 0;
P
Pavel Machek 已提交
731
	}
L
Linus Torvalds 已提交
732
 
M
Markus Pargmann 已提交
733 734
	case NBD_CLEAR_SOCK:
		sock_shutdown(nbd);
735 736
		nbd_clear_que(nbd);
		BUG_ON(!list_empty(&nbd->queue_head));
737
		BUG_ON(!list_empty(&nbd->waiting_queue));
738
		kill_bdev(bdev);
P
Pavel Machek 已提交
739 740 741
		return 0;

	case NBD_SET_SOCK: {
A
Al Viro 已提交
742
		int err;
M
Markus Pargmann 已提交
743 744 745 746 747 748 749 750 751 752
		struct socket *sock = sockfd_lookup(arg, &err);

		if (!sock)
			return err;

		err = nbd_set_socket(nbd, sock);
		if (!err && max_part)
			bdev->bd_invalidated = 1;

		return err;
P
Pavel Machek 已提交
753 754
	}

755
	case NBD_SET_BLKSIZE: {
A
Arnd Bergmann 已提交
756
		loff_t bsize = div_s64(nbd->bytesize, arg);
757 758 759

		return nbd_size_set(nbd, bdev, arg, bsize);
	}
P
Pavel Machek 已提交
760

L
Linus Torvalds 已提交
761
	case NBD_SET_SIZE:
762 763 764 765 766
		return nbd_size_set(nbd, bdev, nbd->blksize,
				    arg / nbd->blksize);

	case NBD_SET_SIZE_BLOCKS:
		return nbd_size_set(nbd, bdev, nbd->blksize, arg);
P
Pavel Machek 已提交
767

768
	case NBD_SET_TIMEOUT:
769
		nbd->xmit_timeout = arg * HZ;
M
Markus Pargmann 已提交
770 771 772 773 774 775
		if (arg)
			mod_timer(&nbd->timeout_timer,
				  jiffies + nbd->xmit_timeout);
		else
			del_timer_sync(&nbd->timeout_timer);

776
		return 0;
P
Pavel Machek 已提交
777

P
Paul Clements 已提交
778 779 780 781
	case NBD_SET_FLAGS:
		nbd->flags = arg;
		return 0;

P
Pavel Machek 已提交
782 783 784 785
	case NBD_DO_IT: {
		struct task_struct *thread;
		int error;

M
Markus Pargmann 已提交
786
		if (nbd->task_recv)
787
			return -EBUSY;
A
Al Viro 已提交
788
		if (!nbd->sock)
L
Linus Torvalds 已提交
789
			return -EINVAL;
P
Pavel Machek 已提交
790

791
		mutex_unlock(&nbd->tx_lock);
P
Pavel Machek 已提交
792

793
		nbd_parse_flags(nbd, bdev);
P
Paul Clements 已提交
794

795
		thread = kthread_run(nbd_thread_send, nbd, "%s",
M
Markus Pargmann 已提交
796
				     nbd_name(nbd));
P
Pavel Machek 已提交
797
		if (IS_ERR(thread)) {
798
			mutex_lock(&nbd->tx_lock);
799
			return PTR_ERR(thread);
P
Pavel Machek 已提交
800
		}
801

M
Markus Pargmann 已提交
802
		nbd_dev_dbg_init(nbd);
803
		error = nbd_thread_recv(nbd, bdev);
M
Markus Pargmann 已提交
804
		nbd_dev_dbg_close(nbd);
805
		kthread_stop(thread);
P
Pavel Machek 已提交
806

807
		mutex_lock(&nbd->tx_lock);
808

809
		sock_shutdown(nbd);
810
		nbd_clear_que(nbd);
811
		kill_bdev(bdev);
812 813
		nbd_bdev_reset(bdev);

P
Paul Clements 已提交
814
		if (nbd->disconnect) /* user requested, ignore socket errors */
815 816 817 818
			error = 0;
		if (nbd->timedout)
			error = -ETIMEDOUT;

819 820
		nbd_reset(nbd);

821
		return error;
P
Pavel Machek 已提交
822 823
	}

L
Linus Torvalds 已提交
824
	case NBD_CLEAR_QUE:
825 826 827 828
		/*
		 * This is for compatibility only.  The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
L
Linus Torvalds 已提交
829
		return 0;
P
Pavel Machek 已提交
830

L
Linus Torvalds 已提交
831
	case NBD_PRINT_DEBUG:
832
		dev_info(disk_to_dev(nbd->disk),
833
			"next = %p, prev = %p, head = %p\n",
834 835
			nbd->queue_head.next, nbd->queue_head.prev,
			&nbd->queue_head);
L
Linus Torvalds 已提交
836 837
		return 0;
	}
P
Pavel Machek 已提交
838 839 840 841 842 843
	return -ENOTTY;
}

/*
 * ioctl entry point of the block device. Requires CAP_SYS_ADMIN
 * (-EPERM otherwise) and runs the actual handler under tx_lock.
 */
static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *nbd = bdev->bd_disk->private_data;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(nbd->magic != NBD_MAGIC);

	mutex_lock(&nbd->tx_lock);
	ret = __nbd_ioctl(bdev, nbd, cmd, arg);
	mutex_unlock(&nbd->tx_lock);

	return ret;
}

859
static const struct block_device_operations nbd_fops =
L
Linus Torvalds 已提交
860 861
{
	.owner =	THIS_MODULE,
862
	.ioctl =	nbd_ioctl,
A
Al Viro 已提交
863
	.compat_ioctl =	nbd_ioctl,
L
Linus Torvalds 已提交
864 865
};

M
Markus Pargmann 已提交
866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927
#if IS_ENABLED(CONFIG_DEBUG_FS)

/* debugfs "tasks": print the pids of the receiver and sender, if set. */
static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;
	struct task_struct *recv = nbd->task_recv;
	struct task_struct *send = nbd->task_send;

	if (recv)
		seq_printf(s, "recv: %d\n", task_pid_nr(recv));
	if (send)
		seq_printf(s, "send: %d\n", task_pid_nr(send));

	return 0;
}

/* Open handler for "tasks"; passes the inode's private data to the show routine. */
static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
	void *priv = inode->i_private;

	return single_open(file, nbd_dbg_tasks_show, priv);
}

/* File operations of the debugfs "tasks" file (single-record seq_file). */
static const struct file_operations nbd_dbg_tasks_ops = {
	.open		= nbd_dbg_tasks_open,
	.release	= single_release,
	.read		= seq_read,
	.llseek		= seq_lseek,
};

static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;
	u32 flags = nbd->flags;

	seq_printf(s, "Hex: 0x%08x\n\n", flags);

	seq_puts(s, "Known flags:\n");

	if (flags & NBD_FLAG_HAS_FLAGS)
		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
	if (flags & NBD_FLAG_READ_ONLY)
		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
	if (flags & NBD_FLAG_SEND_FLUSH)
		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
	if (flags & NBD_FLAG_SEND_TRIM)
		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");

	return 0;
}

/* Open handler for "flags"; passes the inode's private data to the show routine. */
static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
	void *priv = inode->i_private;

	return single_open(file, nbd_dbg_flags_show, priv);
}

/* File operations of the debugfs "flags" file (single-record seq_file). */
static const struct file_operations nbd_dbg_flags_ops = {
	.open		= nbd_dbg_flags_open,
	.release	= single_release,
	.read		= seq_read,
	.llseek		= seq_lseek,
};

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	struct dentry *dir;
928 929 930

	if (!nbd_dbg_dir)
		return -EIO;
M
Markus Pargmann 已提交
931 932

	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
933 934 935 936
	if (!dir) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
			nbd_name(nbd));
		return -EIO;
M
Markus Pargmann 已提交
937 938 939
	}
	nbd->dbg_dir = dir;

940 941 942 943
	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
	debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
	debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
	debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
944
	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
M
Markus Pargmann 已提交
945 946 947 948 949 950 951 952 953 954 955 956 957 958

	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
	debugfs_remove_recursive(nbd->dbg_dir);
}

static int nbd_dbg_init(void)
{
	struct dentry *dbg_dir;

	dbg_dir = debugfs_create_dir("nbd", NULL);
959 960
	if (!dbg_dir)
		return -EIO;
M
Markus Pargmann 已提交
961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993

	nbd_dbg_dir = dbg_dir;

	return 0;
}

/* Remove the driver-wide debugfs directory and everything below it. */
static void nbd_dbg_close(void)
{
	struct dentry *root = nbd_dbg_dir;

	debugfs_remove_recursive(root);
}

#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */

/* Without CONFIG_DEBUG_FS: nothing to create, report success. */
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	return 0;
}

/* Without CONFIG_DEBUG_FS: nothing to remove. */
static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

/* Without CONFIG_DEBUG_FS: nothing to create, report success. */
static int nbd_dbg_init(void)
{
	return 0;
}

/* Without CONFIG_DEBUG_FS: nothing to remove. */
static void nbd_dbg_close(void)
{
}

#endif

L
Linus Torvalds 已提交
994 995 996 997 998 999 1000 1001 1002
/*
 * And here should be modules and kernel interface 
 *  (Just smiley confuses emacs :-)
 */

/*
 * Module initialisation: validate the module parameters, allocate
 * nbds_max devices with one disk and request queue each, register the
 * block major and finally set up and add every disk.
 *
 * Returns 0 on success or a negative errno; everything allocated so far
 * is released on failure.
 */
static int __init nbd_init(void)
{
	int part_shift = 0;
	int err = -ENOMEM;
	int i;

	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

	if (max_part < 0) {
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
		return -EINVAL;
	}

	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
	if (!nbd_dev)
		return -ENOMEM;

	for (i = 0; i < nbds_max; i++) {
		struct request_queue *q;
		struct gendisk *disk;

		disk = alloc_disk(1 << part_shift);
		if (!disk)
			goto out;
		nbd_dev[i].disk = disk;

		/*
		 * The new linux 2.5 block layer implementation requires
		 * every gendisk to have its very own request_queue struct.
		 * These structs are big so we dynamically allocate them.
		 */
		q = blk_init_queue(nbd_request_handler, &nbd_lock);
		disk->queue = q;
		if (!q) {
			put_disk(disk);
			goto out;
		}

		/*
		 * Tell the block layer that we are not a rotational device
		 */
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
		q->limits.discard_granularity = 512;
		blk_queue_max_discard_sectors(q, UINT_MAX);
		q->limits.discard_zeroes_data = 0;
		blk_queue_max_hw_sectors(q, 65536);
		q->limits.max_sectors = 256;
	}

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		err = -EIO;
		goto out;
	}

	printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);

	nbd_dbg_init();

	for (i = 0; i < nbds_max; i++) {
		struct nbd_device *nbd = &nbd_dev[i];
		struct gendisk *disk = nbd->disk;

		nbd->magic = NBD_MAGIC;
		INIT_LIST_HEAD(&nbd->waiting_queue);
		spin_lock_init(&nbd->queue_lock);
		spin_lock_init(&nbd->sock_lock);
		INIT_LIST_HEAD(&nbd->queue_head);
		mutex_init(&nbd->tx_lock);
		init_timer(&nbd->timeout_timer);
		nbd->timeout_timer.function = nbd_xmit_timeout;
		nbd->timeout_timer.data = (unsigned long)nbd;
		init_waitqueue_head(&nbd->active_wq);
		init_waitqueue_head(&nbd->waiting_wq);

		disk->major = NBD_MAJOR;
		disk->first_minor = i << part_shift;
		disk->fops = &nbd_fops;
		disk->private_data = nbd;
		sprintf(disk->disk_name, "nbd%d", i);

		nbd_reset(nbd);
		add_disk(disk);
	}

	return 0;

out:
	/* release the devices that were set up completely: 0 .. i-1 */
	while (i--) {
		blk_cleanup_queue(nbd_dev[i].disk->queue);
		put_disk(nbd_dev[i].disk);
	}
	kfree(nbd_dev);
	return err;
}

/*
 * Module exit: remove the debugfs entries, tear down every disk and its
 * queue, unregister the block major and free the device array.
 */
static void __exit nbd_cleanup(void)
{
	int i;

	nbd_dbg_close();

	for (i = 0; i < nbds_max; i++) {
		struct nbd_device *nbd = &nbd_dev[i];
		struct gendisk *disk = nbd->disk;

		nbd->magic = 0;
		if (!disk)
			continue;

		del_gendisk(disk);
		blk_cleanup_queue(disk->queue);
		put_disk(disk);
	}

	unregister_blkdev(NBD_MAJOR, "nbd");
	kfree(nbd_dev);
	printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

1131
module_param(nbds_max, int, 0444);
L
Laurent Vivier 已提交
1132 1133 1134
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");