/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * This file is released under GPLv2 or later.
 *
 * (part of code stolen from loop.c)
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <linux/mutex.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/net.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include <linux/debugfs.h>

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>

43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
struct nbd_device {
	int flags;
	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
	int magic;

	spinlock_t queue_lock;
	struct list_head queue_head;	/* Requests waiting result */
	struct request *active_req;
	wait_queue_head_t active_wq;
	struct list_head waiting_queue;	/* Requests to be sent */
	wait_queue_head_t waiting_wq;

	struct mutex tx_lock;
	struct gendisk *disk;
	int blksize;
M
Markus Pargmann 已提交
58
	loff_t bytesize;
59
	int xmit_timeout;
60
	bool disconnect; /* a disconnect has been requested by user */
M
Markus Pargmann 已提交
61 62 63 64

	struct timer_list timeout_timer;
	struct task_struct *task_recv;
	struct task_struct *task_send;
M
Markus Pargmann 已提交
65 66 67 68

#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;
#endif
69 70
};

#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;	/* top-level <debugfs>/nbd directory */
#endif

#define nbd_name(nbd) ((nbd)->disk->disk_name)

#define NBD_MAGIC 0x68797548

/* Module parameters (see MODULE_PARM_DESC at the bottom of the file). */
static unsigned int nbds_max = 16;
static struct nbd_device *nbd_dev;
static int max_part;

/*
 * Use just one lock (or at most 1 per NIC). Two arguments for this:
 * 1. Each NIC is essentially a synchronization point for all servers
 *    accessed through that NIC so there's no need to have more locks
 *    than NICs anyway.
 * 2. More locks lead to more "Dirty cache line bouncing" which will slow
 *    down each lock to the point where they're actually slower than just
 *    a single lock.
 * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
 */
static DEFINE_SPINLOCK(nbd_lock);

95
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
L
Linus Torvalds 已提交
96
{
97
	return disk_to_dev(nbd->disk);
L
Linus Torvalds 已提交
98 99 100 101 102 103 104 105
}

static const char *nbdcmd_to_ascii(int cmd)
{
	switch (cmd) {
	case  NBD_CMD_READ: return "read";
	case NBD_CMD_WRITE: return "write";
	case  NBD_CMD_DISC: return "disconnect";
A
Alex Bligh 已提交
106
	case NBD_CMD_FLUSH: return "flush";
P
Paul Clements 已提交
107
	case  NBD_CMD_TRIM: return "trim/discard";
L
Linus Torvalds 已提交
108 109 110 111
	}
	return "invalid";
}

112
static void nbd_end_request(struct nbd_device *nbd, struct request *req)
L
Linus Torvalds 已提交
113
{
114
	int error = req->errors ? -EIO : 0;
115
	struct request_queue *q = req->q;
L
Linus Torvalds 已提交
116 117
	unsigned long flags;

118 119
	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req,
		error ? "failed" : "done");
L
Linus Torvalds 已提交
120 121

	spin_lock_irqsave(q->queue_lock, flags);
122
	__blk_end_request_all(req, error);
L
Linus Torvalds 已提交
123 124 125
	spin_unlock_irqrestore(q->queue_lock, flags);
}

126 127 128
/*
 * Forcibly shutdown the socket causing all listeners to error
 */
129
static void sock_shutdown(struct nbd_device *nbd)
130
{
M
Markus Pargmann 已提交
131 132 133 134 135 136 137
	if (!nbd->sock)
		return;

	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
	kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
	nbd->sock = NULL;
	del_timer_sync(&nbd->timeout_timer);
138 139 140 141
}

static void nbd_xmit_timeout(unsigned long arg)
{
M
Markus Pargmann 已提交
142 143 144 145 146 147
	struct nbd_device *nbd = (struct nbd_device *)arg;
	struct task_struct *task;

	if (list_empty(&nbd->queue_head))
		return;

148
	nbd->disconnect = true;
M
Markus Pargmann 已提交
149 150 151 152

	task = READ_ONCE(nbd->task_recv);
	if (task)
		force_sig(SIGKILL, task);
153

M
Markus Pargmann 已提交
154 155 156 157 158
	task = READ_ONCE(nbd->task_send);
	if (task)
		force_sig(SIGKILL, nbd->task_send);

	dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n");
159 160
}

L
Linus Torvalds 已提交
161 162 163
/*
 *  Send or receive packet.
 */
164
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
L
Linus Torvalds 已提交
165 166
		int msg_flags)
{
167
	struct socket *sock = nbd->sock;
L
Linus Torvalds 已提交
168 169 170
	int result;
	struct msghdr msg;
	struct kvec iov;
171
	sigset_t blocked, oldset;
172
	unsigned long pflags = current->flags;
L
Linus Torvalds 已提交
173

174
	if (unlikely(!sock)) {
175
		dev_err(disk_to_dev(nbd->disk),
176 177
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
178 179 180
		return -EINVAL;
	}

L
Linus Torvalds 已提交
181 182
	/* Allow interception of SIGKILL only
	 * Don't allow other signals to interrupt the transmission */
183 184
	siginitsetinv(&blocked, sigmask(SIGKILL));
	sigprocmask(SIG_SETMASK, &blocked, &oldset);
L
Linus Torvalds 已提交
185

186
	current->flags |= PF_MEMALLOC;
L
Linus Torvalds 已提交
187
	do {
188
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
L
Linus Torvalds 已提交
189 190 191 192 193 194 195 196
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

M
Markus Pargmann 已提交
197
		if (send)
L
Linus Torvalds 已提交
198
			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
M
Markus Pargmann 已提交
199
		else
200 201
			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
						msg.msg_flags);
L
Linus Torvalds 已提交
202 203 204 205 206 207 208 209 210 211

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		size -= result;
		buf += result;
	} while (size > 0);

212
	sigprocmask(SIG_SETMASK, &oldset, NULL);
213
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
L
Linus Torvalds 已提交
214

M
Markus Pargmann 已提交
215 216 217
	if (!send && nbd->xmit_timeout)
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

L
Linus Torvalds 已提交
218 219 220
	return result;
}

221
static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
L
Linus Torvalds 已提交
222 223 224 225
		int flags)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
226 227
	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
			   bvec->bv_len, flags);
L
Linus Torvalds 已提交
228 229 230 231
	kunmap(bvec->bv_page);
	return result;
}

232
/* always call with the tx_lock held */
233
static int nbd_send_req(struct nbd_device *nbd, struct request *req)
L
Linus Torvalds 已提交
234
{
235
	int result, flags;
L
Linus Torvalds 已提交
236
	struct nbd_request request;
237
	unsigned long size = blk_rq_bytes(req);
C
Christoph Hellwig 已提交
238 239 240 241 242 243 244 245 246 247 248 249
	u32 type;

	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
		type = NBD_CMD_DISC;
	else if (req->cmd_flags & REQ_DISCARD)
		type = NBD_CMD_TRIM;
	else if (req->cmd_flags & REQ_FLUSH)
		type = NBD_CMD_FLUSH;
	else if (rq_data_dir(req) == WRITE)
		type = NBD_CMD_WRITE;
	else
		type = NBD_CMD_READ;
L
Linus Torvalds 已提交
250

251
	memset(&request, 0, sizeof(request));
L
Linus Torvalds 已提交
252
	request.magic = htonl(NBD_REQUEST_MAGIC);
C
Christoph Hellwig 已提交
253 254
	request.type = htonl(type);
	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
A
Alex Bligh 已提交
255 256 257
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(size);
	}
L
Linus Torvalds 已提交
258 259
	memcpy(request.handle, &req, sizeof(req));

260
	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
C
Christoph Hellwig 已提交
261
		req, nbdcmd_to_ascii(type),
262
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
263
	result = sock_xmit(nbd, 1, &request, sizeof(request),
C
Christoph Hellwig 已提交
264
			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
L
Linus Torvalds 已提交
265
	if (result <= 0) {
266
		dev_err(disk_to_dev(nbd->disk),
267
			"Send control failed (result %d)\n", result);
268
		return -EIO;
L
Linus Torvalds 已提交
269 270
	}

C
Christoph Hellwig 已提交
271
	if (type == NBD_CMD_WRITE) {
272
		struct req_iterator iter;
273
		struct bio_vec bvec;
L
Linus Torvalds 已提交
274 275 276 277
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
278
		rq_for_each_segment(bvec, req, iter) {
279
			flags = 0;
K
Kent Overstreet 已提交
280
			if (!rq_iter_last(bvec, iter))
281
				flags = MSG_MORE;
282 283
			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
				req, bvec.bv_len);
284
			result = sock_send_bvec(nbd, &bvec, flags);
285
			if (result <= 0) {
286
				dev_err(disk_to_dev(nbd->disk),
287 288
					"Send data failed (result %d)\n",
					result);
289
				return -EIO;
290
			}
L
Linus Torvalds 已提交
291 292 293 294 295
		}
	}
	return 0;
}

296
static struct request *nbd_find_request(struct nbd_device *nbd,
297
					struct request *xreq)
L
Linus Torvalds 已提交
298
{
299
	struct request *req, *tmp;
300
	int err;
L
Linus Torvalds 已提交
301

302
	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
303
	if (unlikely(err))
304
		return ERR_PTR(err);
305

306 307
	spin_lock(&nbd->queue_lock);
	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
L
Linus Torvalds 已提交
308 309 310
		if (req != xreq)
			continue;
		list_del_init(&req->queuelist);
311
		spin_unlock(&nbd->queue_lock);
L
Linus Torvalds 已提交
312 313
		return req;
	}
314
	spin_unlock(&nbd->queue_lock);
315

316
	return ERR_PTR(-ENOENT);
L
Linus Torvalds 已提交
317 318
}

319
static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
L
Linus Torvalds 已提交
320 321 322
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
323
	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
L
Linus Torvalds 已提交
324 325 326 327 328 329
			MSG_WAITALL);
	kunmap(bvec->bv_page);
	return result;
}

/* NULL returned = something went wrong, inform userspace */
330
static struct request *nbd_read_stat(struct nbd_device *nbd)
L
Linus Torvalds 已提交
331 332 333 334 335 336
{
	int result;
	struct nbd_reply reply;
	struct request *req;

	reply.magic = 0;
337
	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
L
Linus Torvalds 已提交
338
	if (result <= 0) {
339
		dev_err(disk_to_dev(nbd->disk),
340
			"Receive control failed (result %d)\n", result);
341
		return ERR_PTR(result);
L
Linus Torvalds 已提交
342
	}
343 344

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
345
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
346
				(unsigned long)ntohl(reply.magic));
347
		return ERR_PTR(-EPROTO);
348 349
	}

350
	req = nbd_find_request(nbd, *(struct request **)reply.handle);
351
	if (IS_ERR(req)) {
352 353
		result = PTR_ERR(req);
		if (result != -ENOENT)
354
			return ERR_PTR(result);
355

356
		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
357
			reply.handle);
358
		return ERR_PTR(-EBADR);
L
Linus Torvalds 已提交
359 360 361
	}

	if (ntohl(reply.error)) {
362
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
363
			ntohl(reply.error));
L
Linus Torvalds 已提交
364 365 366 367
		req->errors++;
		return req;
	}

368
	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
C
Christoph Hellwig 已提交
369
	if (rq_data_dir(req) != WRITE) {
370
		struct req_iterator iter;
371
		struct bio_vec bvec;
372 373

		rq_for_each_segment(bvec, req, iter) {
374
			result = sock_recv_bvec(nbd, &bvec);
375
			if (result <= 0) {
376
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
377
					result);
378 379 380
				req->errors++;
				return req;
			}
381 382
			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
				req, bvec.bv_len);
L
Linus Torvalds 已提交
383 384 385 386 387
		}
	}
	return req;
}

388 389
static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
390
{
391
	struct gendisk *disk = dev_to_disk(dev);
M
Markus Pargmann 已提交
392
	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
393

M
Markus Pargmann 已提交
394
	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
395 396
}

397
static struct device_attribute pid_attr = {
398
	.attr = { .name = "pid", .mode = S_IRUGO},
399 400 401
	.show = pid_show,
};

402
static int nbd_do_it(struct nbd_device *nbd)
L
Linus Torvalds 已提交
403 404
{
	struct request *req;
405
	int ret;
L
Linus Torvalds 已提交
406

407
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
408

409
	sk_set_memalloc(nbd->sock->sk);
M
Markus Pargmann 已提交
410 411 412

	nbd->task_recv = current;

413
	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
414
	if (ret) {
415
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
M
Markus Pargmann 已提交
416
		nbd->task_recv = NULL;
417 418
		return ret;
	}
419

420 421 422 423 424 425 426
	while (1) {
		req = nbd_read_stat(nbd);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

427
		nbd_end_request(nbd, req);
428
	}
429

M
Markus Pargmann 已提交
430 431
	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);

M
Markus Pargmann 已提交
432 433 434 435 436 437 438 439
	nbd->task_recv = NULL;

	if (signal_pending(current)) {
		siginfo_t info;

		ret = dequeue_signal_lock(current, &current->blocked, &info);
		dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
			 task_pid_nr(current), current->comm, ret);
440 441 442
		mutex_lock(&nbd->tx_lock);
		sock_shutdown(nbd);
		mutex_unlock(&nbd->tx_lock);
M
Markus Pargmann 已提交
443 444 445 446
		ret = -ETIMEDOUT;
	}

	return ret;
L
Linus Torvalds 已提交
447 448
}

449
static void nbd_clear_que(struct nbd_device *nbd)
L
Linus Torvalds 已提交
450 451 452
{
	struct request *req;

453
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
454

455
	/*
456
	 * Because we have set nbd->sock to NULL under the tx_lock, all
457 458 459 460 461 462
	 * modifications to the list must have completed by now.  For
	 * the same reason, the active_req must be NULL.
	 *
	 * As a consequence, we don't need to take the spin lock while
	 * purging the list here.
	 */
463 464
	BUG_ON(nbd->sock);
	BUG_ON(nbd->active_req);
465

466 467
	while (!list_empty(&nbd->queue_head)) {
		req = list_entry(nbd->queue_head.next, struct request,
468 469 470
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
471
		nbd_end_request(nbd, req);
472
	}
473 474 475 476 477 478

	while (!list_empty(&nbd->waiting_queue)) {
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
479
		nbd_end_request(nbd, req);
480
	}
481
	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
L
Linus Torvalds 已提交
482 483
}

484

485
static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
486
{
487
	if (req->cmd_type != REQ_TYPE_FS)
488 489
		goto error_out;

C
Christoph Hellwig 已提交
490 491 492 493 494
	if (rq_data_dir(req) == WRITE &&
	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
		dev_err(disk_to_dev(nbd->disk),
			"Write on read-only\n");
		goto error_out;
A
Alex Bligh 已提交
495 496
	}

497 498
	req->errors = 0;

499 500 501 502
	mutex_lock(&nbd->tx_lock);
	if (unlikely(!nbd->sock)) {
		mutex_unlock(&nbd->tx_lock);
		dev_err(disk_to_dev(nbd->disk),
503
			"Attempted send on closed socket\n");
P
Pavel Machek 已提交
504
		goto error_out;
505 506
	}

507
	nbd->active_req = req;
508

M
Markus Pargmann 已提交
509 510 511
	if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

512 513
	if (nbd_send_req(nbd, req) != 0) {
		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
514
		req->errors++;
515
		nbd_end_request(nbd, req);
516
	} else {
517
		spin_lock(&nbd->queue_lock);
518
		list_add_tail(&req->queuelist, &nbd->queue_head);
519
		spin_unlock(&nbd->queue_lock);
520 521
	}

522 523 524
	nbd->active_req = NULL;
	mutex_unlock(&nbd->tx_lock);
	wake_up_all(&nbd->active_wq);
525 526 527 528 529

	return;

error_out:
	req->errors++;
530
	nbd_end_request(nbd, req);
531 532 533 534
}

static int nbd_thread(void *data)
{
535
	struct nbd_device *nbd = data;
536 537
	struct request *req;

M
Markus Pargmann 已提交
538 539
	nbd->task_send = current;

540
	set_user_nice(current, MIN_NICE);
541
	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
542
		/* wait for something to do */
543
		wait_event_interruptible(nbd->waiting_wq,
544
					 kthread_should_stop() ||
545
					 !list_empty(&nbd->waiting_queue));
546

M
Markus Pargmann 已提交
547 548 549 550 551 552 553 554
		if (signal_pending(current)) {
			siginfo_t info;
			int ret;

			ret = dequeue_signal_lock(current, &current->blocked,
						  &info);
			dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
				 task_pid_nr(current), current->comm, ret);
555 556 557
			mutex_lock(&nbd->tx_lock);
			sock_shutdown(nbd);
			mutex_unlock(&nbd->tx_lock);
M
Markus Pargmann 已提交
558 559 560
			break;
		}

561
		/* extract request */
562
		if (list_empty(&nbd->waiting_queue))
563 564
			continue;

565 566
		spin_lock_irq(&nbd->queue_lock);
		req = list_entry(nbd->waiting_queue.next, struct request,
567 568
				 queuelist);
		list_del_init(&req->queuelist);
569
		spin_unlock_irq(&nbd->queue_lock);
570 571

		/* handle request */
572
		nbd_handle_req(nbd, req);
573
	}
M
Markus Pargmann 已提交
574 575 576

	nbd->task_send = NULL;

577 578 579
	return 0;
}

L
Linus Torvalds 已提交
580 581 582
/*
 * We always wait for result of write, for now. It would be nice to make it optional
 * in future
583
 * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
L
Linus Torvalds 已提交
584 585 586
 *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
 */

P
Pavel Machek 已提交
587
static void do_nbd_request(struct request_queue *q)
A
Alex Elder 已提交
588
		__releases(q->queue_lock) __acquires(q->queue_lock)
L
Linus Torvalds 已提交
589 590 591
{
	struct request *req;
	
592
	while ((req = blk_fetch_request(q)) != NULL) {
593
		struct nbd_device *nbd;
L
Linus Torvalds 已提交
594

595 596
		spin_unlock_irq(q->queue_lock);

597
		nbd = req->rq_disk->private_data;
L
Linus Torvalds 已提交
598

599
		BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
600

601 602 603
		dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n",
			req, req->cmd_type);

604 605
		if (unlikely(!nbd->sock)) {
			dev_err(disk_to_dev(nbd->disk),
606
				"Attempted send on closed socket\n");
607
			req->errors++;
608
			nbd_end_request(nbd, req);
609 610 611 612
			spin_lock_irq(q->queue_lock);
			continue;
		}

613 614 615
		spin_lock_irq(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->waiting_queue);
		spin_unlock_irq(&nbd->queue_lock);
L
Linus Torvalds 已提交
616

617
		wake_up(&nbd->waiting_wq);
618

L
Linus Torvalds 已提交
619 620 621 622
		spin_lock_irq(q->queue_lock);
	}
}

M
Markus Pargmann 已提交
623 624 625
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);

P
Pavel Machek 已提交
626
/* Must be called with tx_lock held */
L
Linus Torvalds 已提交
627

628
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
P
Pavel Machek 已提交
629 630
		       unsigned int cmd, unsigned long arg)
{
L
Linus Torvalds 已提交
631
	switch (cmd) {
P
Pavel Machek 已提交
632 633 634
	case NBD_DISCONNECT: {
		struct request sreq;

635
		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
636 637
		if (!nbd->sock)
			return -EINVAL;
P
Pavel Machek 已提交
638

639 640 641
		mutex_unlock(&nbd->tx_lock);
		fsync_bdev(bdev);
		mutex_lock(&nbd->tx_lock);
642
		blk_rq_init(NULL, &sreq);
643
		sreq.cmd_type = REQ_TYPE_DRV_PRIV;
644 645

		/* Check again after getting mutex back.  */
646
		if (!nbd->sock)
L
Linus Torvalds 已提交
647
			return -EINVAL;
648

649
		nbd->disconnect = true;
P
Paul Clements 已提交
650

651
		nbd_send_req(nbd, &sreq);
P
Paul Clements 已提交
652
		return 0;
P
Pavel Machek 已提交
653
	}
L
Linus Torvalds 已提交
654
 
P
Pavel Machek 已提交
655
	case NBD_CLEAR_SOCK: {
A
Al Viro 已提交
656
		struct socket *sock = nbd->sock;
657 658 659
		nbd->sock = NULL;
		nbd_clear_que(nbd);
		BUG_ON(!list_empty(&nbd->queue_head));
660
		BUG_ON(!list_empty(&nbd->waiting_queue));
661
		kill_bdev(bdev);
A
Al Viro 已提交
662 663
		if (sock)
			sockfd_put(sock);
P
Pavel Machek 已提交
664 665 666 667
		return 0;
	}

	case NBD_SET_SOCK: {
A
Al Viro 已提交
668 669 670
		struct socket *sock;
		int err;
		if (nbd->sock)
L
Linus Torvalds 已提交
671
			return -EBUSY;
A
Al Viro 已提交
672 673 674 675 676
		sock = sockfd_lookup(arg, &err);
		if (sock) {
			nbd->sock = sock;
			if (max_part > 0)
				bdev->bd_invalidated = 1;
677
			nbd->disconnect = false; /* we're connected now */
A
Al Viro 已提交
678
			return 0;
L
Linus Torvalds 已提交
679
		}
P
Pavel Machek 已提交
680 681 682
		return -EINVAL;
	}

L
Linus Torvalds 已提交
683
	case NBD_SET_BLKSIZE:
684 685 686 687 688
		nbd->blksize = arg;
		nbd->bytesize &= ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
689
		return 0;
P
Pavel Machek 已提交
690

L
Linus Torvalds 已提交
691
	case NBD_SET_SIZE:
692 693 694 695
		nbd->bytesize = arg & ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
696
		return 0;
P
Pavel Machek 已提交
697

698
	case NBD_SET_TIMEOUT:
699
		nbd->xmit_timeout = arg * HZ;
M
Markus Pargmann 已提交
700 701 702 703 704 705
		if (arg)
			mod_timer(&nbd->timeout_timer,
				  jiffies + nbd->xmit_timeout);
		else
			del_timer_sync(&nbd->timeout_timer);

706
		return 0;
P
Pavel Machek 已提交
707

P
Paul Clements 已提交
708 709 710 711
	case NBD_SET_FLAGS:
		nbd->flags = arg;
		return 0;

L
Linus Torvalds 已提交
712
	case NBD_SET_SIZE_BLOCKS:
713 714 715 716
		nbd->bytesize = ((u64) arg) * nbd->blksize;
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
717
		return 0;
P
Pavel Machek 已提交
718 719 720

	case NBD_DO_IT: {
		struct task_struct *thread;
A
Al Viro 已提交
721
		struct socket *sock;
P
Pavel Machek 已提交
722 723
		int error;

M
Markus Pargmann 已提交
724
		if (nbd->task_recv)
725
			return -EBUSY;
A
Al Viro 已提交
726
		if (!nbd->sock)
L
Linus Torvalds 已提交
727
			return -EINVAL;
P
Pavel Machek 已提交
728

729
		mutex_unlock(&nbd->tx_lock);
P
Pavel Machek 已提交
730

731 732
		if (nbd->flags & NBD_FLAG_READ_ONLY)
			set_device_ro(bdev, true);
P
Paul Clements 已提交
733 734 735
		if (nbd->flags & NBD_FLAG_SEND_TRIM)
			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
				nbd->disk->queue);
A
Alex Bligh 已提交
736 737 738 739
		if (nbd->flags & NBD_FLAG_SEND_FLUSH)
			blk_queue_flush(nbd->disk->queue, REQ_FLUSH);
		else
			blk_queue_flush(nbd->disk->queue, 0);
P
Paul Clements 已提交
740

741
		thread = kthread_run(nbd_thread, nbd, "%s",
M
Markus Pargmann 已提交
742
				     nbd_name(nbd));
P
Pavel Machek 已提交
743
		if (IS_ERR(thread)) {
744
			mutex_lock(&nbd->tx_lock);
745
			return PTR_ERR(thread);
P
Pavel Machek 已提交
746
		}
747

M
Markus Pargmann 已提交
748
		nbd_dev_dbg_init(nbd);
749
		error = nbd_do_it(nbd);
M
Markus Pargmann 已提交
750
		nbd_dev_dbg_close(nbd);
751
		kthread_stop(thread);
P
Pavel Machek 已提交
752

753
		mutex_lock(&nbd->tx_lock);
754

755
		sock_shutdown(nbd);
A
Al Viro 已提交
756 757
		sock = nbd->sock;
		nbd->sock = NULL;
758
		nbd_clear_que(nbd);
759
		kill_bdev(bdev);
P
Paul Clements 已提交
760
		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
761
		set_device_ro(bdev, false);
A
Al Viro 已提交
762 763
		if (sock)
			sockfd_put(sock);
A
Alex Bligh 已提交
764
		nbd->flags = 0;
765
		nbd->bytesize = 0;
A
Al Viro 已提交
766
		bdev->bd_inode->i_size = 0;
767
		set_capacity(nbd->disk, 0);
L
Laurent Vivier 已提交
768
		if (max_part > 0)
769
			blkdev_reread_part(bdev);
P
Paul Clements 已提交
770 771
		if (nbd->disconnect) /* user requested, ignore socket errors */
			return 0;
772
		return error;
P
Pavel Machek 已提交
773 774
	}

L
Linus Torvalds 已提交
775
	case NBD_CLEAR_QUE:
776 777 778 779
		/*
		 * This is for compatibility only.  The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
L
Linus Torvalds 已提交
780
		return 0;
P
Pavel Machek 已提交
781

L
Linus Torvalds 已提交
782
	case NBD_PRINT_DEBUG:
783
		dev_info(disk_to_dev(nbd->disk),
784
			"next = %p, prev = %p, head = %p\n",
785 786
			nbd->queue_head.next, nbd->queue_head.prev,
			&nbd->queue_head);
L
Linus Torvalds 已提交
787 788
		return 0;
	}
P
Pavel Machek 已提交
789 790 791 792 793 794
	return -ENOTTY;
}

/* ioctl entry point: privilege check, then dispatch under tx_lock. */
static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *nbd = bdev->bd_disk->private_data;
	int error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(nbd->magic != NBD_MAGIC);

	mutex_lock(&nbd->tx_lock);
	error = __nbd_ioctl(bdev, nbd, cmd, arg);
	mutex_unlock(&nbd->tx_lock);

	return error;
}

810
static const struct block_device_operations nbd_fops =
L
Linus Torvalds 已提交
811 812
{
	.owner =	THIS_MODULE,
813
	.ioctl =	nbd_ioctl,
L
Linus Torvalds 已提交
814 815
};

M
Markus Pargmann 已提交
816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970
#if IS_ENABLED(CONFIG_DEBUG_FS)

/* debugfs "tasks" file: print pids of the receiver and sender tasks. */
static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;

	if (nbd->task_recv)
		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
	if (nbd->task_send)
		seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));

	return 0;
}

/* seq_file open hook for the "tasks" debugfs entry. */
static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
}

/* File operations for <debugfs>/nbd/<dev>/tasks (read-only seq_file). */
static const struct file_operations nbd_dbg_tasks_ops = {
	.open = nbd_dbg_tasks_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* debugfs "flags" file: decode nbd->flags into known NBD_FLAG_* names. */
static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;
	u32 flags = nbd->flags;

	seq_printf(s, "Hex: 0x%08x\n\n", flags);

	seq_puts(s, "Known flags:\n");

	if (flags & NBD_FLAG_HAS_FLAGS)
		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
	if (flags & NBD_FLAG_READ_ONLY)
		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
	if (flags & NBD_FLAG_SEND_FLUSH)
		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
	if (flags & NBD_FLAG_SEND_TRIM)
		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");

	return 0;
}

/* seq_file open hook for the "flags" debugfs entry. */
static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_flags_show, inode->i_private);
}

/* File operations for <debugfs>/nbd/<dev>/flags (read-only seq_file). */
static const struct file_operations nbd_dbg_flags_ops = {
	.open = nbd_dbg_flags_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/*
 * Create the per-device debugfs directory and its entries.
 * Called from NBD_DO_IT before the receiver loop starts; torn down
 * by nbd_dev_dbg_close().
 */
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	struct dentry *dir;
	struct dentry *f;

	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
	if (IS_ERR_OR_NULL(dir)) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s' (%ld)\n",
			nbd_name(nbd), PTR_ERR(dir));
		return PTR_ERR(dir);
	}
	nbd->dbg_dir = dir;

	f = debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
	if (IS_ERR_OR_NULL(f)) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'tasks', %ld\n",
			PTR_ERR(f));
		return PTR_ERR(f);
	}

	f = debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
	if (IS_ERR_OR_NULL(f)) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'size_bytes', %ld\n",
			PTR_ERR(f));
		return PTR_ERR(f);
	}

	f = debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
	if (IS_ERR_OR_NULL(f)) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'timeout', %ld\n",
			PTR_ERR(f));
		return PTR_ERR(f);
	}

	f = debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
	if (IS_ERR_OR_NULL(f)) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'blocksize', %ld\n",
			PTR_ERR(f));
		return PTR_ERR(f);
	}

	/*
	 * Pass the device itself, not &nbd: nbd_dbg_flags_show() reads
	 * s->private as a struct nbd_device *, and &nbd is the address of
	 * a stack local that dangles once this function returns.
	 */
	f = debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
	if (IS_ERR_OR_NULL(f)) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'flags', %ld\n",
			PTR_ERR(f));
		return PTR_ERR(f);
	}

	return 0;
}

/* Remove the per-device debugfs directory and everything under it. */
static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
	debugfs_remove_recursive(nbd->dbg_dir);
}

/* Create the top-level <debugfs>/nbd directory at module init. */
static int nbd_dbg_init(void)
{
	struct dentry *dbg_dir;

	dbg_dir = debugfs_create_dir("nbd", NULL);
	if (IS_ERR(dbg_dir))
		return PTR_ERR(dbg_dir);

	nbd_dbg_dir = dbg_dir;

	return 0;
}

/* Remove the top-level <debugfs>/nbd directory at module exit. */
static void nbd_dbg_close(void)
{
	debugfs_remove_recursive(nbd_dbg_dir);
}

#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */

/* No-op stubs used when debugfs support is compiled out. */
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

static int nbd_dbg_init(void)
{
	return 0;
}

static void nbd_dbg_close(void)
{
}

#endif

L
Linus Torvalds 已提交
971 972 973 974 975 976 977 978 979
/*
 * And here should be modules and kernel interface 
 *  (Just smiley confuses emacs :-)
 */

static int __init nbd_init(void)
{
	int err = -ENOMEM;
	int i;
	int part_shift;

	/* The on-wire request header layout must not change. */
	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

	if (max_part < 0) {
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
		return -EINVAL;
	}

	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
	if (!nbd_dev)
		return -ENOMEM;

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = alloc_disk(1 << part_shift);
		if (!disk)
			goto out;
		nbd_dev[i].disk = disk;
		/*
		 * The new linux 2.5 block layer implementation requires
		 * every gendisk to have its very own request_queue struct.
		 * These structs are big so we dynamically allocate them.
		 */
		disk->queue = blk_init_queue(do_nbd_request, &nbd_lock);
		if (!disk->queue) {
			put_disk(disk);
			goto out;
		}
		/*
		 * Tell the block layer that we are not a rotational device
		 */
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
		disk->queue->limits.discard_granularity = 512;
		blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
		disk->queue->limits.discard_zeroes_data = 0;
		blk_queue_max_hw_sectors(disk->queue, 65536);
		disk->queue->limits.max_sectors = 256;
	}

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		err = -EIO;
		goto out;
	}

	printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);

	nbd_dbg_init();

	/* Second pass: initialise per-device state and publish the disks. */
	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		nbd_dev[i].magic = NBD_MAGIC;
		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
		spin_lock_init(&nbd_dev[i].queue_lock);
		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
		mutex_init(&nbd_dev[i].tx_lock);
		init_timer(&nbd_dev[i].timeout_timer);
		nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
		nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
		init_waitqueue_head(&nbd_dev[i].active_wq);
		init_waitqueue_head(&nbd_dev[i].waiting_wq);
		nbd_dev[i].blksize = 1024;
		nbd_dev[i].bytesize = 0;
		disk->major = NBD_MAJOR;
		disk->first_minor = i << part_shift;
		disk->fops = &nbd_fops;
		disk->private_data = &nbd_dev[i];
		sprintf(disk->disk_name, "nbd%d", i);
		set_capacity(disk, 0);
		add_disk(disk);
	}

	return 0;
out:
	while (i--) {
		blk_cleanup_queue(nbd_dev[i].disk->queue);
		put_disk(nbd_dev[i].disk);
	}
	kfree(nbd_dev);
	return err;
}

/* Module exit: tear down debugfs, disks, and the block major. */
static void __exit nbd_cleanup(void)
{
	int i;

	nbd_dbg_close();

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		nbd_dev[i].magic = 0;
		if (disk) {
			del_gendisk(disk);
			blk_cleanup_queue(disk->queue);
			put_disk(disk);
		}
	}
	unregister_blkdev(NBD_MAJOR, "nbd");
	kfree(nbd_dev);
	printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

/* Both parameters are read-only after module load (mode 0444). */
module_param(nbds_max, int, 0444);
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");