/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * This file is released under GPLv2 or later.
 *
 * (part of code stolen from loop.c)
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
27
#include <linux/mutex.h>
28 29 30
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
31
#include <linux/slab.h>
L
Linus Torvalds 已提交
32
#include <net/sock.h>
33
#include <linux/net.h>
34
#include <linux/kthread.h>
M
Markus Pargmann 已提交
35
#include <linux/types.h>
M
Markus Pargmann 已提交
36
#include <linux/debugfs.h>
L
Linus Torvalds 已提交
37 38 39 40 41 42

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>

43
/*
 * Per-device state for one /dev/nbdX instance.  One of these is allocated
 * for each minor in nbd_init() (nbd_dev array).
 */
struct nbd_device {
	u32 flags;		/* NBD_FLAG_* bits set via NBD_SET_FLAGS ioctl */
	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
	int magic;		/* NBD_MAGIC while the device is live; sanity-checked */

	spinlock_t queue_lock;		/* protects queue_head and waiting_queue */
	struct list_head queue_head;	/* Requests waiting result */
	struct request *active_req;	/* request currently being transmitted */
	wait_queue_head_t active_wq;	/* woken when active_req changes */
	struct list_head waiting_queue;	/* Requests to be sent */
	wait_queue_head_t waiting_wq;	/* woken when waiting_queue is non-empty */

	struct mutex tx_lock;	/* serialises transmission on the socket */
	struct gendisk *disk;
	int blksize;		/* block size set via NBD_SET_BLKSIZE (default 1024) */
	loff_t bytesize;	/* device size in bytes */
	int xmit_timeout;	/* receive/transmit timeout in jiffies; 0 = disabled */
	/* a disconnect has been requested by user (also set on xmit timeout) */
	bool disconnect;

	struct timer_list timeout_timer;	/* fires nbd_xmit_timeout() */
	/* protects initialization and shutdown of the socket */
	spinlock_t sock_lock;
	struct task_struct *task_recv;	/* thread executing nbd_thread_recv() */
	struct task_struct *task_send;	/* kthread executing nbd_thread_send() */

#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;		/* per-device debugfs directory */
#endif
};

M
Markus Pargmann 已提交
73 74 75 76 77 78
#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif

#define nbd_name(nbd) ((nbd)->disk->disk_name)

79
#define NBD_MAGIC 0x68797548
L
Linus Torvalds 已提交
80

81
static unsigned int nbds_max = 16;
82
static struct nbd_device *nbd_dev;
L
Laurent Vivier 已提交
83
static int max_part;
L
Linus Torvalds 已提交
84 85 86 87 88 89 90 91 92 93 94 95 96

/*
 * Use just one lock (or at most 1 per NIC). Two arguments for this:
 * 1. Each NIC is essentially a synchronization point for all servers
 *    accessed through that NIC so there's no need to have more locks
 *    than NICs anyway.
 * 2. More locks lead to more "Dirty cache line bouncing" which will slow
 *    down each lock to the point where they're actually slower than just
 *    a single lock.
 * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
 */
static DEFINE_SPINLOCK(nbd_lock);

97
/* Return the struct device embedded in this nbd device's gendisk. */
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
{
	return disk_to_dev(nbd->disk);
}

static const char *nbdcmd_to_ascii(int cmd)
{
	switch (cmd) {
	case  NBD_CMD_READ: return "read";
	case NBD_CMD_WRITE: return "write";
	case  NBD_CMD_DISC: return "disconnect";
A
Alex Bligh 已提交
108
	case NBD_CMD_FLUSH: return "flush";
P
Paul Clements 已提交
109
	case  NBD_CMD_TRIM: return "trim/discard";
L
Linus Torvalds 已提交
110 111 112 113
	}
	return "invalid";
}

114
/*
 * Complete a request back to the block layer.  req->errors != 0 maps to
 * -EIO; otherwise the request completed successfully.  The queue lock must
 * be held for __blk_end_request_all(), so take it here (callers run in
 * process context without it).
 */
static void nbd_end_request(struct nbd_device *nbd, struct request *req)
{
	int error = req->errors ? -EIO : 0;
	struct request_queue *q = req->q;
	unsigned long flags;

	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req,
		error ? "failed" : "done");

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_end_request_all(req, error);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

128 129 130
/*
 * Forcibly shutdown the socket causing all listeners to error
 */
static void sock_shutdown(struct nbd_device *nbd)
{
	/* sock_lock guards installation/teardown of nbd->sock */
	spin_lock_irq(&nbd->sock_lock);

	if (!nbd->sock) {
		spin_unlock_irq(&nbd->sock_lock);
		return;
	}

	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
	kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
	sockfd_put(nbd->sock);
	nbd->sock = NULL;
	spin_unlock_irq(&nbd->sock_lock);

	/* no socket left, so the xmit timeout has nothing to watch */
	del_timer(&nbd->timeout_timer);
}

/*
 * Timer callback (timer context): fires when no reply arrived within
 * xmit_timeout.  Shuts the socket down so blocked senders/receivers
 * error out; flags the device as disconnecting.
 */
static void nbd_xmit_timeout(unsigned long arg)
{
	struct nbd_device *nbd = (struct nbd_device *)arg;
	unsigned long flags;

	/* nothing outstanding -> nothing timed out */
	if (list_empty(&nbd->queue_head))
		return;

	nbd->disconnect = true;

	/* irqsave: we may run in softirq context from the timer */
	spin_lock_irqsave(&nbd->sock_lock, flags);

	if (nbd->sock)
		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);

	spin_unlock_irqrestore(&nbd->sock_lock, flags);

	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
}

L
Linus Torvalds 已提交
170 171 172
/*
 *  Send or receive packet.
 *
 *  Transfers exactly @size bytes (looping over partial transfers) over
 *  nbd->sock.  @send selects kernel_sendmsg (non-zero) vs kernel_recvmsg.
 *  Returns the last transfer result: >0 on success, 0 mapped to -EPIPE
 *  (peer closed), or a negative errno.  Runs with PF_MEMALLOC set so that
 *  memory reclaim via this device cannot deadlock on allocation.
 */
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
		int msg_flags)
{
	struct socket *sock = nbd->sock;
	int result;
	struct msghdr msg;
	struct kvec iov;
	unsigned long pflags = current->flags;

	if (unlikely(!sock)) {
		dev_err(disk_to_dev(nbd->disk),
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
		return -EINVAL;
	}

	current->flags |= PF_MEMALLOC;
	do {
		/* allocations on behalf of this socket must not recurse into I/O */
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

		if (send)
			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
		else
			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
						msg.msg_flags);

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		size -= result;
		buf += result;
	} while (size > 0);

	/* restore the caller's original PF_MEMALLOC state */
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	/* data arrived: push the watchdog out by another timeout period */
	if (!send && nbd->xmit_timeout)
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

	return result;
}

223
static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
L
Linus Torvalds 已提交
224 225 226 227
		int flags)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
228 229
	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
			   bvec->bv_len, flags);
L
Linus Torvalds 已提交
230 231 232 233
	kunmap(bvec->bv_page);
	return result;
}

234
/* always call with the tx_lock held */
/*
 * Build and transmit one NBD protocol request (header plus, for writes,
 * the data payload).  The kernel request pointer itself is used as the
 * 8-byte wire handle so the reply can be matched back in nbd_read_stat().
 * Returns 0 on success, -EIO on any transmission failure.
 */
static int nbd_send_req(struct nbd_device *nbd, struct request *req)
{
	int result, flags;
	struct nbd_request request;
	unsigned long size = blk_rq_bytes(req);
	u32 type;

	/* map block-layer request attributes onto the NBD command set */
	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
		type = NBD_CMD_DISC;
	else if (req->cmd_flags & REQ_DISCARD)
		type = NBD_CMD_TRIM;
	else if (req->cmd_flags & REQ_FLUSH)
		type = NBD_CMD_FLUSH;
	else if (rq_data_dir(req) == WRITE)
		type = NBD_CMD_WRITE;
	else
		type = NBD_CMD_READ;

	memset(&request, 0, sizeof(request));
	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(type);
	/* flush and disconnect carry no offset/length on the wire */
	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(size);
	}
	memcpy(request.handle, &req, sizeof(req));

	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
		req, nbdcmd_to_ascii(type),
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
	/* MSG_MORE: for writes the payload follows the header immediately */
	result = sock_xmit(nbd, 1, &request, sizeof(request),
			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
	if (result <= 0) {
		dev_err(disk_to_dev(nbd->disk),
			"Send control failed (result %d)\n", result);
		return -EIO;
	}

	if (type == NBD_CMD_WRITE) {
		struct req_iterator iter;
		struct bio_vec bvec;
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
		rq_for_each_segment(bvec, req, iter) {
			flags = 0;
			/* keep MSG_MORE on every segment but the last */
			if (!rq_iter_last(bvec, iter))
				flags = MSG_MORE;
			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
				req, bvec.bv_len);
			result = sock_send_bvec(nbd, &bvec, flags);
			if (result <= 0) {
				dev_err(disk_to_dev(nbd->disk),
					"Send data failed (result %d)\n",
					result);
				return -EIO;
			}
		}
	}
	return 0;
}

298
/*
 * Look up the in-flight request matching a reply handle and remove it
 * from queue_head.  First waits (interruptibly) until @xreq is no longer
 * the request being transmitted, so a reply cannot race with its own send.
 * Returns the request, ERR_PTR(-ENOENT) if it is not queued, or
 * ERR_PTR(-ERESTARTSYS) if the wait was interrupted by a signal.
 */
static struct request *nbd_find_request(struct nbd_device *nbd,
					struct request *xreq)
{
	struct request *req, *tmp;
	int err;

	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
	if (unlikely(err))
		return ERR_PTR(err);

	spin_lock(&nbd->queue_lock);
	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
		if (req != xreq)
			continue;
		list_del_init(&req->queuelist);
		spin_unlock(&nbd->queue_lock);
		return req;
	}
	spin_unlock(&nbd->queue_lock);

	return ERR_PTR(-ENOENT);
}

321
static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
L
Linus Torvalds 已提交
322 323 324
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
325
	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
L
Linus Torvalds 已提交
326 327 328 329 330 331
			MSG_WAITALL);
	kunmap(bvec->bv_page);
	return result;
}

/* NULL returned = something went wrong, inform userspace */
/*
 * Receive one reply from the server: read the nbd_reply header, validate
 * its magic, match it to an in-flight request via the handle, and - for
 * reads - pull the data payload into the request's pages.  Returns the
 * completed request (with req->errors bumped on server-side error) or an
 * ERR_PTR on protocol/socket failure.
 */
static struct request *nbd_read_stat(struct nbd_device *nbd)
{
	int result;
	struct nbd_reply reply;
	struct request *req;

	reply.magic = 0;
	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
	if (result <= 0) {
		dev_err(disk_to_dev(nbd->disk),
			"Receive control failed (result %d)\n", result);
		return ERR_PTR(result);
	}

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
				(unsigned long)ntohl(reply.magic));
		return ERR_PTR(-EPROTO);
	}

	/* the wire handle carries the struct request pointer verbatim */
	req = nbd_find_request(nbd, *(struct request **)reply.handle);
	if (IS_ERR(req)) {
		result = PTR_ERR(req);
		/* -ENOENT: reply for a request we never queued */
		if (result != -ENOENT)
			return ERR_PTR(result);

		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
			reply.handle);
		return ERR_PTR(-EBADR);
	}

	if (ntohl(reply.error)) {
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
			ntohl(reply.error));
		req->errors++;
		return req;
	}

	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
	if (rq_data_dir(req) != WRITE) {
		struct req_iterator iter;
		struct bio_vec bvec;

		rq_for_each_segment(bvec, req, iter) {
			result = sock_recv_bvec(nbd, &bvec);
			if (result <= 0) {
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
					result);
				req->errors++;
				return req;
			}
			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
				req, bvec.bv_len);
		}
	}
	return req;
}

390 391
/*
 * sysfs "pid" attribute: report the pid of the receiver thread
 * (the NBD_DO_IT caller) so userspace tools can find it.
 */
static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;

	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
}

/* read-only sysfs attribute, registered while a receiver is attached */
static struct device_attribute pid_attr = {
	.attr = { .name = "pid", .mode = S_IRUGO},
	.show = pid_show,
};

404
/*
 * Receiver loop, run in the context of the NBD_DO_IT ioctl caller.
 * Publishes the caller as task_recv and the sysfs "pid" attribute,
 * then completes replies until the socket errors out.  Returns the
 * terminating error from nbd_read_stat().
 */
static int nbd_thread_recv(struct nbd_device *nbd)
{
	struct request *req;
	int ret;

	BUG_ON(nbd->magic != NBD_MAGIC);

	/* let this socket dip into memory reserves (swap-over-nbd scenario) */
	sk_set_memalloc(nbd->sock->sk);

	nbd->task_recv = current;

	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
	if (ret) {
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");

		nbd->task_recv = NULL;

		return ret;
	}

	/* loop until the socket is shut down or a protocol error occurs */
	while (1) {
		req = nbd_read_stat(nbd);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

		nbd_end_request(nbd, req);
	}

	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);

	nbd->task_recv = NULL;

	return ret;
}

441
/*
 * Fail every request still sitting on queue_head (sent, awaiting reply)
 * and waiting_queue (never sent) with -EIO.  Only legal once the socket
 * has been torn down - see the invariants asserted below.
 */
static void nbd_clear_que(struct nbd_device *nbd)
{
	struct request *req;

	BUG_ON(nbd->magic != NBD_MAGIC);

	/*
	 * Because we have set nbd->sock to NULL under the tx_lock, all
	 * modifications to the list must have completed by now.  For
	 * the same reason, the active_req must be NULL.
	 *
	 * As a consequence, we don't need to take the spin lock while
	 * purging the list here.
	 */
	BUG_ON(nbd->sock);
	BUG_ON(nbd->active_req);

	while (!list_empty(&nbd->queue_head)) {
		req = list_entry(nbd->queue_head.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
		nbd_end_request(nbd, req);
	}

	while (!list_empty(&nbd->waiting_queue)) {
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
		nbd_end_request(nbd, req);
	}
	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}

476

477
/*
 * Transmit one dequeued request to the server (called from the sender
 * thread).  On success the request moves to queue_head to await its
 * reply; on any failure it is completed with -EIO.
 */
static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
{
	/* only regular filesystem requests are forwarded */
	if (req->cmd_type != REQ_TYPE_FS)
		goto error_out;

	if (rq_data_dir(req) == WRITE &&
	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
		dev_err(disk_to_dev(nbd->disk),
			"Write on read-only\n");
		goto error_out;
	}

	req->errors = 0;

	/* tx_lock serialises the whole header+payload transmission */
	mutex_lock(&nbd->tx_lock);
	if (unlikely(!nbd->sock)) {
		mutex_unlock(&nbd->tx_lock);
		dev_err(disk_to_dev(nbd->disk),
			"Attempted send on closed socket\n");
		goto error_out;
	}

	/* mark in-transit so nbd_find_request() won't grab it mid-send */
	nbd->active_req = req;

	/* (re)arm the watchdog when this is the first outstanding request */
	if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

	if (nbd_send_req(nbd, req) != 0) {
		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
		req->errors++;
		nbd_end_request(nbd, req);
	} else {
		spin_lock(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->queue_head);
		spin_unlock(&nbd->queue_lock);
	}

	nbd->active_req = NULL;
	mutex_unlock(&nbd->tx_lock);
	/* let any receiver blocked in nbd_find_request() proceed */
	wake_up_all(&nbd->active_wq);

	return;

error_out:
	req->errors++;
	nbd_end_request(nbd, req);
}

525
/*
 * Sender kthread: drains waiting_queue, handing each request to
 * nbd_handle_req().  Keeps running after kthread_should_stop() until
 * the queue is empty so no accepted request is silently dropped.
 */
static int nbd_thread_send(void *data)
{
	struct nbd_device *nbd = data;
	struct request *req;

	nbd->task_send = current;

	set_user_nice(current, MIN_NICE);
	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
		/* wait for something to do */
		wait_event_interruptible(nbd->waiting_wq,
					 kthread_should_stop() ||
					 !list_empty(&nbd->waiting_queue));

		/* extract request */
		if (list_empty(&nbd->waiting_queue))
			continue;

		spin_lock_irq(&nbd->queue_lock);
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		spin_unlock_irq(&nbd->queue_lock);

		/* handle request */
		nbd_handle_req(nbd, req);
	}

	nbd->task_send = NULL;

	return 0;
}

L
Linus Torvalds 已提交
558 559 560
/*
 * We always wait for result of write, for now. It would be nice to make it optional
 * in future
 * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
 *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
 */

/*
 * Block-layer request_fn.  Entered with q->queue_lock held; it is
 * dropped while each request is moved onto waiting_queue for the
 * sender thread, then retaken before fetching the next one (hence the
 * __releases/__acquires sparse annotations).
 */
static void nbd_request_handler(struct request_queue *q)
		__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct request *req;
	
	while ((req = blk_fetch_request(q)) != NULL) {
		struct nbd_device *nbd;

		spin_unlock_irq(q->queue_lock);

		nbd = req->rq_disk->private_data;

		BUG_ON(nbd->magic != NBD_MAGIC);

		dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n",
			req, req->cmd_type);

		/* no connected socket: fail fast instead of queueing */
		if (unlikely(!nbd->sock)) {
			dev_err(disk_to_dev(nbd->disk),
				"Attempted send on closed socket\n");
			req->errors++;
			nbd_end_request(nbd, req);
			spin_lock_irq(q->queue_lock);
			continue;
		}

		spin_lock_irq(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->waiting_queue);
		spin_unlock_irq(&nbd->queue_lock);

		/* kick the sender thread */
		wake_up(&nbd->waiting_wq);

		spin_lock_irq(q->queue_lock);
	}
}

M
Markus Pargmann 已提交
601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619
/*
 * Install @sock as the device's transport under sock_lock.
 * Returns 0 on success or -EBUSY when a socket is already attached.
 */
static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
{
	int err = 0;

	spin_lock_irq(&nbd->sock_lock);
	if (nbd->sock)
		err = -EBUSY;
	else
		nbd->sock = sock;
	spin_unlock_irq(&nbd->sock_lock);

	return err;
}

M
Markus Pargmann 已提交
620 621 622
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);

P
Pavel Machek 已提交
623
/* Must be called with tx_lock held */
/*
 * Worker for nbd_ioctl(): implements the NBD_* device ioctls.
 * NBD_DISCONNECT and NBD_DO_IT drop and retake tx_lock internally -
 * state must be revalidated after each retake.  Returns 0 or a
 * negative errno; -ENOTTY for unknown commands.
 */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
		       unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case NBD_DISCONNECT: {
		/* on-stack request used purely to carry NBD_CMD_DISC */
		struct request sreq;

		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
		if (!nbd->sock)
			return -EINVAL;

		/* flush dirty pages before asking the server to go away */
		mutex_unlock(&nbd->tx_lock);
		fsync_bdev(bdev);
		mutex_lock(&nbd->tx_lock);
		blk_rq_init(NULL, &sreq);
		sreq.cmd_type = REQ_TYPE_DRV_PRIV;

		/* Check again after getting mutex back.  */
		if (!nbd->sock)
			return -EINVAL;

		nbd->disconnect = true;

		nbd_send_req(nbd, &sreq);
		return 0;
	}

	case NBD_CLEAR_SOCK:
		sock_shutdown(nbd);
		nbd_clear_que(nbd);
		BUG_ON(!list_empty(&nbd->queue_head));
		BUG_ON(!list_empty(&nbd->waiting_queue));
		kill_bdev(bdev);
		return 0;

	case NBD_SET_SOCK: {
		int err;
		struct socket *sock = sockfd_lookup(arg, &err);

		if (!sock)
			return err;

		err = nbd_set_socket(nbd, sock);
		/* force partition rescan on next open */
		if (!err && max_part)
			bdev->bd_invalidated = 1;

		return err;
	}

	case NBD_SET_BLKSIZE:
		nbd->blksize = arg;
		/* round the existing size down to the new block size */
		nbd->bytesize &= ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
		return 0;

	case NBD_SET_SIZE:
		/* arg is a byte count, truncated to a blksize multiple */
		nbd->bytesize = arg & ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
		return 0;

	case NBD_SET_TIMEOUT:
		/* arg is in seconds; stored in jiffies */
		nbd->xmit_timeout = arg * HZ;
		if (arg)
			mod_timer(&nbd->timeout_timer,
				  jiffies + nbd->xmit_timeout);
		else
			del_timer_sync(&nbd->timeout_timer);

		return 0;

	case NBD_SET_FLAGS:
		nbd->flags = arg;
		return 0;

	case NBD_SET_SIZE_BLOCKS:
		/* arg is a count of blksize-sized blocks */
		nbd->bytesize = ((u64) arg) * nbd->blksize;
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
		return 0;

	case NBD_DO_IT: {
		struct task_struct *thread;
		int error;

		/* task_recv non-NULL means a receiver is already running */
		if (nbd->task_recv)
			return -EBUSY;
		if (!nbd->sock)
			return -EINVAL;

		/* this call blocks for the whole session; drop tx_lock */
		mutex_unlock(&nbd->tx_lock);

		if (nbd->flags & NBD_FLAG_READ_ONLY)
			set_device_ro(bdev, true);
		if (nbd->flags & NBD_FLAG_SEND_TRIM)
			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
				nbd->disk->queue);
		if (nbd->flags & NBD_FLAG_SEND_FLUSH)
			blk_queue_flush(nbd->disk->queue, REQ_FLUSH);
		else
			blk_queue_flush(nbd->disk->queue, 0);

		thread = kthread_run(nbd_thread_send, nbd, "%s",
				     nbd_name(nbd));
		if (IS_ERR(thread)) {
			mutex_lock(&nbd->tx_lock);
			return PTR_ERR(thread);
		}

		nbd_dev_dbg_init(nbd);
		/* receive replies in this (ioctl caller's) context */
		error = nbd_thread_recv(nbd);
		nbd_dev_dbg_close(nbd);
		kthread_stop(thread);

		mutex_lock(&nbd->tx_lock);

		/* session over: tear everything back down to pristine state */
		sock_shutdown(nbd);
		nbd_clear_que(nbd);
		kill_bdev(bdev);
		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
		set_device_ro(bdev, false);
		nbd->flags = 0;
		nbd->bytesize = 0;
		bdev->bd_inode->i_size = 0;
		set_capacity(nbd->disk, 0);
		if (max_part > 0)
			blkdev_reread_part(bdev);
		if (nbd->disconnect) /* user requested, ignore socket errors */
			return 0;
		return error;
	}

	case NBD_CLEAR_QUE:
		/*
		 * This is for compatibility only.  The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
		return 0;

	case NBD_PRINT_DEBUG:
		dev_info(disk_to_dev(nbd->disk),
			"next = %p, prev = %p, head = %p\n",
			nbd->queue_head.next, nbd->queue_head.prev,
			&nbd->queue_head);
		return 0;
	}
	return -ENOTTY;
}

/*
 * block_device_operations ioctl entry point: privilege check, then
 * dispatch to __nbd_ioctl() under tx_lock.
 */
static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *nbd = bdev->bd_disk->private_data;
	int error;

	/* all nbd ioctls reconfigure the device: admin only */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(nbd->magic != NBD_MAGIC);

	mutex_lock(&nbd->tx_lock);
	error = __nbd_ioctl(bdev, nbd, cmd, arg);
	mutex_unlock(&nbd->tx_lock);

	return error;
}

796
/* block device operations; the same handler serves native and compat ioctls */
static const struct block_device_operations nbd_fops =
{
	.owner =	THIS_MODULE,
	.ioctl =	nbd_ioctl,
	.compat_ioctl =	nbd_ioctl,
};

M
Markus Pargmann 已提交
803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864
#if IS_ENABLED(CONFIG_DEBUG_FS)

/* debugfs "tasks": print the pids of the attached recv/send threads. */
static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;

	if (nbd->task_recv)
		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
	if (nbd->task_send)
		seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));

	return 0;
}

static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
	/* i_private carries the struct nbd_device * given to debugfs_create_file */
	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
}

static const struct file_operations nbd_dbg_tasks_ops = {
	.open = nbd_dbg_tasks_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* debugfs "flags": decode the negotiated NBD_FLAG_* bits. */
static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;
	u32 flags = nbd->flags;

	seq_printf(s, "Hex: 0x%08x\n\n", flags);

	seq_puts(s, "Known flags:\n");

	if (flags & NBD_FLAG_HAS_FLAGS)
		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
	if (flags & NBD_FLAG_READ_ONLY)
		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
	if (flags & NBD_FLAG_SEND_FLUSH)
		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
	if (flags & NBD_FLAG_SEND_TRIM)
		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");

	return 0;
}

static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_flags_show, inode->i_private);
}

static const struct file_operations nbd_dbg_flags_ops = {
	.open = nbd_dbg_flags_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	struct dentry *dir;
865 866 867

	if (!nbd_dbg_dir)
		return -EIO;
M
Markus Pargmann 已提交
868 869

	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
870 871 872 873
	if (!dir) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
			nbd_name(nbd));
		return -EIO;
M
Markus Pargmann 已提交
874 875 876
	}
	nbd->dbg_dir = dir;

877 878 879 880 881
	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
	debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
	debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
	debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
	debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops);
M
Markus Pargmann 已提交
882 883 884 885 886 887 888 889 890 891 892 893 894 895

	return 0;
}

/* Remove the per-device debugfs directory and everything under it. */
static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
	debugfs_remove_recursive(nbd->dbg_dir);
}

/* Create the top-level "nbd" debugfs directory at module init. */
static int nbd_dbg_init(void)
{
	struct dentry *dbg_dir;

	dbg_dir = debugfs_create_dir("nbd", NULL);
	if (!dbg_dir)
		return -EIO;

	nbd_dbg_dir = dbg_dir;

	return 0;
}

/* Tear down the top-level "nbd" debugfs directory at module exit. */
static void nbd_dbg_close(void)
{
	debugfs_remove_recursive(nbd_dbg_dir);
}

#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */

/* No-op stubs so callers need no #ifdefs when debugfs is compiled out. */

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

static int nbd_dbg_init(void)
{
	return 0;
}

static void nbd_dbg_close(void)
{
}

#endif

L
Linus Torvalds 已提交
931 932 933 934 935 936 937 938 939
/*
 * And here should be modules and kernel interface 
 *  (Just smiley confuses emacs :-)
 */

/*
 * Module init: validate the max_part/nbds_max parameters, allocate the
 * nbd_dev array, create a gendisk + request queue per device, register
 * the block major, then initialise per-device state and add the disks.
 */
static int __init nbd_init(void)
{
	int err = -ENOMEM;
	int i;
	int part_shift;

	/* the on-wire request header must stay exactly 28 bytes */
	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

	if (max_part < 0) {
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
		return -EINVAL;
	}

	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	/* all (device, partition) minors must fit in the minor space */
	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
	if (!nbd_dev)
		return -ENOMEM;

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = alloc_disk(1 << part_shift);
		if (!disk)
			goto out;
		nbd_dev[i].disk = disk;
		/*
		 * The new linux 2.5 block layer implementation requires
		 * every gendisk to have its very own request_queue struct.
		 * These structs are big so we dynamically allocate them.
		 */
		disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock);
		if (!disk->queue) {
			put_disk(disk);
			goto out;
		}
		/*
		 * Tell the block layer that we are not a rotational device
		 */
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
		disk->queue->limits.discard_granularity = 512;
		blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
		disk->queue->limits.discard_zeroes_data = 0;
		blk_queue_max_hw_sectors(disk->queue, 65536);
		disk->queue->limits.max_sectors = 256;
	}

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		err = -EIO;
		goto out;
	}

	printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);

	nbd_dbg_init();

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		nbd_dev[i].magic = NBD_MAGIC;
		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
		spin_lock_init(&nbd_dev[i].queue_lock);
		spin_lock_init(&nbd_dev[i].sock_lock);
		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
		mutex_init(&nbd_dev[i].tx_lock);
		init_timer(&nbd_dev[i].timeout_timer);
		nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
		nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
		init_waitqueue_head(&nbd_dev[i].active_wq);
		init_waitqueue_head(&nbd_dev[i].waiting_wq);
		nbd_dev[i].blksize = 1024;
		nbd_dev[i].bytesize = 0;
		disk->major = NBD_MAJOR;
		disk->first_minor = i << part_shift;
		disk->fops = &nbd_fops;
		disk->private_data = &nbd_dev[i];
		sprintf(disk->disk_name, "nbd%d", i);
		/* capacity stays 0 until NBD_SET_SIZE* configures the device */
		set_capacity(disk, 0);
		add_disk(disk);
	}

	return 0;
out:
	/* unwind only the disks/queues created before the failure */
	while (i--) {
		blk_cleanup_queue(nbd_dev[i].disk->queue);
		put_disk(nbd_dev[i].disk);
	}
	kfree(nbd_dev);
	return err;
}

/*
 * Module exit: remove debugfs, delete every gendisk and its queue,
 * unregister the major, and free the device array.
 */
static void __exit nbd_cleanup(void)
{
	int i;

	nbd_dbg_close();

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		/* poison the magic so stale users trip the BUG_ON checks */
		nbd_dev[i].magic = 0;
		if (disk) {
			del_gendisk(disk);
			blk_cleanup_queue(disk->queue);
			put_disk(disk);
		}
	}
	unregister_blkdev(NBD_MAJOR, "nbd");
	kfree(nbd_dev);
	printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

1070
module_param(nbds_max, int, 0444);
L
Laurent Vivier 已提交
1071 1072 1073
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");