/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * This file is released under GPLv2 or later.
 *
 * (part of code stolen from loop.c)
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
27
#include <linux/mutex.h>
28 29 30
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
31
#include <linux/slab.h>
L
Linus Torvalds 已提交
32
#include <net/sock.h>
33
#include <linux/net.h>
34
#include <linux/kthread.h>
L
Linus Torvalds 已提交
35 36 37 38 39 40

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>

41
#define NBD_MAGIC 0x68797548
L
Linus Torvalds 已提交
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57

/*
 * Debug tracing.
 *
 * When NDEBUG is defined, dprintk() expands to nothing and neither the
 * DBG_* category bits nor the debugflags variable exist.  Otherwise
 * dprintk(flags, fmt...) prints at KERN_DEBUG level, but only when at least
 * one of the bits in @flags is also set in the debugflags bitmask.
 */
#ifdef NDEBUG
#define dprintk(flags, fmt...)
#else /* NDEBUG */
#define dprintk(flags, fmt...) do { \
	if (debugflags & (flags)) printk(KERN_DEBUG fmt); \
} while (0)
/* Category bits passed as the first argument of dprintk(). */
#define DBG_IOCTL       0x0004
#define DBG_INIT        0x0010
#define DBG_EXIT        0x0020
#define DBG_BLKDEV      0x0100
#define DBG_RX          0x0200
#define DBG_TX          0x0400
/* Bitmask of enabled DBG_* categories; zero (nothing traced) by default. */
static unsigned int debugflags;
#endif /* NDEBUG */

58
/* Number of devices set up by nbd_init(); exposed as a module parameter. */
static unsigned int nbds_max = 16;
/* Array of nbds_max device descriptors, allocated in nbd_init(). */
static struct nbd_device *nbd_dev;
/* Partitions per device; exposed as a module parameter (default 0). */
static int max_part;
L
Linus Torvalds 已提交
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105

/*
 * Use just one lock (or at most 1 per NIC). Two arguments for this:
 * 1. Each NIC is essentially a synchronization point for all servers
 *    accessed through that NIC so there's no need to have more locks
 *    than NICs anyway.
 * 2. More locks lead to more "Dirty cache line bouncing" which will slow
 *    down each lock to the point where they're actually slower than just
 *    a single lock.
 * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
 */
static DEFINE_SPINLOCK(nbd_lock);

#ifndef NDEBUG
static const char *ioctl_cmd_to_ascii(int cmd)
{
	switch (cmd) {
	case NBD_SET_SOCK: return "set-sock";
	case NBD_SET_BLKSIZE: return "set-blksize";
	case NBD_SET_SIZE: return "set-size";
	case NBD_DO_IT: return "do-it";
	case NBD_CLEAR_SOCK: return "clear-sock";
	case NBD_CLEAR_QUE: return "clear-que";
	case NBD_PRINT_DEBUG: return "print-debug";
	case NBD_SET_SIZE_BLOCKS: return "set-size-blocks";
	case NBD_DISCONNECT: return "disconnect";
	case BLKROSET: return "set-read-only";
	case BLKFLSBUF: return "flush-buffer-cache";
	}
	return "unknown";
}

/*
 * Translate an NBD wire command code into a printable name for debug
 * output; anything other than read/write/disconnect yields "invalid".
 */
static const char *nbdcmd_to_ascii(int cmd)
{
	if (cmd == NBD_CMD_READ)
		return "read";
	if (cmd == NBD_CMD_WRITE)
		return "write";
	if (cmd == NBD_CMD_DISC)
		return "disconnect";

	return "invalid";
}
#endif /* NDEBUG */

static void nbd_end_request(struct request *req)
{
106
	int error = req->errors ? -EIO : 0;
107
	struct request_queue *q = req->q;
L
Linus Torvalds 已提交
108 109 110
	unsigned long flags;

	dprintk(DBG_BLKDEV, "%s: request %p: %s\n", req->rq_disk->disk_name,
111
			req, error ? "failed" : "done");
L
Linus Torvalds 已提交
112 113

	spin_lock_irqsave(q->queue_lock, flags);
114
	__blk_end_request_all(req, error);
L
Linus Torvalds 已提交
115 116 117
	spin_unlock_irqrestore(q->queue_lock, flags);
}

118
static void sock_shutdown(struct nbd_device *nbd, int lock)
119 120 121 122 123 124 125 126
{
	/* Forcibly shutdown the socket causing all listeners
	 * to error
	 *
	 * FIXME: This code is duplicated from sys_shutdown, but
	 * there should be a more generic interface rather than
	 * calling socket ops directly here */
	if (lock)
127 128 129 130 131
		mutex_lock(&nbd->tx_lock);
	if (nbd->sock) {
		dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
		nbd->sock = NULL;
132 133
	}
	if (lock)
134
		mutex_unlock(&nbd->tx_lock);
135 136 137 138 139 140 141 142 143 144 145
}

/*
 * Timer callback armed by sock_xmit() around a send.  @arg carries the
 * task_struct pointer of the sending task, which is sent SIGKILL so that a
 * hung transmit gets interrupted.
 */
static void nbd_xmit_timeout(unsigned long arg)
{
	struct task_struct *sender = (struct task_struct *)arg;

	printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
		sender->comm, sender->pid);

	force_sig(SIGKILL, sender);
}

L
Linus Torvalds 已提交
146 147 148
/*
 *  Send or receive packet.
 */
149
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
L
Linus Torvalds 已提交
150 151
		int msg_flags)
{
152
	struct socket *sock = nbd->sock;
L
Linus Torvalds 已提交
153 154 155
	int result;
	struct msghdr msg;
	struct kvec iov;
156
	sigset_t blocked, oldset;
157
	unsigned long pflags = current->flags;
L
Linus Torvalds 已提交
158

159
	if (unlikely(!sock)) {
160
		dev_err(disk_to_dev(nbd->disk),
161 162
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
163 164 165
		return -EINVAL;
	}

L
Linus Torvalds 已提交
166 167
	/* Allow interception of SIGKILL only
	 * Don't allow other signals to interrupt the transmission */
168 169
	siginitsetinv(&blocked, sigmask(SIGKILL));
	sigprocmask(SIG_SETMASK, &blocked, &oldset);
L
Linus Torvalds 已提交
170

171
	current->flags |= PF_MEMALLOC;
L
Linus Torvalds 已提交
172
	do {
173
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
L
Linus Torvalds 已提交
174 175 176 177 178 179 180 181
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

182 183 184
		if (send) {
			struct timer_list ti;

185
			if (nbd->xmit_timeout) {
186 187 188
				init_timer(&ti);
				ti.function = nbd_xmit_timeout;
				ti.data = (unsigned long)current;
189
				ti.expires = jiffies + nbd->xmit_timeout;
190 191
				add_timer(&ti);
			}
L
Linus Torvalds 已提交
192
			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
193
			if (nbd->xmit_timeout)
194 195
				del_timer_sync(&ti);
		} else
196 197
			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
						msg.msg_flags);
L
Linus Torvalds 已提交
198 199 200 201

		if (signal_pending(current)) {
			siginfo_t info;
			printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n",
202
				task_pid_nr(current), current->comm,
203
				dequeue_signal_lock(current, &current->blocked, &info));
L
Linus Torvalds 已提交
204
			result = -EINTR;
205
			sock_shutdown(nbd, !send);
L
Linus Torvalds 已提交
206 207 208 209 210 211 212 213 214 215 216 217
			break;
		}

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		size -= result;
		buf += result;
	} while (size > 0);

218
	sigprocmask(SIG_SETMASK, &oldset, NULL);
219
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
L
Linus Torvalds 已提交
220 221 222 223

	return result;
}

224
static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
L
Linus Torvalds 已提交
225 226 227 228
		int flags)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
229 230
	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
			   bvec->bv_len, flags);
L
Linus Torvalds 已提交
231 232 233 234
	kunmap(bvec->bv_page);
	return result;
}

235
/* always call with the tx_lock held */
236
static int nbd_send_req(struct nbd_device *nbd, struct request *req)
L
Linus Torvalds 已提交
237
{
238
	int result, flags;
L
Linus Torvalds 已提交
239
	struct nbd_request request;
240
	unsigned long size = blk_rq_bytes(req);
L
Linus Torvalds 已提交
241 242 243

	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(nbd_cmd(req));
244
	request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
L
Linus Torvalds 已提交
245 246 247
	request.len = htonl(size);
	memcpy(request.handle, &req, sizeof(req));

248
	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
249
			nbd->disk->disk_name, req,
L
Linus Torvalds 已提交
250
			nbdcmd_to_ascii(nbd_cmd(req)),
251
			(unsigned long long)blk_rq_pos(req) << 9,
252
			blk_rq_bytes(req));
253
	result = sock_xmit(nbd, 1, &request, sizeof(request),
254
			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
L
Linus Torvalds 已提交
255
	if (result <= 0) {
256
		dev_err(disk_to_dev(nbd->disk),
257
			"Send control failed (result %d)\n", result);
L
Linus Torvalds 已提交
258 259 260 261
		goto error_out;
	}

	if (nbd_cmd(req) == NBD_CMD_WRITE) {
262 263
		struct req_iterator iter;
		struct bio_vec *bvec;
L
Linus Torvalds 已提交
264 265 266 267
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
268
		rq_for_each_segment(bvec, req, iter) {
269 270 271 272
			flags = 0;
			if (!rq_iter_last(req, iter))
				flags = MSG_MORE;
			dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
273 274
					nbd->disk->disk_name, req, bvec->bv_len);
			result = sock_send_bvec(nbd, bvec, flags);
275
			if (result <= 0) {
276
				dev_err(disk_to_dev(nbd->disk),
277 278
					"Send data failed (result %d)\n",
					result);
279 280
				goto error_out;
			}
L
Linus Torvalds 已提交
281 282 283 284 285
		}
	}
	return 0;

error_out:
P
Pavel Machek 已提交
286
	return -EIO;
L
Linus Torvalds 已提交
287 288
}

289
static struct request *nbd_find_request(struct nbd_device *nbd,
290
					struct request *xreq)
L
Linus Torvalds 已提交
291
{
292
	struct request *req, *tmp;
293
	int err;
L
Linus Torvalds 已提交
294

295
	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
296 297 298
	if (unlikely(err))
		goto out;

299 300
	spin_lock(&nbd->queue_lock);
	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
L
Linus Torvalds 已提交
301 302 303
		if (req != xreq)
			continue;
		list_del_init(&req->queuelist);
304
		spin_unlock(&nbd->queue_lock);
L
Linus Torvalds 已提交
305 306
		return req;
	}
307
	spin_unlock(&nbd->queue_lock);
308 309 310 311 312

	err = -ENOENT;

out:
	return ERR_PTR(err);
L
Linus Torvalds 已提交
313 314
}

315
static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
L
Linus Torvalds 已提交
316 317 318
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
319
	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
L
Linus Torvalds 已提交
320 321 322 323 324 325
			MSG_WAITALL);
	kunmap(bvec->bv_page);
	return result;
}

/* NULL returned = something went wrong, inform userspace */
326
static struct request *nbd_read_stat(struct nbd_device *nbd)
L
Linus Torvalds 已提交
327 328 329 330 331 332
{
	int result;
	struct nbd_reply reply;
	struct request *req;

	reply.magic = 0;
333
	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
L
Linus Torvalds 已提交
334
	if (result <= 0) {
335
		dev_err(disk_to_dev(nbd->disk),
336
			"Receive control failed (result %d)\n", result);
L
Linus Torvalds 已提交
337 338
		goto harderror;
	}
339 340

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
341
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
342 343 344 345 346
				(unsigned long)ntohl(reply.magic));
		result = -EPROTO;
		goto harderror;
	}

347
	req = nbd_find_request(nbd, *(struct request **)reply.handle);
348
	if (IS_ERR(req)) {
349 350 351 352
		result = PTR_ERR(req);
		if (result != -ENOENT)
			goto harderror;

353
		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
354
			reply.handle);
L
Linus Torvalds 已提交
355 356 357 358 359
		result = -EBADR;
		goto harderror;
	}

	if (ntohl(reply.error)) {
360
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
361
			ntohl(reply.error));
L
Linus Torvalds 已提交
362 363 364 365 366
		req->errors++;
		return req;
	}

	dprintk(DBG_RX, "%s: request %p: got reply\n",
367
			nbd->disk->disk_name, req);
L
Linus Torvalds 已提交
368
	if (nbd_cmd(req) == NBD_CMD_READ) {
369 370 371 372
		struct req_iterator iter;
		struct bio_vec *bvec;

		rq_for_each_segment(bvec, req, iter) {
373
			result = sock_recv_bvec(nbd, bvec);
374
			if (result <= 0) {
375
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
376
					result);
377 378 379 380
				req->errors++;
				return req;
			}
			dprintk(DBG_RX, "%s: request %p: got %d bytes data\n",
381
				nbd->disk->disk_name, req, bvec->bv_len);
L
Linus Torvalds 已提交
382 383 384 385
		}
	}
	return req;
harderror:
386
	nbd->harderror = result;
L
Linus Torvalds 已提交
387 388 389
	return NULL;
}

390 391
static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
392
{
393 394 395
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%ld\n",
396 397 398
		(long) ((struct nbd_device *)disk->private_data)->pid);
}

399
static struct device_attribute pid_attr = {
400
	.attr = { .name = "pid", .mode = S_IRUGO},
401 402 403
	.show = pid_show,
};

404
static int nbd_do_it(struct nbd_device *nbd)
L
Linus Torvalds 已提交
405 406
{
	struct request *req;
407
	int ret;
L
Linus Torvalds 已提交
408

409
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
410

411
	sk_set_memalloc(nbd->sock->sk);
412 413
	nbd->pid = task_pid_nr(current);
	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
414
	if (ret) {
415 416
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
		nbd->pid = 0;
417 418
		return ret;
	}
419

420
	while ((req = nbd_read_stat(nbd)) != NULL)
L
Linus Torvalds 已提交
421
		nbd_end_request(req);
422

423 424
	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
	nbd->pid = 0;
425
	return 0;
L
Linus Torvalds 已提交
426 427
}

428
static void nbd_clear_que(struct nbd_device *nbd)
L
Linus Torvalds 已提交
429 430 431
{
	struct request *req;

432
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
433

434
	/*
435
	 * Because we have set nbd->sock to NULL under the tx_lock, all
436 437 438 439 440 441
	 * modifications to the list must have completed by now.  For
	 * the same reason, the active_req must be NULL.
	 *
	 * As a consequence, we don't need to take the spin lock while
	 * purging the list here.
	 */
442 443
	BUG_ON(nbd->sock);
	BUG_ON(nbd->active_req);
444

445 446
	while (!list_empty(&nbd->queue_head)) {
		req = list_entry(nbd->queue_head.next, struct request,
447 448 449 450 451
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
		nbd_end_request(req);
	}
452 453 454 455 456 457 458 459

	while (!list_empty(&nbd->waiting_queue)) {
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
		nbd_end_request(req);
	}
L
Linus Torvalds 已提交
460 461
}

462

463
static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
464
{
465
	if (req->cmd_type != REQ_TYPE_FS)
466 467 468 469 470
		goto error_out;

	nbd_cmd(req) = NBD_CMD_READ;
	if (rq_data_dir(req) == WRITE) {
		nbd_cmd(req) = NBD_CMD_WRITE;
471 472
		if (nbd->flags & NBD_READ_ONLY) {
			dev_err(disk_to_dev(nbd->disk),
473
				"Write on read-only\n");
474 475 476 477 478 479
			goto error_out;
		}
	}

	req->errors = 0;

480 481 482 483
	mutex_lock(&nbd->tx_lock);
	if (unlikely(!nbd->sock)) {
		mutex_unlock(&nbd->tx_lock);
		dev_err(disk_to_dev(nbd->disk),
484
			"Attempted send on closed socket\n");
P
Pavel Machek 已提交
485
		goto error_out;
486 487
	}

488
	nbd->active_req = req;
489

490 491
	if (nbd_send_req(nbd, req) != 0) {
		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
492 493 494
		req->errors++;
		nbd_end_request(req);
	} else {
495
		spin_lock(&nbd->queue_lock);
496
		list_add_tail(&req->queuelist, &nbd->queue_head);
497
		spin_unlock(&nbd->queue_lock);
498 499
	}

500 501 502
	nbd->active_req = NULL;
	mutex_unlock(&nbd->tx_lock);
	wake_up_all(&nbd->active_wq);
503 504 505 506 507 508 509 510 511 512

	return;

error_out:
	req->errors++;
	nbd_end_request(req);
}

static int nbd_thread(void *data)
{
513
	struct nbd_device *nbd = data;
514 515 516
	struct request *req;

	set_user_nice(current, -20);
517
	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
518
		/* wait for something to do */
519
		wait_event_interruptible(nbd->waiting_wq,
520
					 kthread_should_stop() ||
521
					 !list_empty(&nbd->waiting_queue));
522 523

		/* extract request */
524
		if (list_empty(&nbd->waiting_queue))
525 526
			continue;

527 528
		spin_lock_irq(&nbd->queue_lock);
		req = list_entry(nbd->waiting_queue.next, struct request,
529 530
				 queuelist);
		list_del_init(&req->queuelist);
531
		spin_unlock_irq(&nbd->queue_lock);
532 533

		/* handle request */
534
		nbd_handle_req(nbd, req);
535 536 537 538
	}
	return 0;
}

L
Linus Torvalds 已提交
539 540 541
/*
 * We always wait for result of write, for now. It would be nice to make it optional
 * in future
542
 * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
L
Linus Torvalds 已提交
543 544 545
 *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
 */

P
Pavel Machek 已提交
546
static void do_nbd_request(struct request_queue *q)
L
Linus Torvalds 已提交
547 548 549
{
	struct request *req;
	
550
	while ((req = blk_fetch_request(q)) != NULL) {
551
		struct nbd_device *nbd;
L
Linus Torvalds 已提交
552

553 554
		spin_unlock_irq(q->queue_lock);

555 556
		dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
				req->rq_disk->disk_name, req, req->cmd_type);
L
Linus Torvalds 已提交
557

558
		nbd = req->rq_disk->private_data;
L
Linus Torvalds 已提交
559

560
		BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
561

562 563
		if (unlikely(!nbd->sock)) {
			dev_err(disk_to_dev(nbd->disk),
564
				"Attempted send on closed socket\n");
565 566 567 568 569 570
			req->errors++;
			nbd_end_request(req);
			spin_lock_irq(q->queue_lock);
			continue;
		}

571 572 573
		spin_lock_irq(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->waiting_queue);
		spin_unlock_irq(&nbd->queue_lock);
L
Linus Torvalds 已提交
574

575
		wake_up(&nbd->waiting_wq);
576

L
Linus Torvalds 已提交
577 578 579 580
		spin_lock_irq(q->queue_lock);
	}
}

P
Pavel Machek 已提交
581
/* Must be called with tx_lock held */
L
Linus Torvalds 已提交
582

583
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
P
Pavel Machek 已提交
584 585
		       unsigned int cmd, unsigned long arg)
{
L
Linus Torvalds 已提交
586
	switch (cmd) {
P
Pavel Machek 已提交
587 588 589
	case NBD_DISCONNECT: {
		struct request sreq;

590
		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
P
Pavel Machek 已提交
591

592
		blk_rq_init(NULL, &sreq);
593
		sreq.cmd_type = REQ_TYPE_SPECIAL;
L
Linus Torvalds 已提交
594
		nbd_cmd(&sreq) = NBD_CMD_DISC;
595
		if (!nbd->sock)
L
Linus Torvalds 已提交
596
			return -EINVAL;
597
		nbd_send_req(nbd, &sreq);
L
Linus Torvalds 已提交
598
                return 0;
P
Pavel Machek 已提交
599
	}
L
Linus Torvalds 已提交
600
 
P
Pavel Machek 已提交
601 602 603
	case NBD_CLEAR_SOCK: {
		struct file *file;

604 605 606 607 608
		nbd->sock = NULL;
		file = nbd->file;
		nbd->file = NULL;
		nbd_clear_que(nbd);
		BUG_ON(!list_empty(&nbd->queue_head));
609
		BUG_ON(!list_empty(&nbd->waiting_queue));
L
Linus Torvalds 已提交
610 611
		if (file)
			fput(file);
P
Pavel Machek 已提交
612 613 614 615 616
		return 0;
	}

	case NBD_SET_SOCK: {
		struct file *file;
617
		if (nbd->file)
L
Linus Torvalds 已提交
618 619 620
			return -EBUSY;
		file = fget(arg);
		if (file) {
A
Al Viro 已提交
621
			struct inode *inode = file->f_path.dentry->d_inode;
L
Linus Torvalds 已提交
622
			if (S_ISSOCK(inode->i_mode)) {
623 624
				nbd->file = file;
				nbd->sock = SOCKET_I(inode);
L
Laurent Vivier 已提交
625 626
				if (max_part > 0)
					bdev->bd_invalidated = 1;
P
Pavel Machek 已提交
627
				return 0;
L
Linus Torvalds 已提交
628 629 630 631
			} else {
				fput(file);
			}
		}
P
Pavel Machek 已提交
632 633 634
		return -EINVAL;
	}

L
Linus Torvalds 已提交
635
	case NBD_SET_BLKSIZE:
636 637 638 639 640
		nbd->blksize = arg;
		nbd->bytesize &= ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
641
		return 0;
P
Pavel Machek 已提交
642

L
Linus Torvalds 已提交
643
	case NBD_SET_SIZE:
644 645 646 647
		nbd->bytesize = arg & ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
648
		return 0;
P
Pavel Machek 已提交
649

650
	case NBD_SET_TIMEOUT:
651
		nbd->xmit_timeout = arg * HZ;
652
		return 0;
P
Pavel Machek 已提交
653

L
Linus Torvalds 已提交
654
	case NBD_SET_SIZE_BLOCKS:
655 656 657 658
		nbd->bytesize = ((u64) arg) * nbd->blksize;
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
659
		return 0;
P
Pavel Machek 已提交
660 661 662 663 664 665

	case NBD_DO_IT: {
		struct task_struct *thread;
		struct file *file;
		int error;

666
		if (nbd->pid)
667
			return -EBUSY;
668
		if (!nbd->file)
L
Linus Torvalds 已提交
669
			return -EINVAL;
P
Pavel Machek 已提交
670

671
		mutex_unlock(&nbd->tx_lock);
P
Pavel Machek 已提交
672

673
		thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
P
Pavel Machek 已提交
674
		if (IS_ERR(thread)) {
675
			mutex_lock(&nbd->tx_lock);
676
			return PTR_ERR(thread);
P
Pavel Machek 已提交
677
		}
678
		wake_up_process(thread);
679
		error = nbd_do_it(nbd);
680
		kthread_stop(thread);
P
Pavel Machek 已提交
681

682
		mutex_lock(&nbd->tx_lock);
683 684
		if (error)
			return error;
685 686 687 688 689
		sock_shutdown(nbd, 0);
		file = nbd->file;
		nbd->file = NULL;
		nbd_clear_que(nbd);
		dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
L
Linus Torvalds 已提交
690 691
		if (file)
			fput(file);
692
		nbd->bytesize = 0;
A
Al Viro 已提交
693
		bdev->bd_inode->i_size = 0;
694
		set_capacity(nbd->disk, 0);
L
Laurent Vivier 已提交
695
		if (max_part > 0)
A
Al Viro 已提交
696
			ioctl_by_bdev(bdev, BLKRRPART, 0);
697
		return nbd->harderror;
P
Pavel Machek 已提交
698 699
	}

L
Linus Torvalds 已提交
700
	case NBD_CLEAR_QUE:
701 702 703 704
		/*
		 * This is for compatibility only.  The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
705
		BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head));
L
Linus Torvalds 已提交
706
		return 0;
P
Pavel Machek 已提交
707

L
Linus Torvalds 已提交
708
	case NBD_PRINT_DEBUG:
709
		dev_info(disk_to_dev(nbd->disk),
710
			"next = %p, prev = %p, head = %p\n",
711 712
			nbd->queue_head.next, nbd->queue_head.prev,
			&nbd->queue_head);
L
Linus Torvalds 已提交
713 714
		return 0;
	}
P
Pavel Machek 已提交
715 716 717 718 719 720
	return -ENOTTY;
}

/*
 * ioctl entry point of the block device.  Requires CAP_SYS_ADMIN (-EPERM
 * otherwise), then runs the command through __nbd_ioctl() with the device's
 * tx_lock held and returns its result.
 */
static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *nbd;
	int ret;

	nbd = bdev->bd_disk->private_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(nbd->magic != NBD_MAGIC);

	/* Anyone capable of this syscall can do *real bad* things */
	dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
		nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);

	mutex_lock(&nbd->tx_lock);
	ret = __nbd_ioctl(bdev, nbd, cmd, arg);
	mutex_unlock(&nbd->tx_lock);

	return ret;
}

740
static const struct block_device_operations nbd_fops =
L
Linus Torvalds 已提交
741 742
{
	.owner =	THIS_MODULE,
743
	.ioctl =	nbd_ioctl,
L
Linus Torvalds 已提交
744 745 746 747 748 749 750 751 752 753 754
};

/*
 * And here should be modules and kernel interface 
 *  (Just smiley confuses emacs :-)
 */

/*
 * Module initialisation: validate the module parameters, allocate nbds_max
 * device descriptors with one gendisk and request queue each, register the
 * block major and add the disks.
 *
 * Returns 0 on success, -EINVAL for unusable max_part/nbds_max values,
 * -ENOMEM on allocation failure and -EIO if the major cannot be registered.
 */
static int __init nbd_init(void)
{
	int err = -ENOMEM;
	int i;
	int part_shift;

	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

	if (max_part < 0) {
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
		return -EINVAL;
	}

	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	/*
	 * Validate the parameters before allocating anything, so that the
	 * early -EINVAL returns cannot leak the device array.
	 */
	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
	if (!nbd_dev)
		return -ENOMEM;

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = alloc_disk(1 << part_shift);
		if (!disk)
			goto out;
		nbd_dev[i].disk = disk;
		/*
		 * The new linux 2.5 block layer implementation requires
		 * every gendisk to have its very own request_queue struct.
		 * These structs are big so we dynamically allocate them.
		 */
		disk->queue = blk_init_queue(do_nbd_request, &nbd_lock);
		if (!disk->queue) {
			put_disk(disk);
			goto out;
		}
		/*
		 * Tell the block layer that we are not a rotational device
		 */
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
	}

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		err = -EIO;
		goto out;
	}

	printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);
	dprintk(DBG_INIT, "nbd: debugflags=0x%x\n", debugflags);

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		nbd_dev[i].file = NULL;
		nbd_dev[i].magic = NBD_MAGIC;
		nbd_dev[i].flags = 0;
		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
		spin_lock_init(&nbd_dev[i].queue_lock);
		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
		mutex_init(&nbd_dev[i].tx_lock);
		init_waitqueue_head(&nbd_dev[i].active_wq);
		init_waitqueue_head(&nbd_dev[i].waiting_wq);
		nbd_dev[i].blksize = 1024;
		nbd_dev[i].bytesize = 0;
		disk->major = NBD_MAJOR;
		disk->first_minor = i << part_shift;
		disk->fops = &nbd_fops;
		disk->private_data = &nbd_dev[i];
		sprintf(disk->disk_name, "nbd%d", i);
		set_capacity(disk, 0);
		add_disk(disk);
	}

	return 0;
out:
	/* Undo only the devices that were fully set up (indices below i). */
	while (i--) {
		blk_cleanup_queue(nbd_dev[i].disk->queue);
		put_disk(nbd_dev[i].disk);
	}
	kfree(nbd_dev);
	return err;
}

/*
 * Module exit: invalidate every device's magic, remove and release its
 * gendisk and request queue, unregister the block major and free the device
 * array.
 */
static void __exit nbd_cleanup(void)
{
	int i;

	for (i = 0; i < nbds_max; i++) {
		struct nbd_device *nbd = &nbd_dev[i];
		struct gendisk *disk = nbd->disk;

		nbd->magic = 0;
		if (!disk)
			continue;

		del_gendisk(disk);
		blk_cleanup_queue(disk->queue);
		put_disk(disk);
	}

	unregister_blkdev(NBD_MAJOR, "nbd");
	kfree(nbd_dev);
	printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

873
module_param(nbds_max, int, 0444);
L
Laurent Vivier 已提交
874 875 876
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");
L
Linus Torvalds 已提交
877 878 879 880
#ifndef NDEBUG
module_param(debugflags, int, 0644);
MODULE_PARM_DESC(debugflags, "flags for controlling debug output");
#endif