nbd.c 20.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6
/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 * 
P
Pavel Machek 已提交
7
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
L
Linus Torvalds 已提交
8 9
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
10
 * This file is released under GPLv2 or later.
L
Linus Torvalds 已提交
11
 *
12
 * (part of code stolen from loop.c)
L
Linus Torvalds 已提交
13 14 15 16 17 18 19 20 21 22 23 24 25 26
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
27
#include <linux/mutex.h>
28 29 30
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
31
#include <linux/slab.h>
L
Linus Torvalds 已提交
32
#include <net/sock.h>
33
#include <linux/net.h>
34
#include <linux/kthread.h>
L
Linus Torvalds 已提交
35 36 37 38 39 40

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>

41
#define NBD_MAGIC 0x68797548
L
Linus Torvalds 已提交
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57

/*
 * Debug output helper.  With NDEBUG defined dprintk() expands to nothing;
 * otherwise it emits a KERN_DEBUG message when at least one bit of 'flags'
 * is also set in debugflags.
 */
#ifdef NDEBUG
#define dprintk(flags, fmt...)
#else /* NDEBUG */
#define dprintk(flags, fmt...) do { \
	if (debugflags & (flags)) printk(KERN_DEBUG fmt); \
} while (0)
/* Message categories; each is a bit tested against debugflags by dprintk() */
#define DBG_IOCTL       0x0004
#define DBG_INIT        0x0010
#define DBG_EXIT        0x0020
#define DBG_BLKDEV      0x0100
#define DBG_RX          0x0200
#define DBG_TX          0x0400
/* Bitmask of DBG_* categories that are currently printed */
static unsigned int debugflags;
#endif /* NDEBUG */

58
/* Requested partitions per device; validated and rounded up in nbd_init() */
static int max_part;
/* Array of nbds_max device structures, allocated in nbd_init() */
static struct nbd_device *nbd_dev;
/* Number of nbd devices created at module load */
static unsigned int nbds_max = 16;
L
Linus Torvalds 已提交
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105

/*
 * Use just one lock (or at most 1 per NIC). Two arguments for this:
 * 1. Each NIC is essentially a synchronization point for all servers
 *    accessed through that NIC so there's no need to have more locks
 *    than NICs anyway.
 * 2. More locks lead to more "Dirty cache line bouncing" which will slow
 *    down each lock to the point where they're actually slower than just
 *    a single lock.
 * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
 *
 * nbd_lock is the lock handed to blk_init_queue() for every disk created in
 * nbd_init(), i.e. all nbd request queues share it.
 */
static DEFINE_SPINLOCK(nbd_lock);

#ifndef NDEBUG
/*
 * Map an ioctl command number to a short human-readable name for debug
 * messages.  Commands not listed here yield "unknown".
 */
static const char *ioctl_cmd_to_ascii(int cmd)
{
	static const struct {
		int nr;
		const char *label;
	} known[] = {
		{ NBD_SET_SOCK,		"set-sock" },
		{ NBD_SET_BLKSIZE,	"set-blksize" },
		{ NBD_SET_SIZE,		"set-size" },
		{ NBD_DO_IT,		"do-it" },
		{ NBD_CLEAR_SOCK,	"clear-sock" },
		{ NBD_CLEAR_QUE,	"clear-que" },
		{ NBD_PRINT_DEBUG,	"print-debug" },
		{ NBD_SET_SIZE_BLOCKS,	"set-size-blocks" },
		{ NBD_DISCONNECT,	"disconnect" },
		{ BLKROSET,		"set-read-only" },
		{ BLKFLSBUF,		"flush-buffer-cache" },
	};
	unsigned int idx;

	for (idx = 0; idx < ARRAY_SIZE(known); idx++) {
		if (known[idx].nr == cmd)
			return known[idx].label;
	}
	return "unknown";
}

/*
 * Return a printable name for an NBD wire command, or "invalid" when the
 * value is none of read/write/disconnect.
 */
static const char *nbdcmd_to_ascii(int cmd)
{
	if (cmd == NBD_CMD_READ)
		return "read";
	if (cmd == NBD_CMD_WRITE)
		return "write";
	if (cmd == NBD_CMD_DISC)
		return "disconnect";
	return "invalid";
}
#endif /* NDEBUG */

static void nbd_end_request(struct request *req)
{
106
	int error = req->errors ? -EIO : 0;
107
	struct request_queue *q = req->q;
L
Linus Torvalds 已提交
108 109 110
	unsigned long flags;

	dprintk(DBG_BLKDEV, "%s: request %p: %s\n", req->rq_disk->disk_name,
111
			req, error ? "failed" : "done");
L
Linus Torvalds 已提交
112 113

	spin_lock_irqsave(q->queue_lock, flags);
114
	__blk_end_request_all(req, error);
L
Linus Torvalds 已提交
115 116 117
	spin_unlock_irqrestore(q->queue_lock, flags);
}

118
static void sock_shutdown(struct nbd_device *nbd, int lock)
119 120 121 122 123 124 125 126
{
	/* Forcibly shutdown the socket causing all listeners
	 * to error
	 *
	 * FIXME: This code is duplicated from sys_shutdown, but
	 * there should be a more generic interface rather than
	 * calling socket ops directly here */
	if (lock)
127 128 129 130 131
		mutex_lock(&nbd->tx_lock);
	if (nbd->sock) {
		dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
		nbd->sock = NULL;
132 133
	}
	if (lock)
134
		mutex_unlock(&nbd->tx_lock);
135 136 137 138 139 140 141 142 143 144 145
}

/*
 * Timer callback armed around a send in sock_xmit().  @arg carries the
 * task_struct pointer of the sender; that task is sent SIGKILL.
 */
static void nbd_xmit_timeout(unsigned long arg)
{
	struct task_struct *hung = (struct task_struct *)arg;

	printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
		hung->comm, hung->pid);
	force_sig(SIGKILL, hung);
}

L
Linus Torvalds 已提交
146 147 148
/*
 *  Send or receive packet.
 */
149
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
L
Linus Torvalds 已提交
150 151
		int msg_flags)
{
152
	struct socket *sock = nbd->sock;
L
Linus Torvalds 已提交
153 154 155
	int result;
	struct msghdr msg;
	struct kvec iov;
156
	sigset_t blocked, oldset;
157
	unsigned long pflags = current->flags;
L
Linus Torvalds 已提交
158

159
	if (unlikely(!sock)) {
160
		dev_err(disk_to_dev(nbd->disk),
161 162
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
163 164 165
		return -EINVAL;
	}

L
Linus Torvalds 已提交
166 167
	/* Allow interception of SIGKILL only
	 * Don't allow other signals to interrupt the transmission */
168 169
	siginitsetinv(&blocked, sigmask(SIGKILL));
	sigprocmask(SIG_SETMASK, &blocked, &oldset);
L
Linus Torvalds 已提交
170

171
	current->flags |= PF_MEMALLOC;
L
Linus Torvalds 已提交
172
	do {
173
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
L
Linus Torvalds 已提交
174 175 176 177 178 179 180 181
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

182 183 184
		if (send) {
			struct timer_list ti;

185
			if (nbd->xmit_timeout) {
186 187 188
				init_timer(&ti);
				ti.function = nbd_xmit_timeout;
				ti.data = (unsigned long)current;
189
				ti.expires = jiffies + nbd->xmit_timeout;
190 191
				add_timer(&ti);
			}
L
Linus Torvalds 已提交
192
			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
193
			if (nbd->xmit_timeout)
194 195
				del_timer_sync(&ti);
		} else
196 197
			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
						msg.msg_flags);
L
Linus Torvalds 已提交
198 199 200 201

		if (signal_pending(current)) {
			siginfo_t info;
			printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n",
202
				task_pid_nr(current), current->comm,
203
				dequeue_signal_lock(current, &current->blocked, &info));
L
Linus Torvalds 已提交
204
			result = -EINTR;
205
			sock_shutdown(nbd, !send);
L
Linus Torvalds 已提交
206 207 208 209 210 211 212 213 214 215 216 217
			break;
		}

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		size -= result;
		buf += result;
	} while (size > 0);

218
	sigprocmask(SIG_SETMASK, &oldset, NULL);
219
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
L
Linus Torvalds 已提交
220 221 222 223

	return result;
}

224
static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
L
Linus Torvalds 已提交
225 226 227 228
		int flags)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
229 230
	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
			   bvec->bv_len, flags);
L
Linus Torvalds 已提交
231 232 233 234
	kunmap(bvec->bv_page);
	return result;
}

235
/* always call with the tx_lock held */
236
static int nbd_send_req(struct nbd_device *nbd, struct request *req)
L
Linus Torvalds 已提交
237
{
238
	int result, flags;
L
Linus Torvalds 已提交
239
	struct nbd_request request;
240
	unsigned long size = blk_rq_bytes(req);
L
Linus Torvalds 已提交
241 242 243

	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(nbd_cmd(req));
244
	request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
L
Linus Torvalds 已提交
245 246 247
	request.len = htonl(size);
	memcpy(request.handle, &req, sizeof(req));

248
	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
249
			nbd->disk->disk_name, req,
L
Linus Torvalds 已提交
250
			nbdcmd_to_ascii(nbd_cmd(req)),
251
			(unsigned long long)blk_rq_pos(req) << 9,
252
			blk_rq_bytes(req));
253
	result = sock_xmit(nbd, 1, &request, sizeof(request),
254
			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
L
Linus Torvalds 已提交
255
	if (result <= 0) {
256
		dev_err(disk_to_dev(nbd->disk),
257
			"Send control failed (result %d)\n", result);
L
Linus Torvalds 已提交
258 259 260 261
		goto error_out;
	}

	if (nbd_cmd(req) == NBD_CMD_WRITE) {
262 263
		struct req_iterator iter;
		struct bio_vec *bvec;
L
Linus Torvalds 已提交
264 265 266 267
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
268
		rq_for_each_segment(bvec, req, iter) {
269 270 271 272
			flags = 0;
			if (!rq_iter_last(req, iter))
				flags = MSG_MORE;
			dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
273 274
					nbd->disk->disk_name, req, bvec->bv_len);
			result = sock_send_bvec(nbd, bvec, flags);
275
			if (result <= 0) {
276
				dev_err(disk_to_dev(nbd->disk),
277 278
					"Send data failed (result %d)\n",
					result);
279 280
				goto error_out;
			}
L
Linus Torvalds 已提交
281 282 283 284 285
		}
	}
	return 0;

error_out:
P
Pavel Machek 已提交
286
	return -EIO;
L
Linus Torvalds 已提交
287 288
}

289
static struct request *nbd_find_request(struct nbd_device *nbd,
290
					struct request *xreq)
L
Linus Torvalds 已提交
291
{
292
	struct request *req, *tmp;
293
	int err;
L
Linus Torvalds 已提交
294

295
	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
296 297 298
	if (unlikely(err))
		goto out;

299 300
	spin_lock(&nbd->queue_lock);
	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
L
Linus Torvalds 已提交
301 302 303
		if (req != xreq)
			continue;
		list_del_init(&req->queuelist);
304
		spin_unlock(&nbd->queue_lock);
L
Linus Torvalds 已提交
305 306
		return req;
	}
307
	spin_unlock(&nbd->queue_lock);
308 309 310 311 312

	err = -ENOENT;

out:
	return ERR_PTR(err);
L
Linus Torvalds 已提交
313 314
}

315
static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
L
Linus Torvalds 已提交
316 317 318
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
319
	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
L
Linus Torvalds 已提交
320 321 322 323 324 325
			MSG_WAITALL);
	kunmap(bvec->bv_page);
	return result;
}

/* NULL returned = something went wrong, inform userspace */
326
static struct request *nbd_read_stat(struct nbd_device *nbd)
L
Linus Torvalds 已提交
327 328 329 330 331 332
{
	int result;
	struct nbd_reply reply;
	struct request *req;

	reply.magic = 0;
333
	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
L
Linus Torvalds 已提交
334
	if (result <= 0) {
335
		dev_err(disk_to_dev(nbd->disk),
336
			"Receive control failed (result %d)\n", result);
L
Linus Torvalds 已提交
337 338
		goto harderror;
	}
339 340

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
341
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
342 343 344 345 346
				(unsigned long)ntohl(reply.magic));
		result = -EPROTO;
		goto harderror;
	}

347
	req = nbd_find_request(nbd, *(struct request **)reply.handle);
348
	if (IS_ERR(req)) {
349 350 351 352
		result = PTR_ERR(req);
		if (result != -ENOENT)
			goto harderror;

353
		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
354
			reply.handle);
L
Linus Torvalds 已提交
355 356 357 358 359
		result = -EBADR;
		goto harderror;
	}

	if (ntohl(reply.error)) {
360
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
361
			ntohl(reply.error));
L
Linus Torvalds 已提交
362 363 364 365 366
		req->errors++;
		return req;
	}

	dprintk(DBG_RX, "%s: request %p: got reply\n",
367
			nbd->disk->disk_name, req);
L
Linus Torvalds 已提交
368
	if (nbd_cmd(req) == NBD_CMD_READ) {
369 370 371 372
		struct req_iterator iter;
		struct bio_vec *bvec;

		rq_for_each_segment(bvec, req, iter) {
373
			result = sock_recv_bvec(nbd, bvec);
374
			if (result <= 0) {
375
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
376
					result);
377 378 379 380
				req->errors++;
				return req;
			}
			dprintk(DBG_RX, "%s: request %p: got %d bytes data\n",
381
				nbd->disk->disk_name, req, bvec->bv_len);
L
Linus Torvalds 已提交
382 383 384 385
		}
	}
	return req;
harderror:
386
	nbd->harderror = result;
L
Linus Torvalds 已提交
387 388 389
	return NULL;
}

390 391
static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
392
{
393 394 395
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%ld\n",
396 397 398
		(long) ((struct nbd_device *)disk->private_data)->pid);
}

399
static struct device_attribute pid_attr = {
400
	.attr = { .name = "pid", .mode = S_IRUGO},
401 402 403
	.show = pid_show,
};

404
static int nbd_do_it(struct nbd_device *nbd)
L
Linus Torvalds 已提交
405 406
{
	struct request *req;
407
	int ret;
L
Linus Torvalds 已提交
408

409
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
410

411
	sk_set_memalloc(nbd->sock->sk);
412 413
	nbd->pid = task_pid_nr(current);
	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
414
	if (ret) {
415 416
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
		nbd->pid = 0;
417 418
		return ret;
	}
419

420
	while ((req = nbd_read_stat(nbd)) != NULL)
L
Linus Torvalds 已提交
421
		nbd_end_request(req);
422

423 424
	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
	nbd->pid = 0;
425
	return 0;
L
Linus Torvalds 已提交
426 427
}

428
static void nbd_clear_que(struct nbd_device *nbd)
L
Linus Torvalds 已提交
429 430 431
{
	struct request *req;

432
	BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
433

434
	/*
435
	 * Because we have set nbd->sock to NULL under the tx_lock, all
436 437 438 439 440 441
	 * modifications to the list must have completed by now.  For
	 * the same reason, the active_req must be NULL.
	 *
	 * As a consequence, we don't need to take the spin lock while
	 * purging the list here.
	 */
442 443
	BUG_ON(nbd->sock);
	BUG_ON(nbd->active_req);
444

445 446
	while (!list_empty(&nbd->queue_head)) {
		req = list_entry(nbd->queue_head.next, struct request,
447 448 449 450 451
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
		nbd_end_request(req);
	}
L
Linus Torvalds 已提交
452 453
}

454

455
static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
456
{
457
	if (req->cmd_type != REQ_TYPE_FS)
458 459 460 461 462
		goto error_out;

	nbd_cmd(req) = NBD_CMD_READ;
	if (rq_data_dir(req) == WRITE) {
		nbd_cmd(req) = NBD_CMD_WRITE;
463 464
		if (nbd->flags & NBD_READ_ONLY) {
			dev_err(disk_to_dev(nbd->disk),
465
				"Write on read-only\n");
466 467 468 469 470 471
			goto error_out;
		}
	}

	req->errors = 0;

472 473 474 475
	mutex_lock(&nbd->tx_lock);
	if (unlikely(!nbd->sock)) {
		mutex_unlock(&nbd->tx_lock);
		dev_err(disk_to_dev(nbd->disk),
476
			"Attempted send on closed socket\n");
P
Pavel Machek 已提交
477
		goto error_out;
478 479
	}

480
	nbd->active_req = req;
481

482 483
	if (nbd_send_req(nbd, req) != 0) {
		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
484 485 486
		req->errors++;
		nbd_end_request(req);
	} else {
487
		spin_lock(&nbd->queue_lock);
488
		list_add_tail(&req->queuelist, &nbd->queue_head);
489
		spin_unlock(&nbd->queue_lock);
490 491
	}

492 493 494
	nbd->active_req = NULL;
	mutex_unlock(&nbd->tx_lock);
	wake_up_all(&nbd->active_wq);
495 496 497 498 499 500 501 502 503 504

	return;

error_out:
	req->errors++;
	nbd_end_request(req);
}

static int nbd_thread(void *data)
{
505
	struct nbd_device *nbd = data;
506 507 508
	struct request *req;

	set_user_nice(current, -20);
509
	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
510
		/* wait for something to do */
511
		wait_event_interruptible(nbd->waiting_wq,
512
					 kthread_should_stop() ||
513
					 !list_empty(&nbd->waiting_queue));
514 515

		/* extract request */
516
		if (list_empty(&nbd->waiting_queue))
517 518
			continue;

519 520
		spin_lock_irq(&nbd->queue_lock);
		req = list_entry(nbd->waiting_queue.next, struct request,
521 522
				 queuelist);
		list_del_init(&req->queuelist);
523
		spin_unlock_irq(&nbd->queue_lock);
524 525

		/* handle request */
526
		nbd_handle_req(nbd, req);
527 528 529 530
	}
	return 0;
}

L
Linus Torvalds 已提交
531 532 533
/*
 * For now we always wait for the result of a write. It would be nice to make
 * this optional in the future, e.g.:
 * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
 *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
 */

P
Pavel Machek 已提交
538
static void do_nbd_request(struct request_queue *q)
L
Linus Torvalds 已提交
539 540 541
{
	struct request *req;
	
542
	while ((req = blk_fetch_request(q)) != NULL) {
543
		struct nbd_device *nbd;
L
Linus Torvalds 已提交
544

545 546
		spin_unlock_irq(q->queue_lock);

547 548
		dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
				req->rq_disk->disk_name, req, req->cmd_type);
L
Linus Torvalds 已提交
549

550
		nbd = req->rq_disk->private_data;
L
Linus Torvalds 已提交
551

552
		BUG_ON(nbd->magic != NBD_MAGIC);
L
Linus Torvalds 已提交
553

554 555
		if (unlikely(!nbd->sock)) {
			dev_err(disk_to_dev(nbd->disk),
556
				"Attempted send on closed socket\n");
557 558 559 560 561 562
			req->errors++;
			nbd_end_request(req);
			spin_lock_irq(q->queue_lock);
			continue;
		}

563 564 565
		spin_lock_irq(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->waiting_queue);
		spin_unlock_irq(&nbd->queue_lock);
L
Linus Torvalds 已提交
566

567
		wake_up(&nbd->waiting_wq);
568

L
Linus Torvalds 已提交
569 570 571 572
		spin_lock_irq(q->queue_lock);
	}
}

P
Pavel Machek 已提交
573
/* Must be called with tx_lock held */
L
Linus Torvalds 已提交
574

575
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
P
Pavel Machek 已提交
576 577
		       unsigned int cmd, unsigned long arg)
{
L
Linus Torvalds 已提交
578
	switch (cmd) {
P
Pavel Machek 已提交
579 580 581
	case NBD_DISCONNECT: {
		struct request sreq;

582
		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
P
Pavel Machek 已提交
583

584
		blk_rq_init(NULL, &sreq);
585
		sreq.cmd_type = REQ_TYPE_SPECIAL;
L
Linus Torvalds 已提交
586
		nbd_cmd(&sreq) = NBD_CMD_DISC;
587
		if (!nbd->sock)
L
Linus Torvalds 已提交
588
			return -EINVAL;
589
		nbd_send_req(nbd, &sreq);
L
Linus Torvalds 已提交
590
                return 0;
P
Pavel Machek 已提交
591
	}
L
Linus Torvalds 已提交
592
 
P
Pavel Machek 已提交
593 594 595
	case NBD_CLEAR_SOCK: {
		struct file *file;

596 597 598 599 600
		nbd->sock = NULL;
		file = nbd->file;
		nbd->file = NULL;
		nbd_clear_que(nbd);
		BUG_ON(!list_empty(&nbd->queue_head));
L
Linus Torvalds 已提交
601 602
		if (file)
			fput(file);
P
Pavel Machek 已提交
603 604 605 606 607
		return 0;
	}

	case NBD_SET_SOCK: {
		struct file *file;
608
		if (nbd->file)
L
Linus Torvalds 已提交
609 610 611
			return -EBUSY;
		file = fget(arg);
		if (file) {
A
Al Viro 已提交
612
			struct inode *inode = file->f_path.dentry->d_inode;
L
Linus Torvalds 已提交
613
			if (S_ISSOCK(inode->i_mode)) {
614 615
				nbd->file = file;
				nbd->sock = SOCKET_I(inode);
L
Laurent Vivier 已提交
616 617
				if (max_part > 0)
					bdev->bd_invalidated = 1;
P
Pavel Machek 已提交
618
				return 0;
L
Linus Torvalds 已提交
619 620 621 622
			} else {
				fput(file);
			}
		}
P
Pavel Machek 已提交
623 624 625
		return -EINVAL;
	}

L
Linus Torvalds 已提交
626
	case NBD_SET_BLKSIZE:
627 628 629 630 631
		nbd->blksize = arg;
		nbd->bytesize &= ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
632
		return 0;
P
Pavel Machek 已提交
633

L
Linus Torvalds 已提交
634
	case NBD_SET_SIZE:
635 636 637 638
		nbd->bytesize = arg & ~(nbd->blksize-1);
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
639
		return 0;
P
Pavel Machek 已提交
640

641
	case NBD_SET_TIMEOUT:
642
		nbd->xmit_timeout = arg * HZ;
643
		return 0;
P
Pavel Machek 已提交
644

L
Linus Torvalds 已提交
645
	case NBD_SET_SIZE_BLOCKS:
646 647 648 649
		nbd->bytesize = ((u64) arg) * nbd->blksize;
		bdev->bd_inode->i_size = nbd->bytesize;
		set_blocksize(bdev, nbd->blksize);
		set_capacity(nbd->disk, nbd->bytesize >> 9);
L
Linus Torvalds 已提交
650
		return 0;
P
Pavel Machek 已提交
651 652 653 654 655 656

	case NBD_DO_IT: {
		struct task_struct *thread;
		struct file *file;
		int error;

657
		if (nbd->pid)
658
			return -EBUSY;
659
		if (!nbd->file)
L
Linus Torvalds 已提交
660
			return -EINVAL;
P
Pavel Machek 已提交
661

662
		mutex_unlock(&nbd->tx_lock);
P
Pavel Machek 已提交
663

664
		thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
P
Pavel Machek 已提交
665
		if (IS_ERR(thread)) {
666
			mutex_lock(&nbd->tx_lock);
667
			return PTR_ERR(thread);
P
Pavel Machek 已提交
668
		}
669
		wake_up_process(thread);
670
		error = nbd_do_it(nbd);
671
		kthread_stop(thread);
P
Pavel Machek 已提交
672

673
		mutex_lock(&nbd->tx_lock);
674 675
		if (error)
			return error;
676 677 678 679 680
		sock_shutdown(nbd, 0);
		file = nbd->file;
		nbd->file = NULL;
		nbd_clear_que(nbd);
		dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
L
Linus Torvalds 已提交
681 682
		if (file)
			fput(file);
683
		nbd->bytesize = 0;
A
Al Viro 已提交
684
		bdev->bd_inode->i_size = 0;
685
		set_capacity(nbd->disk, 0);
L
Laurent Vivier 已提交
686
		if (max_part > 0)
A
Al Viro 已提交
687
			ioctl_by_bdev(bdev, BLKRRPART, 0);
688
		return nbd->harderror;
P
Pavel Machek 已提交
689 690
	}

L
Linus Torvalds 已提交
691
	case NBD_CLEAR_QUE:
692 693 694 695
		/*
		 * This is for compatibility only.  The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
696
		BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head));
L
Linus Torvalds 已提交
697
		return 0;
P
Pavel Machek 已提交
698

L
Linus Torvalds 已提交
699
	case NBD_PRINT_DEBUG:
700
		dev_info(disk_to_dev(nbd->disk),
701
			"next = %p, prev = %p, head = %p\n",
702 703
			nbd->queue_head.next, nbd->queue_head.prev,
			&nbd->queue_head);
L
Linus Torvalds 已提交
704 705
		return 0;
	}
P
Pavel Machek 已提交
706 707 708 709 710 711
	return -ENOTTY;
}

/*
 * ioctl entry point of the block device.  Requires CAP_SYS_ADMIN
 * (-EPERM otherwise) and runs the command through __nbd_ioctl() with the
 * device's tx_lock held.  Returns whatever __nbd_ioctl() returns.
 */
static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *nbd = bdev->bd_disk->private_data;
	int rc;

	if (!capable(CAP_SYS_ADMIN)) {
		rc = -EPERM;
		return rc;
	}

	BUG_ON(nbd->magic != NBD_MAGIC);

	/* Anyone capable of this syscall can do *real bad* things */
	dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
		nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);

	mutex_lock(&nbd->tx_lock);
	rc = __nbd_ioctl(bdev, nbd, cmd, arg);
	mutex_unlock(&nbd->tx_lock);

	return rc;
}

731
static const struct block_device_operations nbd_fops =
L
Linus Torvalds 已提交
732 733
{
	.owner =	THIS_MODULE,
734
	.ioctl =	nbd_ioctl,
L
Linus Torvalds 已提交
735 736 737 738 739 740 741 742 743 744 745
};

/*
 * And here should be modules and kernel interface 
 *  (Just smiley confuses emacs :-)
 */

/*
 * Module initialisation: validate the module parameters, allocate
 * nbds_max devices together with their gendisks and request queues,
 * register the block major and add every disk.
 *
 * Returns 0 on success, -EINVAL for unusable max_part/nbds_max values,
 * -ENOMEM on allocation failure and -EIO if the major cannot be
 * registered.
 */
static int __init nbd_init(void)
{
	int err = -ENOMEM;
	int i;
	int part_shift;

	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

	if (max_part < 0) {
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
		return -EINVAL;
	}

	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	/*
	 * Range-check the parameters before anything is allocated, so these
	 * early returns have nothing to release.
	 */
	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
	if (!nbd_dev)
		return -ENOMEM;

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = alloc_disk(1 << part_shift);
		if (!disk)
			goto out;
		nbd_dev[i].disk = disk;
		/*
		 * The new linux 2.5 block layer implementation requires
		 * every gendisk to have its very own request_queue struct.
		 * These structs are big so we dynamically allocate them.
		 */
		disk->queue = blk_init_queue(do_nbd_request, &nbd_lock);
		if (!disk->queue) {
			put_disk(disk);
			goto out;
		}
		/*
		 * Tell the block layer that we are not a rotational device
		 */
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
	}

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		err = -EIO;
		goto out;
	}

	printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);
	dprintk(DBG_INIT, "nbd: debugflags=0x%x\n", debugflags);

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		nbd_dev[i].file = NULL;
		nbd_dev[i].magic = NBD_MAGIC;
		nbd_dev[i].flags = 0;
		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
		spin_lock_init(&nbd_dev[i].queue_lock);
		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
		mutex_init(&nbd_dev[i].tx_lock);
		init_waitqueue_head(&nbd_dev[i].active_wq);
		init_waitqueue_head(&nbd_dev[i].waiting_wq);
		nbd_dev[i].blksize = 1024;
		nbd_dev[i].bytesize = 0;
		disk->major = NBD_MAJOR;
		disk->first_minor = i << part_shift;
		disk->fops = &nbd_fops;
		disk->private_data = &nbd_dev[i];
		sprintf(disk->disk_name, "nbd%d", i);
		set_capacity(disk, 0);
		add_disk(disk);
	}

	return 0;
out:
	/* i counts the disks that were fully set up; undo them in reverse */
	while (i--) {
		blk_cleanup_queue(nbd_dev[i].disk->queue);
		put_disk(nbd_dev[i].disk);
	}
	kfree(nbd_dev);
	return err;
}

/*
 * Module exit: invalidate every device's magic, tear down its disk and
 * request queue, unregister the block major and free the device array.
 */
static void __exit nbd_cleanup(void)
{
	int i;

	for (i = 0; i < nbds_max; i++) {
		struct nbd_device *nbd = &nbd_dev[i];
		struct gendisk *disk = nbd->disk;

		nbd->magic = 0;
		if (!disk)
			continue;

		del_gendisk(disk);
		blk_cleanup_queue(disk->queue);
		put_disk(disk);
	}

	unregister_blkdev(NBD_MAJOR, "nbd");
	kfree(nbd_dev);
	printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

864
module_param(nbds_max, int, 0444);
L
Laurent Vivier 已提交
865 866 867
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");
L
Linus Torvalds 已提交
868 869 870 871
#ifndef NDEBUG
module_param(debugflags, int, 0644);
MODULE_PARM_DESC(debugflags, "flags for controlling debug output");
#endif