/*
 * Copyright (C) 2018 Cambridge Greys Ltd
 * Copyright (C) 2015-2016 Anton Ivanov (aivanov@brocade.com)
 * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
 * Licensed under the GPL
 */

/* 2001-09-28...2002-04-17
 * Partition stuff by James_McMechan@hotmail.com
 * old style ubd by setting UBD_SHIFT to 0
 * 2002-09-27...2002-10-18 massive tinkering for 2.5
 * partitions have changed in 2.5
 * 2003-01-29 more tinkering for 2.5.59-1
 * This should now address the sysfs problems and has
 * the symlink for devfs to allow for booting with
 * the common /dev/ubd/discX/... names rather than
 * only /dev/ubdN/discN this version also has lots of
 * clean ups preparing for ubd-many.
 * James McMechan
 */

#define UBD_SHIFT 4

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/ata.h>
#include <linux/hdreg.h>
#include <linux/cdrom.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/platform_device.h>
#include <linux/scatterlist.h>
#include <asm/tlbflush.h>
#include <kern_util.h>
#include "mconsole_kern.h"
#include <init.h>
#include <irq_kern.h>
#include "ubd.h"
#include <os.h>
#include "cow.h"

47 48
/* Max request size is determined by sector mask - 32K */
#define UBD_MAX_REQUEST (8 * sizeof(long))
L
Linus Torvalds 已提交
49 50

/*
 * Descriptor for one piece of I/O.  ubd_handler() reads pointers to these
 * from thread_fd, reports ->error and ->length for ->req to the block layer
 * and then kfree()s the descriptor.
 */
struct io_thread_req {
	struct request *req;		/* request completed in ubd_handler() */
	int fds[2];
	unsigned long offsets[2];
	unsigned long long offset;
	unsigned long length;		/* byte count given to blk_update_request() */
	char *buffer;
	int sectorsize;
	unsigned long sector_mask;
	unsigned long long cow_offset;
	unsigned long bitmap_words[2];
	int error;			/* status given to blk_update_request() */
};

A
Anton Ivanov 已提交
64 65 66 67 68 69 70 71 72 73 74

/*
 * Receive state handed to bulk_req_safe_read() by ubd_handler(): a pointer
 * to an array of request pointers, plus storage and a byte count for a
 * trailing partial pointer left over from a short read.
 */
static struct io_thread_req * (*irq_req_buffer)[];
static struct io_thread_req *irq_remainder;
static int irq_remainder_size;

/* Same triple with the io_ prefix; not referenced in this part of the file. */
static struct io_thread_req * (*io_req_buffer)[];
static struct io_thread_req *io_remainder;
static int io_remainder_size;



75
/*
 * Test bit number @bit in the byte array @data.
 *
 * Bits are numbered from the least significant bit of data[0] upwards,
 * eight per byte.  Returns 1 if the bit is set, 0 otherwise.
 */
static inline int ubd_test_bit(__u64 bit, unsigned char *data)
{
	__u64 n;
	int bits, off;

	bits = sizeof(data[0]) * 8;
	n = bit / bits;		/* index of the byte holding the bit */
	off = bit % bits;	/* position of the bit inside that byte */
	return (data[n] & (1 << off)) != 0;
}

86
/*
 * Set bit number @bit in the byte array @data.
 *
 * Uses the same numbering as ubd_test_bit(): least significant bit of
 * data[0] is bit 0, eight bits per byte.
 */
static inline void ubd_set_bit(__u64 bit, unsigned char *data)
{
	__u64 n;
	int bits, off;

	bits = sizeof(data[0]) * 8;
	n = bit / bits;		/* index of the byte holding the bit */
	off = bit % bits;	/* position of the bit inside that byte */
	data[n] |= (1 << off);
}
/*End stuff from ubd_user.h*/

#define DRIVER_NAME "uml-blkdev"

100
static DEFINE_MUTEX(ubd_lock);
A
Arnd Bergmann 已提交
101
static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */
L
Linus Torvalds 已提交
102

A
Al Viro 已提交
103
static int ubd_open(struct block_device *bdev, fmode_t mode);
104
static void ubd_release(struct gendisk *disk, fmode_t mode);
A
Al Viro 已提交
105
static int ubd_ioctl(struct block_device *bdev, fmode_t mode,
L
Linus Torvalds 已提交
106
		     unsigned int cmd, unsigned long arg);
107
static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
L
Linus Torvalds 已提交
108

109
#define MAX_DEV (16)
L
Linus Torvalds 已提交
110

111
static const struct block_device_operations ubd_blops = {
L
Linus Torvalds 已提交
112
        .owner		= THIS_MODULE,
A
Al Viro 已提交
113 114 115
        .open		= ubd_open,
        .release	= ubd_release,
        .ioctl		= ubd_ioctl,
116
	.getgeo		= ubd_getgeo,
L
Linus Torvalds 已提交
117 118 119
};

/* Protected by ubd_lock */
120
static int fake_major = UBD_MAJOR;
L
Linus Torvalds 已提交
121 122
static struct gendisk *ubd_gendisk[MAX_DEV];
static struct gendisk *fake_gendisk[MAX_DEV];
123

L
Linus Torvalds 已提交
124 125 126 127 128 129 130 131 132 133
#ifdef CONFIG_BLK_DEV_UBD_SYNC
#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \
					 .cl = 1 })
#else
#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 0, .c = 0, \
					 .cl = 1 })
#endif
static struct openflags global_openflags = OPEN_FLAGS;

/* Copy-on-write state embedded in struct ubd. */
struct cow {
	/* backing file name */
	char *file;
	/* backing file fd */
	int fd;
	/* vmalloc'ed in ubd_open_dev(), freed in ubd_close_dev() */
	unsigned long *bitmap;
	/* size passed to vmalloc() and read_cow_bitmap() for the bitmap */
	unsigned long bitmap_len;
	/* file offset the bitmap is read from */
	int bitmap_offset;
	int data_offset;
};

J
Jeff Dike 已提交
144 145
#define MAX_SG 64

L
Linus Torvalds 已提交
146
struct ubd {
147 148
	/* name (and fd, below) of the file opened for writing, either the
	 * backing or the cow file. */
L
Linus Torvalds 已提交
149 150 151 152 153 154
	char *file;
	int count;
	int fd;
	__u64 size;
	struct openflags boot_openflags;
	struct openflags openflags;
155 156
	unsigned shared:1;
	unsigned no_cow:1;
L
Linus Torvalds 已提交
157 158
	struct cow cow;
	struct platform_device pdev;
159
	struct request_queue *queue;
160
	struct blk_mq_tag_set tag_set;
161
	spinlock_t lock;
162 163
};

L
Linus Torvalds 已提交
164 165
/* Initialiser for an unused struct cow: no file, no fd, no bitmap. */
#define DEFAULT_COW { \
	.file =			NULL, \
	.fd =			-1,	\
	.bitmap =		NULL, \
	.bitmap_offset =	0, \
	.data_offset =		0, \
}

/*
 * Initialiser for an unconfigured struct ubd.  Used for every slot of
 * ubd_devs[] and to reset a slot in ubd_device_release().
 */
#define DEFAULT_UBD { \
	.file = 		NULL, \
	.count =		0, \
	.fd =			-1, \
	.size =			-1, \
	.boot_openflags =	OPEN_FLAGS, \
	.openflags =		OPEN_FLAGS, \
	.no_cow =               0, \
	.shared =		0, \
	.cow =			DEFAULT_COW, \
	.lock =			__SPIN_LOCK_UNLOCKED(ubd_devs.lock), \
}

185
/* Protected by ubd_lock */
186
static struct ubd ubd_devs[MAX_DEV] = { [0 ... MAX_DEV - 1] = DEFAULT_UBD };
L
Linus Torvalds 已提交
187 188 189 190 191 192

/* Only changed by fake_ide_setup which is a setup */
static int fake_ide = 0;
static struct proc_dir_entry *proc_ide_root = NULL;
static struct proc_dir_entry *proc_ide = NULL;

193 194 195
static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd);

L
Linus Torvalds 已提交
196 197 198 199 200 201
/* Create the "ide" proc directory and its "ide0" child. */
static void make_proc_ide(void)
{
	proc_ide_root = proc_mkdir("ide", NULL);
	/* must follow the line above: ide0 is created inside proc_ide_root */
	proc_ide = proc_mkdir("ide0", proc_ide_root);
}

202
/* seq_file show callback for the fake "media" proc entry: always "disk". */
static int fake_ide_media_proc_show(struct seq_file *m, void *v)
{
	seq_puts(m, "disk\n");
	return 0;
}

W
WANG Cong 已提交
208
static void make_ide_entries(const char *dev_name)
L
Linus Torvalds 已提交
209 210 211 212 213 214 215 216 217
{
	struct proc_dir_entry *dir, *ent;
	char name[64];

	if(proc_ide_root == NULL) make_proc_ide();

	dir = proc_mkdir(dev_name, proc_ide);
	if(!dir) return;

218 219
	ent = proc_create_single("media", S_IRUGO, dir,
			fake_ide_media_proc_show);
L
Linus Torvalds 已提交
220
	if(!ent) return;
W
WANG Cong 已提交
221
	snprintf(name, sizeof(name), "ide0/%s", dev_name);
L
Linus Torvalds 已提交
222 223 224 225 226 227
	proc_symlink(dev_name, proc_ide_root, name);
}

/* Handler for the "fake_ide" option: just records that it was given. */
static int fake_ide_setup(char *str)
{
	fake_ide = 1;
	return 1;
}

__setup("fake_ide", fake_ide_setup);

__uml_help(fake_ide_setup,
"fake_ide\n"
"    Create ide0 entries that map onto ubd devices.\n\n"
);

/*
 * Parse a unit designator at *ptr: either a number (any base accepted by
 * simple_strtoul with base 0) or a single lowercase letter, where 'a' is
 * unit 0.
 *
 * On success *ptr is advanced past the designator and the unit number is
 * returned.  Returns -1, leaving *ptr untouched, if *ptr starts with
 * neither a digit nor a lowercase letter.
 */
static int parse_unit(char **ptr)
{
	char *str = *ptr, *end;
	int n = -1;

	if (isdigit(*str)) {
		n = simple_strtoul(str, &end, 0);
		if (end == str)
			return -1;
		*ptr = end;
	} else if (('a' <= *str) && (*str <= 'z')) {
		n = *str - 'a';
		str++;
		*ptr = str;
	}
	return n;
}

257 258 259 260
/* If *index_out == -1 at exit, the passed option was a general one;
 * otherwise, the str pointer is used (and owned) inside ubd_devs array, so it
 * should not be freed on exit.
 */
261
static int ubd_setup_common(char *str, int *index_out, char **error_out)
L
Linus Torvalds 已提交
262
{
263
	struct ubd *ubd_dev;
L
Linus Torvalds 已提交
264 265
	struct openflags flags = global_openflags;
	char *backing_file;
266
	int n, err = 0, i;
L
Linus Torvalds 已提交
267 268 269 270 271 272 273 274 275 276

	if(index_out) *index_out = -1;
	n = *str;
	if(n == '='){
		char *end;
		int major;

		str++;
		if(!strcmp(str, "sync")){
			global_openflags = of_sync(global_openflags);
277
			goto out1;
L
Linus Torvalds 已提交
278
		}
279 280

		err = -EINVAL;
L
Linus Torvalds 已提交
281 282
		major = simple_strtoul(str, &end, 0);
		if((*end != '\0') || (end == str)){
283
			*error_out = "Didn't parse major number";
284
			goto out1;
L
Linus Torvalds 已提交
285 286
		}

287
		mutex_lock(&ubd_lock);
288
		if (fake_major != UBD_MAJOR) {
289 290 291
			*error_out = "Can't assign a fake major twice";
			goto out1;
		}
292

293
		fake_major = major;
L
Linus Torvalds 已提交
294 295 296

		printk(KERN_INFO "Setting extra ubd major number to %d\n",
		       major);
297 298 299 300
		err = 0;
	out1:
		mutex_unlock(&ubd_lock);
		return err;
L
Linus Torvalds 已提交
301 302 303 304
	}

	n = parse_unit(&str);
	if(n < 0){
305 306
		*error_out = "Couldn't parse device number";
		return -EINVAL;
L
Linus Torvalds 已提交
307 308
	}
	if(n >= MAX_DEV){
309 310
		*error_out = "Device number out of range";
		return 1;
L
Linus Torvalds 已提交
311 312
	}

313
	err = -EBUSY;
314
	mutex_lock(&ubd_lock);
L
Linus Torvalds 已提交
315

316 317
	ubd_dev = &ubd_devs[n];
	if(ubd_dev->file != NULL){
318
		*error_out = "Device is already configured";
L
Linus Torvalds 已提交
319 320 321 322 323 324
		goto out;
	}

	if (index_out)
		*index_out = n;

325
	err = -EINVAL;
326
	for (i = 0; i < sizeof("rscd="); i++) {
L
Linus Torvalds 已提交
327 328 329 330 331 332 333 334
		switch (*str) {
		case 'r':
			flags.w = 0;
			break;
		case 's':
			flags.s = 1;
			break;
		case 'd':
335
			ubd_dev->no_cow = 1;
L
Linus Torvalds 已提交
336
			break;
337
		case 'c':
338
			ubd_dev->shared = 1;
339
			break;
L
Linus Torvalds 已提交
340 341 342 343
		case '=':
			str++;
			goto break_loop;
		default:
344 345
			*error_out = "Expected '=' or flag letter "
				"(r, s, c, or d)";
L
Linus Torvalds 已提交
346 347 348 349 350
			goto out;
		}
		str++;
	}

351 352 353 354
	if (*str == '=')
		*error_out = "Too many flags specified";
	else
		*error_out = "Missing '='";
L
Linus Torvalds 已提交
355 356 357 358 359
	goto out;

break_loop:
	backing_file = strchr(str, ',');

360
	if (backing_file == NULL)
L
Linus Torvalds 已提交
361 362
		backing_file = strchr(str, ':');

363 364 365 366 367
	if(backing_file != NULL){
		if(ubd_dev->no_cow){
			*error_out = "Can't specify both 'd' and a cow file";
			goto out;
		}
L
Linus Torvalds 已提交
368 369 370 371 372
		else {
			*backing_file = '\0';
			backing_file++;
		}
	}
373
	err = 0;
374 375 376
	ubd_dev->file = str;
	ubd_dev->cow.file = backing_file;
	ubd_dev->boot_openflags = flags;
L
Linus Torvalds 已提交
377
out:
378
	mutex_unlock(&ubd_lock);
379
	return err;
L
Linus Torvalds 已提交
380 381 382 383
}

/*
 * Boot-time handler for "ubd..." options.  Parsing problems are logged;
 * the function always returns 1.
 */
static int ubd_setup(char *str)
{
	char *error = "";
	int err;

	err = ubd_setup_common(str, NULL, &error);
	if (err)
		printk(KERN_ERR "Failed to initialize device with \"%s\" : "
		       "%s\n", str, error);
	return 1;
}

__setup("ubd", ubd_setup);
/* The unit range quoted below follows MAX_DEV (16 units, 0 to 15). */
__uml_help(ubd_setup,
"ubd<n><flags>=<filename>[(:|,)<filename2>]\n"
"    This is used to associate a device with a file in the underlying\n"
"    filesystem. When specifying two filenames, the first one is the\n"
"    COW name and the second is the backing file name. As separator you can\n"
"    use either a ':' or a ',': the first one allows writing things like;\n"
"	ubd0=~/Uml/root_cow:~/Uml/root_backing_file\n"
"    while with a ',' the shell would not expand the 2nd '~'.\n"
"    When using only one filename, UML will detect whether to treat it like\n"
"    a COW file or a backing file. To override this detection, add the 'd'\n"
"    flag:\n"
"	ubd0d=BackingFile\n"
"    Usually, there is a filesystem in the file, but \n"
"    that's not required. Swap devices containing swap files can be\n"
"    specified like this. Also, a file which doesn't contain a\n"
"    filesystem can have its contents read in the virtual \n"
"    machine by running 'dd' on the device. <n> must be in the range\n"
"    0 to 15. Appending an 'r' to the number will cause that device\n"
"    to be mounted read-only. For example ubd1r=./ext_fs. Appending\n"
"    an 's' will cause data to be written to disk on the host immediately.\n"
"    'c' will cause the device to be treated as being shared between multiple\n"
"    UMLs and file locking will be turned off - this is appropriate for a\n"
"    cluster filesystem and inappropriate at almost all other times.\n\n"
);

J
Jeff Dike 已提交
420
/* Catches the "udb" misspelling of "ubd": logs a warning and returns 1. */
static int udb_setup(char *str)
{
	printk("udb%s specified on command line is almost certainly a ubd -> "
	       "udb TYPO\n", str);
	return 1;
}

__setup("udb", udb_setup);
__uml_help(udb_setup,
"udb\n"
"    This option is here solely to catch ubd -> udb typos, which can be\n"
"    to impossible to catch visually unless you specifically look for\n"
"    them.  The only result of any option starting with 'udb' is an error\n"
"    in the boot output.\n\n"
);

436
/* Only changed by ubd_init, which is an initcall. */
static int thread_fd = -1;
J
Jeff Dike 已提交
438

A
Anton Ivanov 已提交
439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491
/*
 * Function to read several request pointers at a time
 * handling fractional reads if (and as) needed
 *
 * @fd:             descriptor to read from
 * @request_buffer: receives whole request pointers
 * @remainder:      storage for the bytes of a trailing partial pointer
 * @remainder_size: number of bytes currently held in @remainder
 * @max_recs:       capacity of @request_buffer, in pointers
 *
 * Bytes left over from the previous call are placed at the front of
 * @request_buffer before reading.  After a successful read,
 * *remainder_size always describes the new leftover, and is 0 when the
 * data ended on a pointer boundary, so consumed bytes are never replayed.
 *
 * Returns the number of bytes of complete pointers now in @request_buffer,
 * or the os_read_file() result unchanged if that was <= 0 (in which case
 * the saved remainder is kept for the next call).
 */
static int bulk_req_safe_read(
	int fd,
	struct io_thread_req * (*request_buffer)[],
	struct io_thread_req **remainder,
	int *remainder_size,
	int max_recs
	)
{
	int n = 0;
	int res = 0;

	if (*remainder_size > 0) {
		memmove(
			(char *) request_buffer,
			(char *) remainder, *remainder_size
		);
		n = *remainder_size;
	}

	res = os_read_file(
			fd,
			((char *) request_buffer) + *remainder_size,
			sizeof(struct io_thread_req *)*max_recs
				- *remainder_size
		);
	if (res <= 0)
		return res;

	n += res;

	/*
	 * Read somehow returned not a multiple of dword
	 * theoretically possible, but never observed in the
	 * wild, so read routine must be able to handle it
	 */
	*remainder_size = n % sizeof(struct io_thread_req *);
	if (*remainder_size > 0) {
		WARN(*remainder_size > 0, "UBD IPC read returned a partial result");
		memmove(
			remainder,
			((char *) request_buffer) + (n - *remainder_size),
			*remainder_size
		);
		n = n - *remainder_size;
	}
	return n;
}

492
/* Called without dev->lock held, and only in interrupt context. */
493
static void ubd_handler(void)
L
Linus Torvalds 已提交
494
{
495
	int n;
A
Anton Ivanov 已提交
496
	int count;
497

J
Jeff Dike 已提交
498
	while(1){
A
Anton Ivanov 已提交
499 500 501 502 503 504 505 506
		n = bulk_req_safe_read(
			thread_fd,
			irq_req_buffer,
			&irq_remainder,
			&irq_remainder_size,
			UBD_REQ_BUFFER_SIZE
		);
		if (n < 0) {
J
Jeff Dike 已提交
507 508 509 510 511 512
			if(n == -EAGAIN)
				break;
			printk(KERN_ERR "spurious interrupt in ubd_handler, "
			       "err = %d\n", -n);
			return;
		}
A
Anton Ivanov 已提交
513
		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
514 515
			struct io_thread_req *io_req = (*irq_req_buffer)[count];

516 517
			if (!blk_update_request(io_req->req, io_req->error, io_req->length))
				__blk_mq_end_request(io_req->req, io_req->error);
518 519

			kfree(io_req);
A
Anton Ivanov 已提交
520
		}
J
Jeff Dike 已提交
521
	}
522
	reactivate_fd(thread_fd, UBD_IRQ);
L
Linus Torvalds 已提交
523 524
}

A
Al Viro 已提交
525
static irqreturn_t ubd_intr(int irq, void *dev)
L
Linus Torvalds 已提交
526
{
527
	ubd_handler();
528
	return IRQ_HANDLED;
529
}
530

531 532
/* Only changed by ubd_init, which is an initcall. */
static int io_pid = -1;
533

534
static void kill_io_thread(void)
535
{
536
	if(io_pid != -1)
537
		os_kill_process(io_pid, 1);
538
}
L
Linus Torvalds 已提交
539

540 541
__uml_exitcall(kill_io_thread);

542
/*
 * Work out the size of the device by asking the host for the size of the
 * relevant file, stored through @size_out.
 *
 * If both a main file and a cow.file are configured, cow.file is measured.
 * Otherwise the main file is probed with read_cow_header(): -EINVAL is
 * taken to mean "not a COW file" and the file itself is measured; on
 * success the backing file named in the header is measured.  Any other
 * failure is returned, since no backing-file name is available then.
 *
 * Returns the os_file_size() result, or a negative error from opening or
 * probing the file.
 */
static inline int ubd_file_size(struct ubd *ubd_dev, __u64 *size_out)
{
	char *file;
	int fd;
	int err;

	__u32 version;
	__u32 align;
	char *backing_file = NULL;
	time_t mtime;
	unsigned long long size;
	int sector_size;
	int bitmap_offset;

	if (ubd_dev->file && ubd_dev->cow.file) {
		file = ubd_dev->cow.file;

		goto out;
	}

	fd = os_open_file(ubd_dev->file, of_read(OPENFLAGS()), 0);
	if (fd < 0)
		return fd;

	err = read_cow_header(file_reader, &fd, &version, &backing_file, \
		&mtime, &size, &sector_size, &align, &bitmap_offset);
	os_close_file(fd);

	if (err == -EINVAL)
		file = ubd_dev->file;
	else if (err < 0 || backing_file == NULL)
		/* never hand an unset name to os_file_size() */
		return err < 0 ? err : -EINVAL;
	else
		file = backing_file;

out:
	return os_file_size(file, size_out);
}

579 580 581 582
/*
 * Read @len bytes at file offset @offset of @fd into @buf.
 * Returns 0 unless os_pread_file() reports a negative error, which is
 * passed back unchanged.
 */
static int read_cow_bitmap(int fd, void *buf, int offset, int len)
{
	int err;

	err = os_pread_file(fd, buf, len, offset);
	if (err < 0)
		return err;

	return 0;
}

/*
 * Compare the host file @file against the @size and @mtime that were
 * recorded for it.
 *
 * Returns 0 when both match, -EINVAL when either differs, or the negative
 * error from the host query that failed.  Every non-zero outcome is logged.
 */
static int backing_file_mismatch(char *file, __u64 size, time_t mtime)
{
	unsigned long host_mtime;
	unsigned long long host_size;
	int ret;

	ret = os_file_modtime(file, &host_mtime);
	if (ret < 0) {
		printk(KERN_ERR "Failed to get modification time of backing "
		       "file \"%s\", err = %d\n", file, -ret);
		return ret;
	}

	ret = os_file_size(file, &host_size);
	if (ret < 0) {
		printk(KERN_ERR "Failed to get size of backing file \"%s\", "
		       "err = %d\n", file, -ret);
		return ret;
	}

	/* size is checked first, so a double mismatch reports the size */
	if (host_size != size) {
		/* __u64 may be a plain long on AMD64, hence the cast for
		 * the %llu conversion. */
		printk(KERN_ERR "Size mismatch (%llu vs %llu) of COW header "
		       "vs backing file\n", (unsigned long long) size,
		       host_size);
		return -EINVAL;
	}

	if (host_mtime != mtime) {
		printk(KERN_ERR "mtime mismatch (%ld vs %ld) of COW header vs "
		       "backing file\n", mtime, host_mtime);
		return -EINVAL;
	}

	return 0;
}

static int path_requires_switch(char *from_cmdline, char *from_cow, char *cow)
{
	struct uml_stat buf1, buf2;
	int err;

	if (from_cmdline == NULL)
		return 0;
	if (!strcmp(from_cmdline, from_cow))
		return 0;

	err = os_stat_file(from_cmdline, &buf1);
	if (err < 0) {
		printk(KERN_ERR "Couldn't stat '%s', err = %d\n", from_cmdline,
		       -err);
		return 0;
	}
	err = os_stat_file(from_cow, &buf2);
	if (err < 0) {
		printk(KERN_ERR "Couldn't stat '%s', err = %d\n", from_cow,
		       -err);
		return 1;
	}
	if ((buf1.ust_dev == buf2.ust_dev) && (buf1.ust_ino == buf2.ust_ino))
		return 0;

	printk(KERN_ERR "Backing file mismatch - \"%s\" requested, "
	       "\"%s\" specified in COW header of \"%s\"\n",
	       from_cmdline, from_cow, cow);
	return 1;
}

/*
 * Open @file on the host and, if asked to, interpret it as a COW file.
 *
 * @openflags:         flags to open with; ->w is cleared here if the file
 *                     could only be opened read-only
 * @shared:            non-zero skips host file locking
 * @backing_file_out:  NULL to open a plain file.  Otherwise *in* is the
 *                     backing file requested by the caller (may be NULL)
 *                     and *out* is the backing file actually used.
 * @bitmap_offset_out, @bitmap_len_out, @data_offset_out:
 *                     COW layout, filled in only when a header was read
 * @create_cow_out:    if non-NULL, set to 1 when the first open attempt
 *                     failed with -ENOENT
 *
 * Returns the open descriptor, or a negative error (descriptor closed).
 */
static int open_ubd_file(char *file, struct openflags *openflags, int shared,
		  char **backing_file_out, int *bitmap_offset_out,
		  unsigned long *bitmap_len_out, int *data_offset_out,
		  int *create_cow_out)
{
	time_t mtime;
	unsigned long long size;
	__u32 version, align;
	char *backing_file;
	int fd, err, sectorsize, switch_requested, mode = 0644;

	fd = os_open_file(file, *openflags, mode);
	if (fd < 0) {
		if ((create_cow_out != NULL) && (fd == -ENOENT))
			*create_cow_out = 1;

		/* only a failed writable open is worth retrying read-only */
		if (!(openflags->w && ((fd == -EROFS) || (fd == -EACCES))))
			return fd;

		openflags->w = 0;
		fd = os_open_file(file, *openflags, mode);
		if (fd < 0)
			return fd;
	}

	if (!shared) {
		err = os_lock_file(fd, openflags->w);
		if (err < 0) {
			printk(KERN_ERR "Failed to lock '%s', err = %d\n",
			       file, -err);
			goto out_close;
		}
	} else {
		printk(KERN_INFO "Not locking \"%s\" on the host\n", file);
	}

	/* Successful return case! */
	if (backing_file_out == NULL)
		return fd;

	err = read_cow_header(file_reader, &fd, &version, &backing_file, &mtime,
			      &size, &sectorsize, &align, bitmap_offset_out);
	if (err) {
		/* no usable header is fine unless a backing file was given */
		if (*backing_file_out == NULL)
			return fd;

		printk(KERN_ERR "Failed to read COW header from COW file "
		       "\"%s\", errno = %d\n", file, -err);
		goto out_close;
	}

	switch_requested = path_requires_switch(*backing_file_out,
						backing_file, file);

	/* Allow switching only if no mismatch. */
	if (switch_requested &&
	    !backing_file_mismatch(*backing_file_out, size, mtime)) {
		printk(KERN_ERR "Switching backing file to '%s'\n",
		       *backing_file_out);
		err = write_cow_header(file, fd, *backing_file_out,
				       sectorsize, align, &size);
		if (err) {
			printk(KERN_ERR "Switch failed, errno = %d\n", -err);
			goto out_close;
		}
	} else {
		/* fall back to the backing file named in the header */
		*backing_file_out = backing_file;
		err = backing_file_mismatch(*backing_file_out, size, mtime);
		if (err)
			goto out_close;
	}

	cow_sizes(version, size, sectorsize, align, *bitmap_offset_out,
		  bitmap_len_out, data_offset_out);

	return fd;

 out_close:
	os_close_file(fd);
	return err;
}

/*
 * Create @cow_file and initialise it as a COW file for @backing_file.
 *
 * The file is opened with the create flag (.c) added to @flags, then
 * init_cow_file() is run on it, which fills in the three *_out values.
 *
 * Returns the open descriptor on success.  On failure returns the error
 * from the step that failed; a descriptor that was opened is closed again.
 */
static int create_cow_file(char *cow_file, char *backing_file,
		    struct openflags flags,
		    int sectorsize, int alignment, int *bitmap_offset_out,
		    unsigned long *bitmap_len_out, int *data_offset_out)
{
	int fd, ret;

	flags.c = 1;
	fd = open_ubd_file(cow_file, &flags, 0, NULL, NULL, NULL, NULL, NULL);
	if (fd < 0) {
		printk(KERN_ERR "Open of COW file '%s' failed, errno = %d\n",
		       cow_file, -fd);
		return fd;
	}

	ret = init_cow_file(fd, cow_file, backing_file, sectorsize, alignment,
			    bitmap_offset_out, bitmap_len_out,
			    data_offset_out);
	if (ret) {
		os_close_file(fd);
		return ret;
	}

	return fd;
}

761
static void ubd_close_dev(struct ubd *ubd_dev)
L
Linus Torvalds 已提交
762
{
763 764
	os_close_file(ubd_dev->fd);
	if(ubd_dev->cow.file == NULL)
L
Linus Torvalds 已提交
765 766
		return;

767 768 769
	os_close_file(ubd_dev->cow.fd);
	vfree(ubd_dev->cow.bitmap);
	ubd_dev->cow.bitmap = NULL;
L
Linus Torvalds 已提交
770 771
}

772
static int ubd_open_dev(struct ubd *ubd_dev)
L
Linus Torvalds 已提交
773 774 775 776
{
	struct openflags flags;
	char **back_ptr;
	int err, create_cow, *create_ptr;
777
	int fd;
L
Linus Torvalds 已提交
778

779
	ubd_dev->openflags = ubd_dev->boot_openflags;
L
Linus Torvalds 已提交
780
	create_cow = 0;
781 782
	create_ptr = (ubd_dev->cow.file != NULL) ? &create_cow : NULL;
	back_ptr = ubd_dev->no_cow ? NULL : &ubd_dev->cow.file;
783 784

	fd = open_ubd_file(ubd_dev->file, &ubd_dev->openflags, ubd_dev->shared,
785 786
				back_ptr, &ubd_dev->cow.bitmap_offset,
				&ubd_dev->cow.bitmap_len, &ubd_dev->cow.data_offset,
787
				create_ptr);
L
Linus Torvalds 已提交
788

789 790
	if((fd == -ENOENT) && create_cow){
		fd = create_cow_file(ubd_dev->file, ubd_dev->cow.file,
791
					  ubd_dev->openflags, SECTOR_SIZE, PAGE_SIZE,
792 793 794
					  &ubd_dev->cow.bitmap_offset,
					  &ubd_dev->cow.bitmap_len,
					  &ubd_dev->cow.data_offset);
795
		if(fd >= 0){
L
Linus Torvalds 已提交
796
			printk(KERN_INFO "Creating \"%s\" as COW file for "
797
			       "\"%s\"\n", ubd_dev->file, ubd_dev->cow.file);
L
Linus Torvalds 已提交
798 799 800
		}
	}

801
	if(fd < 0){
802
		printk("Failed to open '%s', errno = %d\n", ubd_dev->file,
803 804
		       -fd);
		return fd;
L
Linus Torvalds 已提交
805
	}
806
	ubd_dev->fd = fd;
L
Linus Torvalds 已提交
807

808
	if(ubd_dev->cow.file != NULL){
809
		blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long));
810

L
Linus Torvalds 已提交
811
		err = -ENOMEM;
J
Jesper Juhl 已提交
812
		ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len);
813
		if(ubd_dev->cow.bitmap == NULL){
L
Linus Torvalds 已提交
814 815 816 817 818
			printk(KERN_ERR "Failed to vmalloc COW bitmap\n");
			goto error;
		}
		flush_tlb_kernel_vm();

819 820 821
		err = read_cow_bitmap(ubd_dev->fd, ubd_dev->cow.bitmap,
				      ubd_dev->cow.bitmap_offset,
				      ubd_dev->cow.bitmap_len);
L
Linus Torvalds 已提交
822 823 824
		if(err < 0)
			goto error;

825
		flags = ubd_dev->openflags;
L
Linus Torvalds 已提交
826
		flags.w = 0;
827
		err = open_ubd_file(ubd_dev->cow.file, &flags, ubd_dev->shared, NULL,
828
				    NULL, NULL, NULL, NULL);
L
Linus Torvalds 已提交
829
		if(err < 0) goto error;
830
		ubd_dev->cow.fd = err;
L
Linus Torvalds 已提交
831
	}
832
	blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue);
833
	return 0;
L
Linus Torvalds 已提交
834
 error:
835
	os_close_file(ubd_dev->fd);
836
	return err;
L
Linus Torvalds 已提交
837 838
}

J
Jeff Dike 已提交
839 840
static void ubd_device_release(struct device *dev)
{
841
	struct ubd *ubd_dev = dev_get_drvdata(dev);
J
Jeff Dike 已提交
842 843

	blk_cleanup_queue(ubd_dev->queue);
844
	blk_mq_free_tag_set(&ubd_dev->tag_set);
J
Jeff Dike 已提交
845 846 847
	*ubd_dev = ((struct ubd) DEFAULT_UBD);
}

848
/*
 * Allocate and add a gendisk for unit @unit under major number @major.
 *
 * @size is the device size in bytes.  For UBD_MAJOR the disk is named
 * "ubd<letter>" and a platform device is registered as its parent; for any
 * other major the disk is named "ubd_fake<unit>" and has no parent.
 *
 * On success stores the disk in *disk_out and returns 0.  Returns -ENOMEM
 * if the gendisk cannot be allocated.
 */
static int ubd_disk_register(int major, u64 size, int unit,
			     struct gendisk **disk_out)
{
	struct device *parent = NULL;
	struct gendisk *disk;

	disk = alloc_disk(1 << UBD_SHIFT);
	if (disk == NULL)
		return -ENOMEM;

	disk->major = major;
	disk->first_minor = unit << UBD_SHIFT;
	disk->fops = &ubd_blops;
	set_capacity(disk, size / 512);
	if (major == UBD_MAJOR)
		sprintf(disk->disk_name, "ubd%c", 'a' + unit);
	else
		sprintf(disk->disk_name, "ubd_fake%d", unit);

	/* sysfs register (not for ide fake devices) */
	if (major == UBD_MAJOR) {
		ubd_devs[unit].pdev.id   = unit;
		ubd_devs[unit].pdev.name = DRIVER_NAME;
		ubd_devs[unit].pdev.dev.release = ubd_device_release;
		dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]);
		platform_device_register(&ubd_devs[unit].pdev);
		parent = &ubd_devs[unit].pdev.dev;
	}

	disk->private_data = &ubd_devs[unit];
	disk->queue = ubd_devs[unit].queue;
	device_add_disk(parent, disk, NULL);

	*disk_out = disk;
	return 0;
}

885
/* Round n up to the next multiple of SECTOR_SIZE (a power of two). */
#define ROUND_BLOCK(n) (((n) + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE))
L
Linus Torvalds 已提交
886

887 888 889 890
/* blk-mq operations; installed as tag_set.ops in ubd_add(). */
static const struct blk_mq_ops ubd_mq_ops = {
	.queue_rq = ubd_queue_rq,
};

891
static int ubd_add(int n, char **error_out)
L
Linus Torvalds 已提交
892
{
893
	struct ubd *ubd_dev = &ubd_devs[n];
894
	int err = 0;
L
Linus Torvalds 已提交
895

896
	if(ubd_dev->file == NULL)
J
Jeff Dike 已提交
897
		goto out;
L
Linus Torvalds 已提交
898

899
	err = ubd_file_size(ubd_dev, &ubd_dev->size);
900 901
	if(err < 0){
		*error_out = "Couldn't determine size of device's file";
902
		goto out;
903
	}
L
Linus Torvalds 已提交
904

905
	ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
L
Linus Torvalds 已提交
906

907 908 909 910 911 912
	ubd_dev->tag_set.ops = &ubd_mq_ops;
	ubd_dev->tag_set.queue_depth = 64;
	ubd_dev->tag_set.numa_node = NUMA_NO_NODE;
	ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	ubd_dev->tag_set.driver_data = ubd_dev;
	ubd_dev->tag_set.nr_hw_queues = 1;
J
Jeff Dike 已提交
913

914 915
	err = blk_mq_alloc_tag_set(&ubd_dev->tag_set);
	if (err)
916
		goto out;
917 918 919 920 921

	ubd_dev->queue = blk_mq_init_queue(&ubd_dev->tag_set);
	if (IS_ERR(ubd_dev->queue)) {
		err = PTR_ERR(ubd_dev->queue);
		goto out_cleanup;
922
	}
923

924
	ubd_dev->queue->queuedata = ubd_dev;
925
	blk_queue_write_cache(ubd_dev->queue, true, false);
926

927
	blk_queue_max_segments(ubd_dev->queue, MAX_SG);
928
	err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
929 930
	if(err){
		*error_out = "Failed to register device";
931
		goto out_cleanup_tags;
932
	}
933

934
	if (fake_major != UBD_MAJOR)
935
		ubd_disk_register(fake_major, ubd_dev->size, n,
936
				  &fake_gendisk[n]);
L
Linus Torvalds 已提交
937

J
Jeff Dike 已提交
938 939 940 941
	/*
	 * Perhaps this should also be under the "if (fake_major)" above
	 * using the fake_disk->disk_name
	 */
L
Linus Torvalds 已提交
942 943 944
	if (fake_ide)
		make_ide_entries(ubd_gendisk[n]->disk_name);

J
Jeff Dike 已提交
945 946 947
	err = 0;
out:
	return err;
948

949 950
out_cleanup_tags:
	blk_mq_free_tag_set(&ubd_dev->tag_set);
951 952 953
out_cleanup:
	blk_cleanup_queue(ubd_dev->queue);
	goto out;
L
Linus Torvalds 已提交
954 955
}

956
static int ubd_config(char *str, char **error_out)
L
Linus Torvalds 已提交
957
{
958
	int n, ret;
L
Linus Torvalds 已提交
959

960 961 962 963
	/* This string is possibly broken up and stored, so it's only
	 * freed if ubd_setup_common fails, or if only general options
	 * were set.
	 */
J
Jeff Dike 已提交
964
	str = kstrdup(str, GFP_KERNEL);
965
	if (str == NULL) {
966 967
		*error_out = "Failed to allocate memory";
		return -ENOMEM;
L
Linus Torvalds 已提交
968
	}
969 970 971

	ret = ubd_setup_common(str, &n, error_out);
	if (ret)
972
		goto err_free;
973

974 975
	if (n == -1) {
		ret = 0;
976
		goto err_free;
L
Linus Torvalds 已提交
977 978
	}

979
	mutex_lock(&ubd_lock);
980
	ret = ubd_add(n, error_out);
981
	if (ret)
982
		ubd_devs[n].file = NULL;
983
	mutex_unlock(&ubd_lock);
L
Linus Torvalds 已提交
984

985
out:
986
	return ret;
987 988 989 990

err_free:
	kfree(str);
	goto out;
L
Linus Torvalds 已提交
991 992 993 994
}

static int ubd_get_config(char *name, char *str, int size, char **error_out)
{
995
	struct ubd *ubd_dev;
L
Linus Torvalds 已提交
996 997 998 999 1000
	int n, len = 0;

	n = parse_unit(&name);
	if((n >= MAX_DEV) || (n < 0)){
		*error_out = "ubd_get_config : device number out of range";
1001
		return -1;
L
Linus Torvalds 已提交
1002 1003
	}

1004
	ubd_dev = &ubd_devs[n];
1005
	mutex_lock(&ubd_lock);
L
Linus Torvalds 已提交
1006

1007
	if(ubd_dev->file == NULL){
L
Linus Torvalds 已提交
1008 1009 1010 1011
		CONFIG_CHUNK(str, size, len, "", 1);
		goto out;
	}

1012
	CONFIG_CHUNK(str, size, len, ubd_dev->file, 0);
L
Linus Torvalds 已提交
1013

1014
	if(ubd_dev->cow.file != NULL){
L
Linus Torvalds 已提交
1015
		CONFIG_CHUNK(str, size, len, ",", 0);
1016
		CONFIG_CHUNK(str, size, len, ubd_dev->cow.file, 1);
L
Linus Torvalds 已提交
1017 1018 1019 1020
	}
	else CONFIG_CHUNK(str, size, len, "", 1);

 out:
1021
	mutex_unlock(&ubd_lock);
1022
	return len;
L
Linus Torvalds 已提交
1023 1024
}

J
Jeff Dike 已提交
1025 1026
/*
 * mconsole "id" handler: parse the unit designator at *str and report the
 * range of valid unit numbers through @start_out and @end_out.
 * Returns the result of parse_unit() (-1 if nothing could be parsed).
 */
static int ubd_id(char **str, int *start_out, int *end_out)
{
	int n;

	n = parse_unit(str);
	*start_out = 0;
	*end_out = MAX_DEV - 1;
	return n;
}

1035
static int ubd_remove(int n, char **error_out)
L
Linus Torvalds 已提交
1036
{
J
Jeff Dike 已提交
1037
	struct gendisk *disk = ubd_gendisk[n];
1038
	struct ubd *ubd_dev;
J
Jeff Dike 已提交
1039
	int err = -ENODEV;
L
Linus Torvalds 已提交
1040

1041
	mutex_lock(&ubd_lock);
L
Linus Torvalds 已提交
1042

1043
	ubd_dev = &ubd_devs[n];
L
Linus Torvalds 已提交
1044

1045
	if(ubd_dev->file == NULL)
J
Jeff Dike 已提交
1046
		goto out;
L
Linus Torvalds 已提交
1047

J
Jeff Dike 已提交
1048 1049
	/* you cannot remove a open disk */
	err = -EBUSY;
1050
	if(ubd_dev->count > 0)
L
Linus Torvalds 已提交
1051 1052
		goto out;

1053
	ubd_gendisk[n] = NULL;
1054 1055 1056 1057
	if(disk != NULL){
		del_gendisk(disk);
		put_disk(disk);
	}
L
Linus Torvalds 已提交
1058 1059 1060 1061 1062 1063 1064 1065

	if(fake_gendisk[n] != NULL){
		del_gendisk(fake_gendisk[n]);
		put_disk(fake_gendisk[n]);
		fake_gendisk[n] = NULL;
	}

	err = 0;
J
Jeff Dike 已提交
1066
	platform_device_unregister(&ubd_dev->pdev);
J
Jeff Dike 已提交
1067
out:
1068
	mutex_unlock(&ubd_lock);
J
Jeff Dike 已提交
1069
	return err;
L
Linus Torvalds 已提交
1070 1071
}

1072
/* All these are called by mconsole in process context and without
1073
 * ubd-specific locks.  The structure itself is const except for .list.
1074
 */
L
Linus Torvalds 已提交
1075
static struct mc_device ubd_mc = {
J
Jeff Dike 已提交
1076
	.list		= LIST_HEAD_INIT(ubd_mc.list),
L
Linus Torvalds 已提交
1077 1078
	.name		= "ubd",
	.config		= ubd_config,
1079
	.get_config	= ubd_get_config,
J
Jeff Dike 已提交
1080
	.id		= ubd_id,
L
Linus Torvalds 已提交
1081 1082 1083
	.remove		= ubd_remove,
};

1084
/* Initcall: hand the ubd descriptor to mconsole.  Always returns 0. */
static int __init ubd_mc_init(void)
{
	mconsole_register_dev(&ubd_mc);

	return 0;
}

__initcall(ubd_mc_init);

1092 1093 1094 1095
/*
 * Initcall: if unit 0 has no backing file configured yet, default it to
 * "root_fs".  Always returns 0.
 */
static int __init ubd0_init(void)
{
	mutex_lock(&ubd_lock);
	if (!ubd_devs[0].file)
		ubd_devs[0].file = "root_fs";
	mutex_unlock(&ubd_lock);

	return 0;
}

__initcall(ubd0_init);

1106
/* Used in ubd_init, which is an initcall */
1107 1108 1109 1110
static struct platform_driver ubd_driver = {
	.driver = {
		.name  = DRIVER_NAME,
	},
L
Linus Torvalds 已提交
1111 1112
};

1113
/*
 * Late initcall: register the block major(s), allocate the request pointer
 * buffers used on the irq side and the io thread side, register the platform
 * driver and add every ubd unit.
 *
 * Returns 0 on success and -1 on failure.  On failure everything acquired so
 * far (block majors, irq_req_buffer) is released again instead of being
 * leaked.  A failing ubd_add() for an individual unit is only logged.
 */
static int __init ubd_init(void)
{
	char *error;
	int i, err;

	if (register_blkdev(UBD_MAJOR, "ubd"))
		return -1;

	/*
	 * The fake major is registered under the plain "ubd" name.  The
	 * original code also formatted a "ubd_<major>" string here but never
	 * passed it anywhere, so that dead buffer has been dropped.
	 */
	if (fake_major != UBD_MAJOR) {
		if (register_blkdev(fake_major, "ubd"))
			goto out_unregister_main;
	}

	irq_req_buffer = kmalloc_array(UBD_REQ_BUFFER_SIZE,
				       sizeof(struct io_thread_req *),
				       GFP_KERNEL);
	irq_remainder = 0;

	if (irq_req_buffer == NULL) {
		printk(KERN_ERR "Failed to initialize ubd buffering\n");
		goto out_unregister_fake;
	}

	io_req_buffer = kmalloc_array(UBD_REQ_BUFFER_SIZE,
				      sizeof(struct io_thread_req *),
				      GFP_KERNEL);
	io_remainder = 0;

	if (io_req_buffer == NULL) {
		printk(KERN_ERR "Failed to initialize ubd buffering\n");
		goto out_free_irq;
	}

	/*
	 * A failure here used to be ignored silently; keep going as before
	 * but at least leave a trace in the log.
	 */
	err = platform_driver_register(&ubd_driver);
	if (err)
		printk(KERN_ERR "ubd: failed to register platform driver "
		       "(%d)\n", err);

	mutex_lock(&ubd_lock);
	for (i = 0; i < MAX_DEV; i++) {
		err = ubd_add(i, &error);
		if (err)
			printk(KERN_ERR "Failed to initialize ubd device %d :"
			       "%s\n", i, error);
	}
	mutex_unlock(&ubd_lock);
	return 0;

out_free_irq:
	kfree(irq_req_buffer);
	irq_req_buffer = NULL;
out_unregister_fake:
	if (fake_major != UBD_MAJOR)
		unregister_blkdev(fake_major, "ubd");
out_unregister_main:
	unregister_blkdev(UBD_MAJOR, "ubd");
	return -1;
}

late_initcall(ubd_init);

1164
/*
 * Device initcall: start the I/O helper thread and hook its file descriptor
 * up to UBD_IRQ.
 *
 * Always returns 0.  When the thread cannot be started, io_pid is set to -1
 * and an error is logged; a failing um_request_irq() is logged as well.
 */
static int __init ubd_driver_init(void)
{
	unsigned long stack;
	int err;

	/* Set by CONFIG_BLK_DEV_UBD_SYNC or ubd=sync.*/
	if (global_openflags.s) {
		printk(KERN_INFO "ubd: Synchronous mode\n");
		/* Letting ubd=sync be like using ubd#s= instead of ubd#= is
		 * enough. So use anyway the io thread. */
	}

	stack = alloc_stack(0, 0);
	io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *),
				 &thread_fd);

	if (io_pid >= 0) {
		err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr,
				     0, "ubd", ubd_devs);
		if (err)
			printk(KERN_ERR "um_request_irq failed - errno = %d\n",
			       -err);
		return 0;
	}

	printk(KERN_ERR
	       "ubd : Failed to start I/O thread (errno = %d) - "
	       "falling back to synchronous I/O\n", -io_pid);
	io_pid = -1;
	return 0;
}

device_initcall(ubd_driver_init);

A
Al Viro 已提交
1193
/*
 * Block device open: the first opener opens the backing file(s) through
 * ubd_open_dev(); every successful open bumps the use count and refreshes
 * the disk's read-only flag from openflags.w.
 *
 * Returns 0 on success or the error from ubd_open_dev().
 */
static int ubd_open(struct block_device *bdev, fmode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	struct ubd *ubd_dev = disk->private_data;
	int err = 0;

	mutex_lock(&ubd_mutex);

	if (ubd_dev->count == 0) {
		err = ubd_open_dev(ubd_dev);
		if (err)
			printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n",
			       disk->disk_name, ubd_dev->file, -err);
	}

	if (!err) {
		ubd_dev->count++;
		set_disk_ro(disk, !ubd_dev->openflags.w);
		/*
		 * An old check that rejected FMODE_WRITE opens of read-only
		 * devices with -EROFS used to live here.  It was disabled
		 * because it should no longer be needed and never managed to
		 * exclude read-write remounting of filesystems anyway.
		 */
	}

	mutex_unlock(&ubd_mutex);
	return err;
}

1222
/*
 * Block device release: drop one reference and close the backing file(s)
 * through ubd_close_dev() once the last opener is gone.
 */
static void ubd_release(struct gendisk *disk, fmode_t mode)
{
	struct ubd *ubd_dev = disk->private_data;

	mutex_lock(&ubd_mutex);
	ubd_dev->count--;
	if (ubd_dev->count == 0)
		ubd_close_dev(ubd_dev);
	mutex_unlock(&ubd_mutex);
}

1232 1233 1234 1235
/*
 * Mark the sectors covered by a write in the COW bitmap.
 *
 * @io_offset:     byte offset of the I/O
 * @length:        byte length of the I/O
 * @cow_mask:      optional per-request sector mask; bit i is set for every
 *                 sector i of the I/O when non-NULL
 * @cow_offset:    out: byte offset at which the two saved bitmap words are
 *                 to be written (only written when the bitmap changed)
 * @bitmap:        in-memory COW bitmap, one bit per sector
 * @bitmap_offset: added to the word's byte offset to form *cow_offset
 * @bitmap_words:  out: the two bitmap words starting at the touched word
 * @bitmap_len:    bitmap length, used to detect the last word
 *
 * When every sector was already marked, nothing besides @cow_mask is
 * touched.
 */
static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
			  __u64 *cow_offset, unsigned long *bitmap,
			  __u64 bitmap_offset, unsigned long *bitmap_words,
			  __u64 bitmap_len)
{
	__u64 first = io_offset >> SECTOR_SHIFT;
	int nsectors = length >> SECTOR_SHIFT;
	int idx, dirty = 0;
	__u64 word;

	for (idx = 0; idx < nsectors; idx++) {
		if (cow_mask != NULL)
			ubd_set_bit(idx, (unsigned char *) cow_mask);

		if (!ubd_test_bit(first + idx, (unsigned char *) bitmap)) {
			ubd_set_bit(first + idx, (unsigned char *) bitmap);
			dirty = 1;
		}
	}

	if (!dirty)
		return;

	word = first / (sizeof(unsigned long) * 8);

	/* This takes care of the case where we're exactly at the end of the
	 * device, and word + 1 is off the end.  So, just back it up
	 * by one word.  Thanks to Lynn Kerby for the fix and James McMechan
	 * for the original diagnosis.
	 */
	if (word == (DIV_ROUND_UP(bitmap_len,
				  sizeof(unsigned long)) - 1))
		word--;

	bitmap_words[0] = bitmap[word];
	bitmap_words[1] = bitmap[word + 1];

	/* Convert the word index into a byte offset for the bitmap write. */
	*cow_offset = word * sizeof(unsigned long) + bitmap_offset;
}

/*
 * Prepare the COW bookkeeping of one io thread request.
 *
 * For reads, bit i of req->sector_mask is set for every sector i whose bit
 * is set in @bitmap.  For all other operations the work is delegated to
 * cowify_bitmap(), which marks the sectors in @bitmap and fills in
 * req->cow_offset / req->bitmap_words.
 *
 * Panics when the request covers more sectors than sector_mask has bits.
 */
static void cowify_req(struct io_thread_req *req, unsigned long *bitmap,
		       __u64 bitmap_offset, __u64 bitmap_len)
{
	__u64 first = req->offset >> SECTOR_SHIFT;
	unsigned long nsectors;
	int idx;

	if (req->length > (sizeof(req->sector_mask) * 8) << SECTOR_SHIFT)
		panic("Operation too long");

	if (req_op(req->req) != REQ_OP_READ) {
		cowify_bitmap(req->offset, req->length, &req->sector_mask,
			      &req->cow_offset, bitmap, bitmap_offset,
			      req->bitmap_words, bitmap_len);
		return;
	}

	nsectors = req->length >> SECTOR_SHIFT;
	for (idx = 0; idx < nsectors; idx++) {
		if (!ubd_test_bit(first + idx, (unsigned char *) bitmap))
			continue;
		ubd_set_bit(idx, (unsigned char *) &req->sector_mask);
	}
}

1292 1293
/*
 * Build one io_thread_req and hand its pointer to the io thread.
 *
 * @hctx: hardware queue context; its queue's queuedata is the struct ubd
 * @req:  block request this piece belongs to
 * @off:  byte offset of this piece
 * @bvec: data segment, or NULL for an operation without data (the length is
 *        then taken from blk_rq_bytes())
 *
 * Returns the result of os_write_file(): sizeof(pointer) when the request
 * was queued, anything else otherwise (the request structure is freed in
 * that case).  Returns -ENOMEM when the request cannot be allocated.
 */
static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
		u64 off, struct bio_vec *bvec)
{
	struct ubd *dev = hctx->queue->queuedata;
	struct io_thread_req *io_req;
	int ret;

	io_req = kmalloc(sizeof(*io_req), GFP_ATOMIC);
	if (!io_req)
		return -ENOMEM;

	io_req->req = req;
	io_req->error = 0;

	/* fds[0] is the COW file when there is one, else the plain image. */
	io_req->fds[0] = dev->cow.file ? dev->cow.fd : dev->fd;
	io_req->fds[1] = dev->fd;
	io_req->offsets[0] = 0;
	io_req->offsets[1] = dev->cow.data_offset;

	io_req->offset = off;
	io_req->sectorsize = SECTOR_SIZE;
	io_req->sector_mask = 0;
	io_req->cow_offset = -1;

	if (bvec == NULL) {
		io_req->buffer = NULL;
		io_req->length = blk_rq_bytes(req);
	} else {
		io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset;
		io_req->length = bvec->bv_len;
	}

	if (dev->cow.file)
		cowify_req(io_req, dev->cow.bitmap,
			   dev->cow.bitmap_offset, dev->cow.bitmap_len);

	/* Only the pointer travels to the io thread, not the structure. */
	ret = os_write_file(thread_fd, &io_req, sizeof(io_req));
	if (ret == sizeof(io_req))
		return ret;

	if (ret != -EAGAIN)
		pr_err("write to io thread failed: %d\n", -ret);
	kfree(io_req);
	return ret;
}

1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354
static int queue_rw_req(struct blk_mq_hw_ctx *hctx, struct request *req)
{
	struct req_iterator iter;
	struct bio_vec bvec;
	int ret;
	u64 off = (u64)blk_rq_pos(req) << SECTOR_SHIFT;

	rq_for_each_segment(bvec, req, iter) {
		ret = ubd_queue_one_vec(hctx, req, off, &bvec);
		if (ret < 0)
			return ret;
		off += bvec.bv_len;
	}
	return 0;
}

1355 1356
static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
L
Linus Torvalds 已提交
1357
{
1358
	struct ubd *ubd_dev = hctx->queue->queuedata;
1359
	struct request *req = bd->rq;
1360
	int ret = 0, res = BLK_STS_OK;
J
Jeff Dike 已提交
1361

1362 1363
	blk_mq_start_request(req);

1364 1365
	spin_lock_irq(&ubd_dev->lock);

1366 1367 1368
	switch (req_op(req)) {
	/* operations with no lentgth/offset arguments */
	case REQ_OP_FLUSH:
1369
		ret = ubd_queue_one_vec(hctx, req, 0, NULL);
1370 1371 1372 1373 1374 1375 1376 1377
		break;
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		ret = queue_rw_req(hctx, req);
		break;
	default:
		WARN_ON_ONCE(1);
		res = BLK_STS_NOTSUPP;
1378
	}
1379

1380 1381 1382
	spin_unlock_irq(&ubd_dev->lock);

	if (ret < 0)
1383
		blk_mq_requeue_request(req, true);
1384

1385
	return res;
L
Linus Torvalds 已提交
1386 1387
}

1388 1389
static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
1390
	struct ubd *ubd_dev = bdev->bd_disk->private_data;
1391 1392 1393

	geo->heads = 128;
	geo->sectors = 32;
1394
	geo->cylinders = ubd_dev->size / (128 * 32 * 512);
1395 1396 1397
	return 0;
}

A
Al Viro 已提交
1398
/*
 * Block device ioctl handler.
 *
 * HDIO_GET_IDENTITY: copies a zeroed ATA identify block to user space with
 *                    only the cylinders/heads/sectors words filled in.
 * CDROMVOLREAD:      reads the caller's cdrom_volctrl, sets all four
 *                    channels to 255 and copies it back.
 *
 * Returns 0 on success, -EFAULT when a user copy fails and -EINVAL for any
 * other command.
 */
static int ubd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct ubd *ubd_dev = bdev->bd_disk->private_data;
	struct cdrom_volctrl volume;
	u16 identity[ATA_ID_WORDS];

	if (cmd == HDIO_GET_IDENTITY) {
		memset(identity, 0, sizeof(identity));
		identity[ATA_ID_CYLS]		= ubd_dev->size / (128 * 32 * 512);
		identity[ATA_ID_HEADS]		= 128;
		identity[ATA_ID_SECTORS]	= 32;

		if (copy_to_user((char __user *) arg, (char *) identity,
				 sizeof(identity)))
			return -EFAULT;
		return 0;
	}

	if (cmd == CDROMVOLREAD) {
		if (copy_from_user(&volume, (char __user *) arg,
				   sizeof(volume)))
			return -EFAULT;

		volume.channel0 = 255;
		volume.channel1 = 255;
		volume.channel2 = 255;
		volume.channel3 = 255;

		if (copy_to_user((char __user *) arg, &volume, sizeof(volume)))
			return -EFAULT;
		return 0;
	}

	return -EINVAL;
}

1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443
/*
 * Translate a positive errno value into a block layer status code:
 * 0 -> BLK_STS_OK, ENOSYS/EOPNOTSUPP -> BLK_STS_NOTSUPP,
 * ENOSPC -> BLK_STS_NOSPC, everything else -> BLK_STS_IOERR.
 */
static int map_error(int error_code)
{
	if (error_code == 0)
		return BLK_STS_OK;

	if (error_code == ENOSYS || error_code == EOPNOTSUPP)
		return BLK_STS_NOTSUPP;

	if (error_code == ENOSPC)
		return BLK_STS_NOSPC;

	return BLK_STS_IOERR;
}

1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456
/*
 * Everything from here onwards *IS NOT PART OF THE KERNEL*
 *
 * The following functions are part of UML hypervisor code.
 * All functions from here onwards are executed as a helper
 * thread and are not allowed to execute any kernel functions.
 *
 * Any communication must occur strictly via shared memory and IPC.
 *
 * Do not add printks, locks, kernel memory operations, etc - it
 * will result in unpredictable behaviour and/or crashes.
 */

1457
static int update_bitmap(struct io_thread_req *req)
L
Linus Torvalds 已提交
1458
{
1459
	int n;
L
Linus Torvalds 已提交
1460

1461
	if(req->cow_offset == -1)
1462
		return map_error(0);
L
Linus Torvalds 已提交
1463

1464 1465
	n = os_pwrite_file(req->fds[1], &req->bitmap_words,
			  sizeof(req->bitmap_words), req->cow_offset);
1466
	if(n != sizeof(req->bitmap_words))
1467
		return map_error(-n);
L
Linus Torvalds 已提交
1468

1469
	return map_error(0);
1470
}
L
Linus Torvalds 已提交
1471

1472
static void do_io(struct io_thread_req *req)
1473 1474 1475 1476 1477 1478
{
	char *buf;
	unsigned long len;
	int n, nsectors, start, end, bit;
	__u64 off;

1479
	if (req_op(req->req) == REQ_OP_FLUSH) {
1480
		/* fds[0] is always either the rw image or our cow file */
1481
		req->error = map_error(-os_sync_file(req->fds[0]));
1482 1483 1484
		return;
	}

1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499
	nsectors = req->length / req->sectorsize;
	start = 0;
	do {
		bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask);
		end = start;
		while((end < nsectors) &&
		      (ubd_test_bit(end, (unsigned char *)
				    &req->sector_mask) == bit))
			end++;

		off = req->offset + req->offsets[bit] +
			start * req->sectorsize;
		len = (end - start) * req->sectorsize;
		buf = &req->buffer[start * req->sectorsize];

1500
		if (req_op(req->req) == REQ_OP_READ) {
1501 1502 1503 1504
			n = 0;
			do {
				buf = &buf[n];
				len -= n;
1505
				n = os_pread_file(req->fds[bit], buf, len, off);
1506
				if(n < 0){
1507
					req->error = map_error(-n);
1508 1509 1510 1511 1512
					return;
				}
			} while((n < len) && (n != 0));
			if (n < len) memset(&buf[n], 0, len - n);
		} else {
1513
			n = os_pwrite_file(req->fds[bit], buf, len, off);
1514
			if(n != len){
1515
				req->error = map_error(-n);
1516 1517 1518 1519 1520 1521
				return;
			}
		}

		start = end;
	} while(start < nsectors);
L
Linus Torvalds 已提交
1522

1523
	req->error = update_bitmap(req);
L
Linus Torvalds 已提交
1524
}
1525 1526 1527 1528 1529 1530

/* File descriptor used by io_thread(): request pointers are read from it and
 * the processed ones are written back to it.  Starts out as -1.
 * According to the original note it is changed in start_io_thread (defined
 * outside this section); the only call to start_io_thread visible here is
 * made from ubd_driver_init, a device_initcall, rather than from ubd_init
 * as the note used to say.
 */
int kernel_fd = -1;

1531 1532
/* Request counter, only changed by the io thread.
 * XXX: currently unused apart from being incremented.
 */
static int io_count;
1533 1534 1535

int io_thread(void *arg)
{
A
Anton Ivanov 已提交
1536
	int n, count, written, res;
1537

1538 1539
	os_fix_helper_signals();

1540
	while(1){
A
Anton Ivanov 已提交
1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551
		n = bulk_req_safe_read(
			kernel_fd,
			io_req_buffer,
			&io_remainder,
			&io_remainder_size,
			UBD_REQ_BUFFER_SIZE
		);
		if (n < 0) {
			if (n == -EAGAIN) {
				ubd_read_poll(-1);
				continue;
1552 1553
			}
		}
A
Anton Ivanov 已提交
1554 1555 1556 1557 1558 1559 1560 1561 1562 1563

		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
			io_count++;
			do_io((*io_req_buffer)[count]);
		}

		written = 0;

		do {
			res = os_write_file(kernel_fd, ((char *) io_req_buffer) + written, n);
A
Anton Ivanov 已提交
1564
			if (res >= 0) {
A
Anton Ivanov 已提交
1565 1566 1567 1568 1569 1570
				written += res;
			}
			if (written < n) {
				ubd_write_poll(-1);
			}
		} while (written < n);
1571 1572
	}

1573 1574
	return 0;
}