// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>

/* Message prefix; presumably picked up by the DM logging macros (DMERR, ...) */
#define	DM_MSG_PREFIX		"zoned"

/* Pool size given to bioset_init() for the clone BIO set (see dmz_ctr()) */
#define DMZ_MIN_BIOS		8192

H
Hannes Reinecke 已提交
16 17
/* Maximum number of backing devices; sizes the ddev[] array of the target */
#define DMZ_MAX_DEVS		2

18 19 20 21
/*
 * Zone BIO context.
 */
struct dmz_bioctx {
22
	struct dmz_dev		*dev;
23 24
	struct dm_zone		*zone;
	struct bio		*bio;
25
	refcount_t		ref;
26 27 28 29 30 31 32
};

/*
 * Chunk work descriptor.
 */
struct dm_chunk_work {
	struct work_struct	work;
33
	refcount_t		refcount;
34 35 36 37 38 39 40 41 42
	struct dmz_target	*target;
	unsigned int		chunk;
	struct bio_list		bio_list;
};

/*
 * Target descriptor.
 */
struct dmz_target {
H
Hannes Reinecke 已提交
43
	struct dm_dev		*ddev[DMZ_MAX_DEVS];
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58

	unsigned long		flags;

	/* Zoned block device information */
	struct dmz_dev		*dev;

	/* For metadata handling */
	struct dmz_metadata     *metadata;

	/* For reclaim */
	struct dmz_reclaim	*reclaim;

	/* For chunk work */
	struct radix_tree_root	chunk_rxtree;
	struct workqueue_struct *chunk_wq;
59
	struct mutex		chunk_lock;
60 61

	/* For cloned BIOs to zones */
62
	struct bio_set		bio_set;
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80

	/* For flush */
	spinlock_t		flush_lock;
	struct bio_list		flush_list;
	struct delayed_work	flush_work;
	struct workqueue_struct *flush_wq;
};

/*
 * Delay between two periodic metadata flushes: 10 seconds, expressed in
 * jiffies as expected by the delayed work API.
 */
#define DMZ_FLUSH_PERIOD	(10 * HZ)

/*
 * Drop one reference on a target BIO and complete it on the last one.
 *
 * @bio:    target BIO (carries a struct dmz_bioctx as per-bio data)
 * @status: completion status of the reference being dropped
 *
 * Only the first error is recorded in the BIO. Any error on a BIO that
 * went through a device flags that device for a backing device check.
 */
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
	struct dmz_bioctx *ctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	struct dm_zone *zone;

	/* Do not overwrite an error that was already recorded */
	if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
		bio->bi_status = status;

	if (ctx->dev && bio->bi_status != BLK_STS_OK)
		ctx->dev->flags |= DMZ_CHECK_BDEV;

	/* Clones may still be in flight: the last one completes the BIO */
	if (!refcount_dec_and_test(&ctx->ref))
		return;

	zone = ctx->zone;
	if (zone) {
		/* A failed write to a sequential zone is flagged on the zone */
		if (bio->bi_status != BLK_STS_OK &&
		    bio_op(bio) == REQ_OP_WRITE &&
		    dmz_is_seq(zone))
			set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
		dmz_deactivate_zone(zone);
	}

	bio_endio(bio);
}

/*
104
 * Completion callback for an internally cloned target BIO. This terminates the
105 106
 * target BIO when there are no more references to its context.
 */
107
static void dmz_clone_endio(struct bio *clone)
108
{
109 110
	struct dmz_bioctx *bioctx = clone->bi_private;
	blk_status_t status = clone->bi_status;
111

112
	bio_put(clone);
113 114 115 116
	dmz_bio_endio(bioctx->bio, status);
}

/*
117
 * Issue a clone of a target BIO. The clone may only partially process the
118 119
 * original target BIO.
 */
120 121 122
static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
			  struct bio *bio, sector_t chunk_block,
			  unsigned int nr_blocks)
123
{
124 125
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
126
	struct dmz_dev *dev = zone->dev;
127 128
	struct bio *clone;

129 130 131
	if (dev->flags & DMZ_BDEV_DYING)
		return -EIO;

132
	clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
133 134 135
	if (!clone)
		return -ENOMEM;

136 137
	bio_set_dev(clone, dev->bdev);
	bioctx->dev = dev;
138 139
	clone->bi_iter.bi_sector =
		dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
140
	clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
141
	clone->bi_end_io = dmz_clone_endio;
142 143 144 145
	clone->bi_private = bioctx;

	bio_advance(bio, clone->bi_iter.bi_size);

146
	refcount_inc(&bioctx->ref);
147 148
	generic_make_request(clone);

149 150 151
	if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
		zone->wp_block += nr_blocks;

152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
	return 0;
}

/*
 * Zero the next @nr_blocks blocks of a read BIO buffer and advance the
 * BIO past them. Used for blocks that hold no valid data.
 */
static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
				 sector_t chunk_block, unsigned int nr_blocks)
{
	unsigned int zero_bytes = nr_blocks << DMZ_BLOCK_SHIFT;
	unsigned int saved_size = bio->bi_iter.bi_size;

	/* Temporarily shrink the BIO so that only nr_blocks get zeroed */
	bio->bi_iter.bi_size = zero_bytes;
	zero_fill_bio(bio);
	bio->bi_iter.bi_size = saved_size;

	bio_advance(bio, zero_bytes);
}

/*
 * Process a read BIO.
 */
static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
			   struct bio *bio)
{
177 178
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
179 180 181 182 183 184 185 186 187 188 189
	unsigned int nr_blocks = dmz_bio_blocks(bio);
	sector_t end_block = chunk_block + nr_blocks;
	struct dm_zone *rzone, *bzone;
	int ret;

	/* Read into unmapped chunks need only zeroing the BIO buffer */
	if (!zone) {
		zero_fill_bio(bio);
		return 0;
	}

190 191 192
	DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
193 194
		(dmz_is_rnd(zone) ? "RND" :
		 (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
195 196
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);
197 198 199 200 201

	/* Check block validity to determine the read location */
	bzone = zone->bzone;
	while (chunk_block < end_block) {
		nr_blocks = 0;
202 203
		if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
		    chunk_block < zone->wp_block) {
204
			/* Test block validity in the data zone */
205
			ret = dmz_block_valid(zmd, zone, chunk_block);
206 207 208 209 210 211 212 213 214 215 216 217 218 219
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read data zone blocks */
				nr_blocks = ret;
				rzone = zone;
			}
		}

		/*
		 * No valid blocks found in the data zone.
		 * Check the buffer zone, if there is one.
		 */
		if (!nr_blocks && bzone) {
220
			ret = dmz_block_valid(zmd, bzone, chunk_block);
221 222 223 224 225 226 227 228 229 230 231
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read buffer zone blocks */
				nr_blocks = ret;
				rzone = bzone;
			}
		}

		if (nr_blocks) {
			/* Valid blocks found: read them */
232 233 234 235
			nr_blocks = min_t(unsigned int, nr_blocks,
					  end_block - chunk_block);
			ret = dmz_submit_bio(dmz, rzone, bio,
					     chunk_block, nr_blocks);
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
			if (ret)
				return ret;
			chunk_block += nr_blocks;
		} else {
			/* No valid block: zeroout the current BIO block */
			dmz_handle_read_zero(dmz, bio, chunk_block, 1);
			chunk_block++;
		}
	}

	return 0;
}

/*
 * Write blocks directly into a data zone.
 * Once the write is issued, the blocks are marked valid in the data zone
 * and, if the zone has a buffer zone, invalid in that buffer zone.
 * Returns 0 on success, -EROFS for a read-only zone, or the error
 * reported by the submission or by the block validity update.
 */
static int dmz_handle_direct_write(struct dmz_target *dmz,
				   struct dm_zone *zone, struct bio *bio,
				   sector_t chunk_block,
				   unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone = zone->bzone;
	int ret;

	if (dmz_is_readonly(zone))
		return -EROFS;

	/* Issue the write to the data zone */
	ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/* The data zone now holds the current version of these blocks */
	ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
	if (ret || !bzone)
		return ret;

	/* Any copy kept in the buffer zone is now stale */
	return dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);
}

/*
 * Write blocks into the buffer zone of @zone, getting a buffer zone first
 * if none is assigned yet. Called with @zone write locked.
 * Once the write is issued, the blocks are marked valid in the buffer zone
 * and, when they are below the data zone write pointer, invalid in the
 * data zone.
 */
static int dmz_handle_buffered_write(struct dmz_target *dmz,
				     struct dm_zone *zone, struct bio *bio,
				     sector_t chunk_block,
				     unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone = dmz_get_chunk_buffer(zmd, zone);
	int ret;

	/* The buffer zone lookup/allocation may have failed */
	if (IS_ERR(bzone))
		return PTR_ERR(bzone);

	if (dmz_is_readonly(bzone))
		return -EROFS;

	/* Issue the write to the buffer zone */
	ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/* The buffer zone now holds the current version of these blocks */
	ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
	if (ret || chunk_block >= zone->wp_block)
		return ret;

	/* Blocks already written in the data zone are now stale */
	return dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
}

/*
 * Process a write BIO.
 */
static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
			    struct bio *bio)
{
326 327
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
328 329 330 331 332
	unsigned int nr_blocks = dmz_bio_blocks(bio);

	if (!zone)
		return -ENOSPC;

333 334 335
	DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
336 337
		(dmz_is_rnd(zone) ? "RND" :
		 (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
338 339
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);
340

341 342
	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
	    chunk_block == zone->wp_block) {
343 344 345 346 347
		/*
		 * zone is a random zone or it is a sequential zone
		 * and the BIO is aligned to the zone write pointer:
		 * direct write the zone.
		 */
348 349
		return dmz_handle_direct_write(dmz, zone, bio,
					       chunk_block, nr_blocks);
350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367
	}

	/*
	 * This is an unaligned write in a sequential zone:
	 * use buffered write.
	 */
	return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
}

/*
 * Process a discard BIO (also used for write zeroes, see dmz_handle_bio()).
 * The discarded blocks are invalidated in the data zone and in its buffer
 * zone when one is mapped. Nothing is done for an unmapped chunk.
 * Returns -EROFS when the data zone is read-only.
 */
static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
			      struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t block = dmz_bio_block(bio);
	unsigned int nr_blocks = dmz_bio_blocks(bio);
	sector_t chunk_block = dmz_chunk_block(zmd, block);
	int ret;

	if (!zone)
		return 0;

	if (dmz_is_readonly(zone))
		return -EROFS;

	DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
		dmz_metadata_label(dmz->metadata),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	/* A sequential zone holds no blocks at or beyond its write pointer */
	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
	    chunk_block < zone->wp_block) {
		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
		if (ret)
			return ret;
	}

	if (!zone->bzone)
		return 0;

	return dmz_invalidate_blocks(zmd, zone->bzone, chunk_block, nr_blocks);
}

/*
 * Process a BIO.
 */
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
			   struct bio *bio)
{
403 404
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *zone;
	int ret;

	/*
	 * Write may trigger a zone allocation. So make sure the
	 * allocation can succeed.
	 */
	if (bio_op(bio) == REQ_OP_WRITE)
		dmz_schedule_reclaim(dmz->reclaim);

	dmz_lock_metadata(zmd);

	/*
	 * Get the data zone mapping the chunk. There may be no
	 * mapping for read and discard. If a mapping is obtained,
	 + the zone returned will be set to active state.
	 */
423
	zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
				     bio_op(bio));
	if (IS_ERR(zone)) {
		ret = PTR_ERR(zone);
		goto out;
	}

	/* Process the BIO */
	if (zone) {
		dmz_activate_zone(zone);
		bioctx->zone = zone;
	}

	switch (bio_op(bio)) {
	case REQ_OP_READ:
		ret = dmz_handle_read(dmz, zone, bio);
		break;
	case REQ_OP_WRITE:
		ret = dmz_handle_write(dmz, zone, bio);
		break;
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		ret = dmz_handle_discard(dmz, zone, bio);
		break;
	default:
448 449
		DMERR("(%s): Unsupported BIO operation 0x%x",
		      dmz_metadata_label(dmz->metadata), bio_op(bio));
450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469
		ret = -EIO;
	}

	/*
	 * Release the chunk mapping. This will check that the mapping
	 * is still valid, that is, that the zone used still has valid blocks.
	 */
	if (zone)
		dmz_put_chunk_mapping(zmd, zone);
out:
	dmz_bio_endio(bio, errno_to_blk_status(ret));

	dmz_unlock_metadata(zmd);
}

/*
 * Increment a chunk reference counter.
 */
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
470
	refcount_inc(&cw->refcount);
471 472 473 474 475 476 477 478
}

/*
 * Decrement a chunk work reference count and
 * free it if it becomes 0.
 */
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
479
	if (refcount_dec_and_test(&cw->refcount)) {
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521
		WARN_ON(!bio_list_empty(&cw->bio_list));
		radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
		kfree(cw);
	}
}

/*
 * Chunk BIO work function.
 * Processes every BIO queued on the chunk work. The chunk lock protects
 * the BIO list and is released while a BIO is being handled.
 */
static void dmz_chunk_work(struct work_struct *work)
{
	struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
	struct dmz_target *dmz = cw->target;
	struct bio *bio;

	mutex_lock(&dmz->chunk_lock);

	for (;;) {
		bio = bio_list_pop(&cw->bio_list);
		if (!bio)
			break;

		mutex_unlock(&dmz->chunk_lock);
		dmz_handle_bio(dmz, cw, bio);
		mutex_lock(&dmz->chunk_lock);

		/* Drop the reference taken when this BIO was queued */
		dmz_put_chunk_work(cw);
	}

	/* Drop the reference taken when the work was queued */
	dmz_put_chunk_work(cw);

	mutex_unlock(&dmz->chunk_lock);
}

/*
 * Flush work.
 */
static void dmz_flush_work(struct work_struct *work)
{
	struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
	struct bio *bio;
	int ret;

	/* Flush dirty metadata blocks */
	ret = dmz_flush_metadata(dmz->metadata);
522
	if (ret)
523
		DMDEBUG("(%s): Metadata flush failed, rc=%d",
524
			dmz_metadata_label(dmz->metadata), ret);
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544

	/* Process queued flush requests */
	while (1) {
		spin_lock(&dmz->flush_lock);
		bio = bio_list_pop(&dmz->flush_list);
		spin_unlock(&dmz->flush_lock);

		if (!bio)
			break;

		dmz_bio_endio(bio, errno_to_blk_status(ret));
	}

	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
}

/*
 * Get a chunk work and start it to process a new BIO.
 * If the BIO chunk has no work yet, create one.
 */
545
static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
546
{
547
	unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
548
	struct dm_chunk_work *cw;
549
	int ret = 0;
550 551 552 553 554

	mutex_lock(&dmz->chunk_lock);

	/* Get the BIO chunk work. If one is not active yet, create one */
	cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
555 556 557
	if (cw) {
		dmz_get_chunk_work(cw);
	} else {
558
		/* Create a new chunk work */
559
		cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
560 561
		if (unlikely(!cw)) {
			ret = -ENOMEM;
562
			goto out;
563
		}
564 565

		INIT_WORK(&cw->work, dmz_chunk_work);
566
		refcount_set(&cw->refcount, 1);
567 568 569 570 571 572 573 574 575 576 577 578 579
		cw->target = dmz;
		cw->chunk = chunk;
		bio_list_init(&cw->bio_list);

		ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
		if (unlikely(ret)) {
			kfree(cw);
			goto out;
		}
	}

	bio_list_add(&cw->bio_list, bio);

580
	dmz_reclaim_bio_acc(dmz->reclaim);
581 582 583 584
	if (queue_work(dmz->chunk_wq, &cw->work))
		dmz_get_chunk_work(cw);
out:
	mutex_unlock(&dmz->chunk_lock);
585
	return ret;
586 587
}

588
/*
589
 * Check if the backing device is being removed. If it's on the way out,
590 591 592 593 594
 * start failing I/O. Reclaim and metadata components also call this
 * function to cleanly abort operation in the event of such failure.
 */
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
595 596
	if (dmz_dev->flags & DMZ_BDEV_DYING)
		return true;
597

598 599 600 601 602 603
	if (dmz_dev->flags & DMZ_CHECK_BDEV)
		return !dmz_check_bdev(dmz_dev);

	if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
		dmz_dev_warn(dmz_dev, "Backing device queue dying");
		dmz_dev->flags |= DMZ_BDEV_DYING;
604 605 606 607 608
	}

	return dmz_dev->flags & DMZ_BDEV_DYING;
}

609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633
/*
 * Check that a backing device is still available. This catches events
 * such as the device going offline after errors or a media removal.
 * It costs more than dmz_bdev_is_dying() and is meant for error handling
 * paths only.
 * Returns true if the device is usable, false if it is dying.
 */
bool dmz_check_bdev(struct dmz_dev *dmz_dev)
{
	struct gendisk *disk;
	unsigned int events = 0;

	/* The pending check request is being served now */
	dmz_dev->flags &= ~DMZ_CHECK_BDEV;

	if (dmz_bdev_is_dying(dmz_dev))
		return false;

	/* Poll the disk for events when its driver supports that */
	disk = dmz_dev->bdev->bd_disk;
	if (disk->fops->check_events)
		events = disk->fops->check_events(disk, 0);

	if (events & DISK_EVENT_MEDIA_CHANGE) {
		dmz_dev_warn(dmz_dev, "Backing device offline");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	}

	return !(dmz_dev->flags & DMZ_BDEV_DYING);
}

634 635 636 637 638 639
/*
 * Process a new BIO.
 */
static int dmz_map(struct dm_target *ti, struct bio *bio)
{
	struct dmz_target *dmz = ti->private;
640
	struct dmz_metadata *zmd = dmz->metadata;
641 642 643 644
	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	sector_t sector = bio->bi_iter.bi_sector;
	unsigned int nr_sectors = bio_sectors(bio);
	sector_t chunk_sector;
645
	int ret;
646

647
	if (dmz_dev_is_dying(zmd))
648 649
		return DM_MAPIO_KILL;

650 651 652 653 654 655
	DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		bio_op(bio), (unsigned long long)sector, nr_sectors,
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)),
		(unsigned int)dmz_bio_blocks(bio));
656

657
	if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
658 659 660 661 662 663 664
		return DM_MAPIO_REMAPPED;

	/* The BIO should be block aligned */
	if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
		return DM_MAPIO_KILL;

	/* Initialize the BIO context */
665
	bioctx->dev = NULL;
666 667
	bioctx->zone = NULL;
	bioctx->bio = bio;
668
	refcount_set(&bioctx->ref, 1);
669 670

	/* Set the BIO pending in the flush list */
671
	if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
672 673 674 675 676 677 678 679
		spin_lock(&dmz->flush_lock);
		bio_list_add(&dmz->flush_list, bio);
		spin_unlock(&dmz->flush_lock);
		mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
		return DM_MAPIO_SUBMITTED;
	}

	/* Split zone BIOs to fit entirely into a zone */
680 681 682
	chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
	if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
		dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);
683 684

	/* Now ready to handle this BIO */
685 686
	ret = dmz_queue_chunk_work(dmz, bio);
	if (ret) {
687
		DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
688 689 690
			dmz_metadata_label(zmd),
			bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
			ret);
691 692
		return DM_MAPIO_REQUEUE;
	}
693 694 695 696 697 698 699

	return DM_MAPIO_SUBMITTED;
}

/*
 * Get zoned device information.
 */
H
Hannes Reinecke 已提交
700 701
static int dmz_get_zoned_device(struct dm_target *ti, char *path,
				int idx, int nr_devs)
702 703
{
	struct dmz_target *dmz = ti->private;
H
Hannes Reinecke 已提交
704
	struct dm_dev *ddev;
705 706
	struct dmz_dev *dev;
	int ret;
H
Hannes Reinecke 已提交
707
	struct block_device *bdev;
708 709

	/* Get the target device */
H
Hannes Reinecke 已提交
710
	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
711 712 713 714 715
	if (ret) {
		ti->error = "Get target device failed";
		return ret;
	}

H
Hannes Reinecke 已提交
716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741
	bdev = ddev->bdev;
	if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
		if (nr_devs == 1) {
			ti->error = "Invalid regular device";
			goto err;
		}
		if (idx != 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		if (dmz->ddev[0]) {
			ti->error = "Too many regular devices";
			goto err;
		}
		dev = &dmz->dev[idx];
		dev->flags = DMZ_BDEV_REGULAR;
	} else {
		if (dmz->ddev[idx]) {
			ti->error = "Too many zoned devices";
			goto err;
		}
		if (nr_devs > 1 && idx == 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		dev = &dmz->dev[idx];
742
	}
H
Hannes Reinecke 已提交
743
	dev->bdev = bdev;
744 745
	(void)bdevname(dev->bdev, dev->name);

H
Hannes Reinecke 已提交
746 747 748
	dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
	if (ti->begin) {
		ti->error = "Partial mapping is not supported";
749 750 751
		goto err;
	}

H
Hannes Reinecke 已提交
752
	dmz->ddev[idx] = ddev;
753 754 755

	return 0;
err:
H
Hannes Reinecke 已提交
756 757
	dm_put_device(ti, ddev);
	return -EINVAL;
758 759 760 761 762 763 764 765
}

/*
 * Cleanup zoned device information.
 */
static void dmz_put_zoned_device(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
H
Hannes Reinecke 已提交
766
	int i;
767

H
Hannes Reinecke 已提交
768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810
	for (i = 0; i < DMZ_MAX_DEVS; i++) {
		if (dmz->ddev[i]) {
			dm_put_device(ti, dmz->ddev[i]);
			dmz->ddev[i] = NULL;
		}
	}
}

/*
 * Validate the device layout and fill in the zone geometry of the devices.
 *
 * With two devices, the first must be a regular block device and the
 * second a zoned one; the regular device then gets the zone size of the
 * zoned device, and the zones of the zoned device start after those of
 * the regular device. With one device, that device must be zoned.
 * Returns 0 on success or -EINVAL with ti->error set.
 */
static int dmz_fixup_devices(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_dev *reg_dev = NULL;
	struct dmz_dev *zoned_dev;
	struct request_queue *q;

	if (dmz->ddev[0] && dmz->ddev[1]) {
		/* Two devices: regular device first, zoned device second */
		reg_dev = &dmz->dev[0];
		if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
			ti->error = "Primary disk is not a regular device";
			return -EINVAL;
		}
		zoned_dev = &dmz->dev[1];
		if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
			ti->error = "Secondary disk is not a zoned device";
			return -EINVAL;
		}
	} else {
		/* Single device: it has to be the zoned one */
		zoned_dev = &dmz->dev[0];
		if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
			ti->error = "Disk is not a zoned device";
			return -EINVAL;
		}
	}

	/* The zone geometry comes from the zoned device */
	q = bdev_get_queue(zoned_dev->bdev);
	zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
	zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);

	if (!reg_dev)
		return 0;

	/* Divide the regular device capacity into zones of the same size */
	reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors;
	reg_dev->nr_zones = DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
						  reg_dev->zone_nr_sectors);
	zoned_dev->zone_offset = reg_dev->nr_zones;

	return 0;
}

/*
 * Setup target.
 */
static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dmz_target *dmz;
	int ret;

	/* Check arguments */
H
Hannes Reinecke 已提交
828
	if (argc < 1 || argc > 2) {
829 830 831 832 833 834 835 836 837 838
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	/* Allocate and initialize the target descriptor */
	dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
	if (!dmz) {
		ti->error = "Unable to allocate the zoned target descriptor";
		return -ENOMEM;
	}
H
Hannes Reinecke 已提交
839 840 841 842 843 844
	dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL);
	if (!dmz->dev) {
		ti->error = "Unable to allocate the zoned device descriptors";
		kfree(dmz);
		return -ENOMEM;
	}
845 846 847
	ti->private = dmz;

	/* Get the target zoned block device */
H
Hannes Reinecke 已提交
848 849 850 851 852 853 854 855 856 857 858 859
	ret = dmz_get_zoned_device(ti, argv[0], 0, argc);
	if (ret)
		goto err;

	if (argc == 2) {
		ret = dmz_get_zoned_device(ti, argv[1], 1, argc);
		if (ret) {
			dmz_put_zoned_device(ti);
			goto err;
		}
	}
	ret = dmz_fixup_devices(ti);
860
	if (ret) {
H
Hannes Reinecke 已提交
861
		dmz_put_zoned_device(ti);
862 863 864 865
		goto err;
	}

	/* Initialize metadata */
H
Hannes Reinecke 已提交
866
	ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
867
			       dm_table_device_name(ti->table));
868 869 870 871 872 873
	if (ret) {
		ti->error = "Metadata initialization failed";
		goto err_dev;
	}

	/* Set target (no write same support) */
874
	ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata) << 9;
875 876 877 878 879 880 881 882
	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_zeroes_bios = 1;
	ti->per_io_data_size = sizeof(struct dmz_bioctx);
	ti->flush_supported = true;
	ti->discards_supported = true;

	/* The exposed capacity is the number of chunks that can be mapped */
883 884
	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
		dmz_zone_nr_sectors_shift(dmz->metadata);
885 886

	/* Zone BIO */
887 888
	ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
	if (ret) {
889 890 891 892 893 894
		ti->error = "Create BIO set failed";
		goto err_meta;
	}

	/* Chunk BIO work */
	mutex_init(&dmz->chunk_lock);
895
	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
896 897 898
	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s",
					WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
					dmz_metadata_label(dmz->metadata));
899 900 901 902 903 904 905 906 907 908 909
	if (!dmz->chunk_wq) {
		ti->error = "Create chunk workqueue failed";
		ret = -ENOMEM;
		goto err_bio;
	}

	/* Flush work */
	spin_lock_init(&dmz->flush_lock);
	bio_list_init(&dmz->flush_list);
	INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
	dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
910
						dmz_metadata_label(dmz->metadata));
911 912 913 914 915 916 917 918
	if (!dmz->flush_wq) {
		ti->error = "Create flush workqueue failed";
		ret = -ENOMEM;
		goto err_cwq;
	}
	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);

	/* Initialize reclaim */
919
	ret = dmz_ctr_reclaim(dmz->metadata, &dmz->reclaim);
920 921 922 923 924
	if (ret) {
		ti->error = "Zone reclaim initialization failed";
		goto err_fwq;
	}

925 926 927 928
	DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)",
	       dmz_metadata_label(dmz->metadata),
	       (unsigned long long)ti->len,
	       (unsigned long long)dmz_sect2blk(ti->len));
929 930 931 932 933 934 935

	return 0;
err_fwq:
	destroy_workqueue(dmz->flush_wq);
err_cwq:
	destroy_workqueue(dmz->chunk_wq);
err_bio:
936
	mutex_destroy(&dmz->chunk_lock);
937
	bioset_exit(&dmz->bio_set);
938 939 940 941 942
err_meta:
	dmz_dtr_metadata(dmz->metadata);
err_dev:
	dmz_put_zoned_device(ti);
err:
H
Hannes Reinecke 已提交
943
	kfree(dmz->dev);
944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967
	kfree(dmz);

	return ret;
}

/*
 * Cleanup target (device-mapper destructor).
 * Undoes dmz_ctr(): pending work is drained or cancelled first, then the
 * metadata is flushed one last time and every resource is released.
 */
static void dmz_dtr(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;

	/* Let queued chunk BIOs finish before the workqueue goes away */
	flush_workqueue(dmz->chunk_wq);
	destroy_workqueue(dmz->chunk_wq);

	dmz_dtr_reclaim(dmz->reclaim);

	/* The flush work re-arms itself: cancel it before its workqueue */
	cancel_delayed_work_sync(&dmz->flush_work);
	destroy_workqueue(dmz->flush_wq);

	/* Final metadata flush; its result is deliberately ignored */
	(void) dmz_flush_metadata(dmz->metadata);
	dmz_dtr_metadata(dmz->metadata);

	bioset_exit(&dmz->bio_set);
	dmz_put_zoned_device(ti);
	mutex_destroy(&dmz->chunk_lock);

	kfree(dmz->dev);
	kfree(dmz);
}

/*
 * Setup target request queue limits.
 */
static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dmz_target *dmz = ti->private;
984
	unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata);
985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008

	limits->logical_block_size = DMZ_BLOCK_SIZE;
	limits->physical_block_size = DMZ_BLOCK_SIZE;

	blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
	blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);

	limits->discard_alignment = DMZ_BLOCK_SIZE;
	limits->discard_granularity = DMZ_BLOCK_SIZE;
	limits->max_discard_sectors = chunk_sectors;
	limits->max_hw_discard_sectors = chunk_sectors;
	limits->max_write_zeroes_sectors = chunk_sectors;

	/* FS hint to try to align to the device zone size */
	limits->chunk_sectors = chunk_sectors;
	limits->max_sectors = chunk_sectors;

	/* We are exposing a drive-managed zoned block device */
	limits->zoned = BLK_ZONED_NONE;
}

/*
 * Pass on ioctl to the backend device.
 */
1009
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
1010 1011
{
	struct dmz_target *dmz = ti->private;
1012
	struct dmz_dev *dev = &dmz->dev[0];
1013

1014
	if (!dmz_check_bdev(dev))
1015
		return -EIO;
1016

1017
	*bdev = dev->bdev;
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048

	return 0;
}

/*
 * Stop works on suspend (device-mapper postsuspend method).
 * Queued chunk BIOs are drained, then reclaim is suspended and the
 * periodic flush work is cancelled.
 */
static void dmz_suspend(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	struct workqueue_struct *chunk_wq = dmz->chunk_wq;

	flush_workqueue(chunk_wq);
	dmz_suspend_reclaim(dmz->reclaim);
	cancel_delayed_work_sync(&dmz->flush_work);
}

/*
 * Restart works on resume or if suspend failed.
 * Re-arms the periodic flush work and resumes reclaim.
 */
static void dmz_resume(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	struct delayed_work *flush = &dmz->flush_work;

	queue_delayed_work(dmz->flush_wq, flush, DMZ_FLUSH_PERIOD);
	dmz_resume_reclaim(dmz->reclaim);
}

/*
 * Call @fn on each backing device (device-mapper iterate_devices method).
 *
 * @ti:   target
 * @fn:   callout, invoked with the device, a start of 0 and the device
 *        capacity rounded down to a whole number of zones
 * @data: opaque argument passed through to @fn
 *
 * Returns the first non-zero value returned by @fn, or 0.
 */
static int dmz_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct dmz_target *dmz = ti->private;
	/*
	 * Build the zone alignment mask at sector_t width. Inverting the
	 * unsigned int zone size first would zero-extend the mask and wipe
	 * the upper 32 bits of the capacity of large devices.
	 */
	sector_t zone_mask =
		~((sector_t)dmz_zone_nr_sectors(dmz->metadata) - 1);
	int r;

	r = fn(ti, dmz->ddev[0], 0, dmz->dev[0].capacity & zone_mask, data);
	if (r || !dmz->ddev[1])
		return r;

	return fn(ti, dmz->ddev[1], 0, dmz->dev[1].capacity & zone_mask, data);
}

1062 1063 1064 1065 1066 1067 1068
/*
 * Report target status (device-mapper status method).
 * STATUSTYPE_INFO emits the zone counts, given as unmapped/total for the
 * cache, random and sequential zones. STATUSTYPE_TABLE emits the device
 * numbers of the backing devices. Other types emit nothing.
 */
static void dmz_status(struct dm_target *ti, status_type_t type,
		       unsigned int status_flags, char *result,
		       unsigned int maxlen)
{
	struct dmz_target *dmz = ti->private;
	char buf[BDEVNAME_SIZE];
	/* DMEMIT() appends to result and keeps track of the length in sz */
	ssize_t sz = 0;

	if (type == STATUSTYPE_INFO) {
		DMEMIT("%u zones %u/%u cache %u/%u random %u/%u sequential",
		       dmz_nr_zones(dmz->metadata),
		       dmz_nr_unmap_cache_zones(dmz->metadata),
		       dmz_nr_cache_zones(dmz->metadata),
		       dmz_nr_unmap_rnd_zones(dmz->metadata),
		       dmz_nr_rnd_zones(dmz->metadata),
		       dmz_nr_unmap_seq_zones(dmz->metadata),
		       dmz_nr_seq_zones(dmz->metadata));
		return;
	}

	if (type != STATUSTYPE_TABLE)
		return;

	format_dev_t(buf, dmz->dev[0].bdev->bd_dev);
	DMEMIT("%s", buf);

	/* The second device is optional */
	if (dmz->dev[1].bdev) {
		format_dev_t(buf, dmz->dev[1].bdev->bd_dev);
		DMEMIT(" %s", buf);
	}
}

1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
/*
 * Handle a target message (device-mapper message method).
 * The only message understood is "reclaim" (case insensitive), which
 * schedules zone reclaim. Returns 0 on success, -EINVAL otherwise.
 */
static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
		       char *result, unsigned int maxlen)
{
	struct dmz_target *dmz = ti->private;

	if (strcasecmp(argv[0], "reclaim")) {
		DMERR("unrecognized message %s", argv[0]);
		return -EINVAL;
	}

	dmz_schedule_reclaim(dmz->reclaim);
	return 0;
}

1110 1111
static struct target_type dmz_type = {
	.name		 = "zoned",
H
Hannes Reinecke 已提交
1112
	.version	 = {2, 0, 0},
1113 1114 1115 1116 1117 1118 1119 1120 1121 1122
	.features	 = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
	.module		 = THIS_MODULE,
	.ctr		 = dmz_ctr,
	.dtr		 = dmz_dtr,
	.map		 = dmz_map,
	.io_hints	 = dmz_io_hints,
	.prepare_ioctl	 = dmz_prepare_ioctl,
	.postsuspend	 = dmz_suspend,
	.resume		 = dmz_resume,
	.iterate_devices = dmz_iterate_devices,
1123
	.status		 = dmz_status,
1124
	.message	 = dmz_message,
1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142
};

/*
 * Module entry point: register the "zoned" target type.
 * Returns the result of dm_register_target().
 */
static int __init dmz_init(void)
{
	int ret = dm_register_target(&dmz_type);

	return ret;
}

/* Module exit point: unregister the "zoned" target type. */
static void __exit dmz_exit(void)
{
	dm_unregister_target(&dmz_type);
}

/* Module entry/exit hooks and module information */
module_init(dmz_init);
module_exit(dmz_exit);

MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
MODULE_LICENSE("GPL");