// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>

#define	DM_MSG_PREFIX		"zoned"

#define DMZ_MIN_BIOS		8192

#define DMZ_MAX_DEVS		2

/*
 * Zone BIO context.
 */
struct dmz_bioctx {
	struct dmz_dev		*dev;
	struct dm_zone		*zone;
	struct bio		*bio;
	refcount_t		ref;
};

/*
 * Chunk work descriptor.
 */
struct dm_chunk_work {
	struct work_struct	work;
	refcount_t		refcount;
	struct dmz_target	*target;
	unsigned int		chunk;
	struct bio_list		bio_list;
};

/*
 * Target descriptor.
 */
struct dmz_target {
	struct dm_dev		*ddev[DMZ_MAX_DEVS];

	unsigned long		flags;

	/* Zoned block device information */
	struct dmz_dev		*dev;

	/* For metadata handling */
	struct dmz_metadata     *metadata;

	/* For reclaim */
	struct dmz_reclaim	*reclaim;

	/* For chunk work */
	struct radix_tree_root	chunk_rxtree;
	struct workqueue_struct *chunk_wq;
	struct mutex		chunk_lock;

	/* For cloned BIOs to zones */
	struct bio_set		bio_set;

	/* For flush */
	spinlock_t		flush_lock;
	struct bio_list		flush_list;
	struct delayed_work	flush_work;
	struct workqueue_struct *flush_wq;
};

/*
 * Flush intervals (seconds).
 */
#define DMZ_FLUSH_PERIOD	(10 * HZ)

/*
 * Target BIO completion.
 */
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

	if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
		bio->bi_status = status;
	if (bioctx->dev && bio->bi_status != BLK_STS_OK)
		bioctx->dev->flags |= DMZ_CHECK_BDEV;

	if (refcount_dec_and_test(&bioctx->ref)) {
		struct dm_zone *zone = bioctx->zone;

		if (zone) {
			if (bio->bi_status != BLK_STS_OK &&
			    bio_op(bio) == REQ_OP_WRITE &&
			    dmz_is_seq(zone))
				set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
			dmz_deactivate_zone(zone);
		}
		bio_endio(bio);
	}
}

/*
 * Completion callback for an internally cloned target BIO. This terminates the
 * target BIO when there are no more references to its context.
 */
static void dmz_clone_endio(struct bio *clone)
{
	struct dmz_bioctx *bioctx = clone->bi_private;
	blk_status_t status = clone->bi_status;

	bio_put(clone);
	dmz_bio_endio(bioctx->bio, status);
}

/*
 * Issue a clone of a target BIO. The clone may only partially process the
 * original target BIO.
 */
static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
			  struct bio *bio, sector_t chunk_block,
			  unsigned int nr_blocks)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	struct dmz_dev *dev = dmz_zone_to_dev(dmz->metadata, zone);
	struct bio *clone;

	if (dev->flags & DMZ_BDEV_DYING)
		return -EIO;

	clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
	if (!clone)
		return -ENOMEM;

	bio_set_dev(clone, dev->bdev);
	bioctx->dev = dev;
	clone->bi_iter.bi_sector =
		dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
	clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
	clone->bi_end_io = dmz_clone_endio;
	clone->bi_private = bioctx;

	bio_advance(bio, clone->bi_iter.bi_size);

	refcount_inc(&bioctx->ref);
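	/* Submit the clone: dmz_clone_endio() drops the reference taken above */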
	generic_make_request(clone);

	if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
		zone->wp_block += nr_blocks;

	return 0;
}

/*
 * Zero out pages of discarded blocks accessed by a read BIO.
 */
static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
				 sector_t chunk_block, unsigned int nr_blocks)
{
	unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;

	/* Temporarily limit the BIO size to nr_blocks and zero that range */
	swap(bio->bi_iter.bi_size, size);
	zero_fill_bio(bio);
	swap(bio->bi_iter.bi_size, size);

	bio_advance(bio, size);
}

/*
 * Process a read BIO.
 */
static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
			   struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
	unsigned int nr_blocks = dmz_bio_blocks(bio);
	sector_t end_block = chunk_block + nr_blocks;
	struct dm_zone *rzone, *bzone;
	int ret;

	/* A read into an unmapped chunk only needs to zero the BIO buffer */
	if (!zone) {
		zero_fill_bio(bio);
		return 0;
	}

	DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(dmz_is_rnd(zone) ? "RND" : "SEQ"),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	/* Check block validity to determine the read location */
	bzone = zone->bzone;
	while (chunk_block < end_block) {
		nr_blocks = 0;
		if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
			/* Test block validity in the data zone */
			ret = dmz_block_valid(zmd, zone, chunk_block);
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read data zone blocks */
				nr_blocks = ret;
				rzone = zone;
			}
		}

		/*
		 * No valid blocks found in the data zone.
		 * Check the buffer zone, if there is one.
		 */
		if (!nr_blocks && bzone) {
			ret = dmz_block_valid(zmd, bzone, chunk_block);
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read buffer zone blocks */
				nr_blocks = ret;
				rzone = bzone;
			}
		}

		if (nr_blocks) {
			/* Valid blocks found: read them */
			nr_blocks = min_t(unsigned int, nr_blocks,
					  end_block - chunk_block);
			ret = dmz_submit_bio(dmz, rzone, bio,
					     chunk_block, nr_blocks);
			if (ret)
				return ret;
			chunk_block += nr_blocks;
		} else {
			/* No valid block: zero out the current BIO block */
			dmz_handle_read_zero(dmz, bio, chunk_block, 1);
			chunk_block++;
		}
	}

	return 0;
}

/*
 * Write blocks directly in a data zone, at the write pointer.
 * If a buffer zone is assigned, invalidate the blocks written
 * in place.
 */
static int dmz_handle_direct_write(struct dmz_target *dmz,
				   struct dm_zone *zone, struct bio *bio,
				   sector_t chunk_block,
				   unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone = zone->bzone;
	int ret;

	if (dmz_is_readonly(zone))
		return -EROFS;

	/* Submit write */
	ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/*
	 * Validate the blocks in the data zone and invalidate
	 * in the buffer zone, if there is one.
	 */
	ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
	if (ret == 0 && bzone)
		ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);

	return ret;
}

/*
 * Write blocks in the buffer zone of @zone.
 * If no buffer zone is assigned yet, get one.
 * Called with @zone write locked.
 */
static int dmz_handle_buffered_write(struct dmz_target *dmz,
				     struct dm_zone *zone, struct bio *bio,
				     sector_t chunk_block,
				     unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone;
	int ret;

	/* Get the buffer zone. One will be allocated if needed */
	bzone = dmz_get_chunk_buffer(zmd, zone);
	if (IS_ERR(bzone))
		return PTR_ERR(bzone);

	if (dmz_is_readonly(bzone))
		return -EROFS;

	/* Submit write */
	ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/*
	 * Validate the blocks in the buffer zone
	 * and invalidate in the data zone.
	 */
	ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
	if (ret == 0 && chunk_block < zone->wp_block)
		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);

	return ret;
}

/*
 * Process a write BIO.
 */
static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
			    struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
	unsigned int nr_blocks = dmz_bio_blocks(bio);

	if (!zone)
		return -ENOSPC;

	DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(dmz_is_rnd(zone) ? "RND" : "SEQ"),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) {
		/*
		 * The zone is a random zone, or it is a sequential zone
		 * and the BIO is aligned to the zone write pointer:
		 * write directly to the zone.
		 */
		return dmz_handle_direct_write(dmz, zone, bio,
					       chunk_block, nr_blocks);
	}

	/*
	 * This is an unaligned write in a sequential zone:
	 * use buffered write.
	 */
	return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
}

/*
 * Process a discard BIO.
 */
static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
			      struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t block = dmz_bio_block(bio);
	unsigned int nr_blocks = dmz_bio_blocks(bio);
	sector_t chunk_block = dmz_chunk_block(zmd, block);
	int ret = 0;

	/* For unmapped chunks, there is nothing to do */
	if (!zone)
		return 0;

	if (dmz_is_readonly(zone))
		return -EROFS;

	DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
		dmz_metadata_label(dmz->metadata),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	/*
	 * Invalidate blocks in the data zone and its
	 * buffer zone if one is mapped.
	 */
	if (dmz_is_rnd(zone) || chunk_block < zone->wp_block)
		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
	if (ret == 0 && zone->bzone)
		ret = dmz_invalidate_blocks(zmd, zone->bzone,
					    chunk_block, nr_blocks);
	return ret;
}

/*
 * Process a BIO.
 */
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
			   struct bio *bio)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *zone;
	int ret;

	/*
	 * A write may trigger a zone allocation, so make sure
	 * the allocation can succeed.
	 */
	if (bio_op(bio) == REQ_OP_WRITE)
		dmz_schedule_reclaim(dmz->reclaim);

	dmz_lock_metadata(zmd);

	/*
	 * Get the data zone mapping the chunk. There may be no
	 * mapping for read and discard. If a mapping is obtained,
	 * the zone returned will be set to the active state.
	 */
	zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
				     bio_op(bio));
	if (IS_ERR(zone)) {
		ret = PTR_ERR(zone);
		goto out;
	}

	/* Process the BIO */
	if (zone) {
		dmz_activate_zone(zone);
		bioctx->zone = zone;
	}

	switch (bio_op(bio)) {
	case REQ_OP_READ:
		ret = dmz_handle_read(dmz, zone, bio);
		break;
	case REQ_OP_WRITE:
		ret = dmz_handle_write(dmz, zone, bio);
		break;
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		ret = dmz_handle_discard(dmz, zone, bio);
		break;
	default:
		DMERR("(%s): Unsupported BIO operation 0x%x",
		      dmz_metadata_label(dmz->metadata), bio_op(bio));
		ret = -EIO;
	}

	/*
	 * Release the chunk mapping. This will check that the mapping
	 * is still valid, that is, that the zone used still has valid blocks.
	 */
	if (zone)
		dmz_put_chunk_mapping(zmd, zone);
out:
	dmz_bio_endio(bio, errno_to_blk_status(ret));

	dmz_unlock_metadata(zmd);
}

/*
 * Increment a chunk reference counter.
 */
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
	refcount_inc(&cw->refcount);
}

/*
 * Decrement a chunk work reference count and
 * free it if it becomes 0.
 */
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
	if (refcount_dec_and_test(&cw->refcount)) {
		WARN_ON(!bio_list_empty(&cw->bio_list));
		radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
		kfree(cw);
	}
}

/*
 * Chunk BIO work function.
 */
static void dmz_chunk_work(struct work_struct *work)
{
	struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
	struct dmz_target *dmz = cw->target;
	struct bio *bio;

	mutex_lock(&dmz->chunk_lock);

	/* Process the chunk BIOs */
	while ((bio = bio_list_pop(&cw->bio_list))) {
		mutex_unlock(&dmz->chunk_lock);
		dmz_handle_bio(dmz, cw, bio);
		mutex_lock(&dmz->chunk_lock);
		dmz_put_chunk_work(cw);
	}

	/* Queueing the work incremented the work refcount */
	dmz_put_chunk_work(cw);

	mutex_unlock(&dmz->chunk_lock);
}

/*
 * Flush work.
 */
static void dmz_flush_work(struct work_struct *work)
{
	struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
	struct bio *bio;
	int ret;

	/* Flush dirty metadata blocks */
	ret = dmz_flush_metadata(dmz->metadata);
	if (ret)
		DMDEBUG("(%s): Metadata flush failed, rc=%d",
			dmz_metadata_label(dmz->metadata), ret);

	/* Process queued flush requests */
	while (1) {
		spin_lock(&dmz->flush_lock);
		bio = bio_list_pop(&dmz->flush_list);
		spin_unlock(&dmz->flush_lock);

		if (!bio)
			break;

		dmz_bio_endio(bio, errno_to_blk_status(ret));
	}

	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
}

/*
 * Get a chunk work and start it to process a new BIO.
 * If the BIO chunk has no work yet, create one.
 */
static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
	unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
	struct dm_chunk_work *cw;
	int ret = 0;

	mutex_lock(&dmz->chunk_lock);

	/* Get the BIO chunk work. If one is not active yet, create one */
	cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
	if (cw) {
		dmz_get_chunk_work(cw);
	} else {
		/* Create a new chunk work */
		cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
		if (unlikely(!cw)) {
			ret = -ENOMEM;
			goto out;
		}

		INIT_WORK(&cw->work, dmz_chunk_work);
		refcount_set(&cw->refcount, 1);
		cw->target = dmz;
		cw->chunk = chunk;
		bio_list_init(&cw->bio_list);

		ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
		if (unlikely(ret)) {
			kfree(cw);
			goto out;
		}
	}

	bio_list_add(&cw->bio_list, bio);

	dmz_reclaim_bio_acc(dmz->reclaim);
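	/*
	 * If the work was not already pending, queue_work() returns true:
	 * take an extra reference, dropped at the end of dmz_chunk_work().
	 */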
	if (queue_work(dmz->chunk_wq, &cw->work))
		dmz_get_chunk_work(cw);
out:
	mutex_unlock(&dmz->chunk_lock);
	return ret;
}

/*
 * Check if the backing device is being removed. If it's on the way out,
 * start failing I/O. Reclaim and metadata components also call this
 * function to cleanly abort operation in the event of such failure.
 */
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
	if (dmz_dev->flags & DMZ_BDEV_DYING)
		return true;

	if (dmz_dev->flags & DMZ_CHECK_BDEV)
		return !dmz_check_bdev(dmz_dev);

	if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
		dmz_dev_warn(dmz_dev, "Backing device queue dying");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	}

	return dmz_dev->flags & DMZ_BDEV_DYING;
}

/*
 * Check the backing device availability. This detects such events as
 * backing device going offline due to errors, media removals, etc.
 * This check is less efficient than dmz_bdev_is_dying() and should
 * only be performed as a part of error handling.
 */
bool dmz_check_bdev(struct dmz_dev *dmz_dev)
{
	struct gendisk *disk;

	dmz_dev->flags &= ~DMZ_CHECK_BDEV;

	if (dmz_bdev_is_dying(dmz_dev))
		return false;

	disk = dmz_dev->bdev->bd_disk;
	if (disk->fops->check_events &&
	    disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
		dmz_dev_warn(dmz_dev, "Backing device offline");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	}

	return !(dmz_dev->flags & DMZ_BDEV_DYING);
}

/*
 * Process a new BIO.
 */
static int dmz_map(struct dm_target *ti, struct bio *bio)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_metadata *zmd = dmz->metadata;
	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	sector_t sector = bio->bi_iter.bi_sector;
	unsigned int nr_sectors = bio_sectors(bio);
	sector_t chunk_sector;
	int ret;

	if (dmz_dev_is_dying(zmd))
		return DM_MAPIO_KILL;

	DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		bio_op(bio), (unsigned long long)sector, nr_sectors,
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)),
		(unsigned int)dmz_bio_blocks(bio));

	if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
		return DM_MAPIO_REMAPPED;

	/* The BIO should be block aligned */
	if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
		return DM_MAPIO_KILL;

	/* Initialize the BIO context */
	bioctx->dev = NULL;
	bioctx->zone = NULL;
	bioctx->bio = bio;
	refcount_set(&bioctx->ref, 1);

	/* An empty write BIO is a flush: add it to the flush list */
	if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
		spin_lock(&dmz->flush_lock);
		bio_list_add(&dmz->flush_list, bio);
		spin_unlock(&dmz->flush_lock);
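		/* Kick the flush work immediately */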
		mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
		return DM_MAPIO_SUBMITTED;
	}

	/* Split zone BIOs to fit entirely into a zone */
	chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
	if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
		dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);

	/* Now ready to handle this BIO */
	ret = dmz_queue_chunk_work(dmz, bio);
	if (ret) {
		DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
			dmz_metadata_label(zmd),
			bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
			ret);
		return DM_MAPIO_REQUEUE;
	}

	return DM_MAPIO_SUBMITTED;
}

/*
 * Get zoned device information.
 */
static int dmz_get_zoned_device(struct dm_target *ti, char *path,
				int idx, int nr_devs)
{
	struct dmz_target *dmz = ti->private;
	struct dm_dev *ddev;
	struct dmz_dev *dev;
	int ret;
	struct block_device *bdev;

	/* Get the target device */
	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
	if (ret) {
		ti->error = "Get target device failed";
		return ret;
	}

	bdev = ddev->bdev;
	if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
		if (nr_devs == 1) {
			ti->error = "Invalid regular device";
			goto err;
		}
		if (idx != 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		if (dmz->ddev[0]) {
			ti->error = "Too many regular devices";
			goto err;
		}
		dev = &dmz->dev[idx];
		dev->flags = DMZ_BDEV_REGULAR;
	} else {
		if (dmz->ddev[idx]) {
			ti->error = "Too many zoned devices";
			goto err;
		}
		if (nr_devs > 1 && idx == 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		dev = &dmz->dev[idx];
	}
	dev->bdev = bdev;
	(void)bdevname(dev->bdev, dev->name);

	dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
	if (ti->begin) {
		ti->error = "Partial mapping is not supported";
		goto err;
	}

	dmz->ddev[idx] = ddev;

	return 0;
err:
	dm_put_device(ti, ddev);
	return -EINVAL;
}

/*
 * Cleanup zoned device information.
 */
static void dmz_put_zoned_device(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	for (i = 0; i < DMZ_MAX_DEVS; i++) {
		if (dmz->ddev[i]) {
			dm_put_device(ti, dmz->ddev[i]);
			dmz->ddev[i] = NULL;
		}
	}
}

static int dmz_fixup_devices(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_dev *reg_dev, *zoned_dev;
	struct request_queue *q;

	/*
	 * When we have two devices, the first one must be a regular block
	 * device and the second a zoned block device.
	 */
	if (dmz->ddev[0] && dmz->ddev[1]) {
		reg_dev = &dmz->dev[0];
		if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
			ti->error = "Primary disk is not a regular device";
			return -EINVAL;
		}
		zoned_dev = &dmz->dev[1];
		if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
			ti->error = "Secondary disk is not a zoned device";
			return -EINVAL;
		}
	} else {
		reg_dev = NULL;
		zoned_dev = &dmz->dev[0];
		if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
			ti->error = "Disk is not a zoned device";
			return -EINVAL;
		}
	}
	q = bdev_get_queue(zoned_dev->bdev);
	zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
	zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);

	if (reg_dev) {
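		/*
		 * The regular device has no real zones: split it into emulated
		 * zones of the same size as the zoned device zones.
		 */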
		reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors;
		reg_dev->nr_zones =
			DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
					      reg_dev->zone_nr_sectors);
		zoned_dev->zone_offset = reg_dev->nr_zones;
	}
	return 0;
}

/*
 * Setup target.
 */
static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dmz_target *dmz;
	int ret;

	/* Check arguments */
	if (argc < 1 || argc > 2) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	/* Allocate and initialize the target descriptor */
	dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
	if (!dmz) {
		ti->error = "Unable to allocate the zoned target descriptor";
		return -ENOMEM;
	}
	dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL);
	if (!dmz->dev) {
		ti->error = "Unable to allocate the zoned device descriptors";
		kfree(dmz);
		return -ENOMEM;
	}
	ti->private = dmz;

	/* Get the target zoned block device */
	ret = dmz_get_zoned_device(ti, argv[0], 0, argc);
	if (ret)
		goto err;

	if (argc == 2) {
		ret = dmz_get_zoned_device(ti, argv[1], 1, argc);
		if (ret) {
			dmz_put_zoned_device(ti);
			goto err;
		}
	}
	ret = dmz_fixup_devices(ti);
	if (ret) {
		dmz_put_zoned_device(ti);
		goto err;
	}

	/* Initialize metadata */
	ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
			       dm_table_device_name(ti->table));
	if (ret) {
		ti->error = "Metadata initialization failed";
		goto err_dev;
	}

	/* Set target (no write same support) */
	ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata) << 9;
	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_zeroes_bios = 1;
	ti->per_io_data_size = sizeof(struct dmz_bioctx);
	ti->flush_supported = true;
	ti->discards_supported = true;

	/* The exposed capacity is the number of chunks that can be mapped */
	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
		dmz_zone_nr_sectors_shift(dmz->metadata);

	/* Zone BIO */
	ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
	if (ret) {
		ti->error = "Create BIO set failed";
		goto err_meta;
	}

	/* Chunk BIO work */
	mutex_init(&dmz->chunk_lock);
	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s",
					WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
					dmz_metadata_label(dmz->metadata));
	if (!dmz->chunk_wq) {
		ti->error = "Create chunk workqueue failed";
		ret = -ENOMEM;
		goto err_bio;
	}

	/* Flush work */
	spin_lock_init(&dmz->flush_lock);
	bio_list_init(&dmz->flush_list);
	INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
	dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
						dmz_metadata_label(dmz->metadata));
	if (!dmz->flush_wq) {
		ti->error = "Create flush workqueue failed";
		ret = -ENOMEM;
		goto err_cwq;
	}
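	/* Start the periodic metadata flush work */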
	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);

	/* Initialize reclaim */
	ret = dmz_ctr_reclaim(dmz->metadata, &dmz->reclaim);
	if (ret) {
		ti->error = "Zone reclaim initialization failed";
		goto err_fwq;
	}

	DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)",
	       dmz_metadata_label(dmz->metadata),
	       (unsigned long long)ti->len,
	       (unsigned long long)dmz_sect2blk(ti->len));

	return 0;
err_fwq:
	destroy_workqueue(dmz->flush_wq);
err_cwq:
	destroy_workqueue(dmz->chunk_wq);
err_bio:
	mutex_destroy(&dmz->chunk_lock);
	bioset_exit(&dmz->bio_set);
err_meta:
	dmz_dtr_metadata(dmz->metadata);
err_dev:
	dmz_put_zoned_device(ti);
err:
	kfree(dmz->dev);
	kfree(dmz);

	return ret;
}

/*
 * Cleanup target.
 */
static void dmz_dtr(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;

	flush_workqueue(dmz->chunk_wq);
	destroy_workqueue(dmz->chunk_wq);

	dmz_dtr_reclaim(dmz->reclaim);

	cancel_delayed_work_sync(&dmz->flush_work);
	destroy_workqueue(dmz->flush_wq);

	(void) dmz_flush_metadata(dmz->metadata);

	dmz_dtr_metadata(dmz->metadata);

	bioset_exit(&dmz->bio_set);

	dmz_put_zoned_device(ti);

	mutex_destroy(&dmz->chunk_lock);

	kfree(dmz->dev);
	kfree(dmz);
}

/*
 * Setup target request queue limits.
 */
static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dmz_target *dmz = ti->private;
	unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata);

	limits->logical_block_size = DMZ_BLOCK_SIZE;
	limits->physical_block_size = DMZ_BLOCK_SIZE;

	blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
	blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);

	limits->discard_alignment = DMZ_BLOCK_SIZE;
	limits->discard_granularity = DMZ_BLOCK_SIZE;
	limits->max_discard_sectors = chunk_sectors;
	limits->max_hw_discard_sectors = chunk_sectors;
	limits->max_write_zeroes_sectors = chunk_sectors;

	/* FS hint to try to align to the device zone size */
	limits->chunk_sectors = chunk_sectors;
	limits->max_sectors = chunk_sectors;

	/* We are exposing a drive-managed zoned block device */
	limits->zoned = BLK_ZONED_NONE;
}

/*
 * Pass on ioctl to the backend device.
 */
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_dev *dev = &dmz->dev[0];

	if (!dmz_check_bdev(dev))
		return -EIO;

	*bdev = dev->bdev;

	return 0;
}

/*
 * Stop works on suspend.
 */
static void dmz_suspend(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;

	flush_workqueue(dmz->chunk_wq);
	dmz_suspend_reclaim(dmz->reclaim);
	cancel_delayed_work_sync(&dmz->flush_work);
}

/*
 * Restart works on resume or if suspend failed.
 */
static void dmz_resume(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;

	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
	dmz_resume_reclaim(dmz->reclaim);
}

static int dmz_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct dmz_target *dmz = ti->private;
	unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
	sector_t capacity;
	int r;

	capacity = dmz->dev[0].capacity & ~(zone_nr_sectors - 1);
	r = fn(ti, dmz->ddev[0], 0, capacity, data);
	if (!r && dmz->ddev[1]) {
		capacity = dmz->dev[1].capacity & ~(zone_nr_sectors - 1);
		r = fn(ti, dmz->ddev[1], 0, capacity, data);
	}
	return r;
}

static void dmz_status(struct dm_target *ti, status_type_t type,
		       unsigned int status_flags, char *result,
		       unsigned int maxlen)
{
	struct dmz_target *dmz = ti->private;
	ssize_t sz = 0;
	char buf[BDEVNAME_SIZE];
	struct dmz_dev *dev;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u zones %u/%u random %u/%u sequential",
		       dmz_nr_zones(dmz->metadata),
		       dmz_nr_unmap_rnd_zones(dmz->metadata),
		       dmz_nr_rnd_zones(dmz->metadata),
		       dmz_nr_unmap_seq_zones(dmz->metadata),
		       dmz_nr_seq_zones(dmz->metadata));
		break;
	case STATUSTYPE_TABLE:
		dev = &dmz->dev[0];
		format_dev_t(buf, dev->bdev->bd_dev);
		DMEMIT("%s", buf);
		if (dmz->dev[1].bdev) {
			dev = &dmz->dev[1];
			format_dev_t(buf, dev->bdev->bd_dev);
			DMEMIT(" %s", buf);
		}
		break;
	}
	return;
}

static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
		       char *result, unsigned int maxlen)
{
	struct dmz_target *dmz = ti->private;
	int r = -EINVAL;

	if (!strcasecmp(argv[0], "reclaim")) {
		dmz_schedule_reclaim(dmz->reclaim);
		r = 0;
	} else
		DMERR("unrecognized message %s", argv[0]);
	return r;
}

static struct target_type dmz_type = {
	.name		 = "zoned",
	.version	 = {2, 0, 0},
	.features	 = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
	.module		 = THIS_MODULE,
	.ctr		 = dmz_ctr,
	.dtr		 = dmz_dtr,
	.map		 = dmz_map,
	.io_hints	 = dmz_io_hints,
	.prepare_ioctl	 = dmz_prepare_ioctl,
	.postsuspend	 = dmz_suspend,
	.resume		 = dmz_resume,
	.iterate_devices = dmz_iterate_devices,
	.status		 = dmz_status,
	.message	 = dmz_message,
};

static int __init dmz_init(void)
{
	return dm_register_target(&dmz_type);
}

static void __exit dmz_exit(void)
{
	dm_unregister_target(&dmz_type);
}

module_init(dmz_init);
module_exit(dmz_exit);

MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
MODULE_LICENSE("GPL");