blk-zoned.c 12.3 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h>
16
#include <linux/blk-mq.h>
17 18
#include <linux/mm.h>
#include <linux/vmalloc.h>
19
#include <linux/sched/mm.h>
20

21 22
#include "blk.h"

23 24 25
/*
 * Round @sector down to a multiple of the zone size of @q. The rounding is
 * done by masking, which is only a valid round-down when the zone size is a
 * power of two number of sectors.
 */
static inline sector_t blk_zone_start(struct request_queue *q,
				      sector_t sector)
{
	const sector_t zone_sectors = blk_queue_zone_sectors(q);

	return sector & ~(zone_sectors - 1);
}

31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
/*
 * Return true if a request is a write request that needs zone write locking.
 */
bool blk_req_needs_zone_write_lock(struct request *rq)
{
	/* No write lock bitmap on the queue, or a passthrough request */
	if (!rq->q->seq_zones_wlock || blk_rq_is_passthrough(rq))
		return false;

	/* Only write, write same and write zeroes operations qualify */
	if (req_op(rq) != REQ_OP_WRITE &&
	    req_op(rq) != REQ_OP_WRITE_SAME &&
	    req_op(rq) != REQ_OP_WRITE_ZEROES)
		return false;

	return blk_rq_zone_is_seq(rq);
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);

void __blk_req_zone_write_lock(struct request *rq)
{
	if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
					  rq->q->seq_zones_wlock)))
		return;

	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);

/*
 * Release the zone write lock held by @rq: clear RQF_ZONE_WRITE_LOCKED on the
 * request and, when the queue has a seq_zones_wlock bitmap, clear the bit of
 * the target zone, warning once if that bit was not set.
 */
void __blk_req_zone_write_unlock(struct request *rq)
{
	unsigned long *wlock_bitmap;

	rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;

	wlock_bitmap = rq->q->seq_zones_wlock;
	if (!wlock_bitmap)
		return;

	WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), wlock_bitmap));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);

73 74
/**
 * blkdev_nr_zones - Get number of zones
75
 * @disk:	Target gendisk
76
 *
77 78
 * Return the total number of zones of a zoned block device.  For a block
 * device without zone capabilities, the number of zones is always 0.
79
 */
80
unsigned int blkdev_nr_zones(struct gendisk *disk)
81
{
82
	sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);
83

84
	if (!blk_queue_is_zoned(disk->queue))
85
		return 0;
86
	return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
87 88 89
}
EXPORT_SYMBOL_GPL(blkdev_nr_zones);

90 91 92 93
/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
C
Christoph Hellwig 已提交
94 95 96
 * @nr_zones:	Maximum number of zones to report
 * @cb:		Callback function called for each reported zone
 * @data:	Private data for the callback
97 98
 *
 * Description:
C
Christoph Hellwig 已提交
99 100 101 102 103 104 105 106 107
 *    Get zone information starting from the zone containing @sector for at most
 *    @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
108
 */
109
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
C
Christoph Hellwig 已提交
110
			unsigned int nr_zones, report_zones_cb cb, void *data)
111
{
112
	struct gendisk *disk = bdev->bd_disk;
113
	sector_t capacity = get_capacity(disk);
114

C
Christoph Hellwig 已提交
115 116
	if (!blk_queue_is_zoned(bdev_get_queue(bdev)) ||
	    WARN_ON_ONCE(!disk->fops->report_zones))
117
		return -EOPNOTSUPP;
118

C
Christoph Hellwig 已提交
119
	if (!nr_zones || sector >= capacity)
120 121
		return 0;

C
Christoph Hellwig 已提交
122
	return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
123 124 125
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);

126
static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
127
						sector_t sector,
128 129 130 131 132 133
						sector_t nr_sectors)
{
	if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
		return false;

	/*
134 135
	 * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors
	 * of the applicable zone range is the entire disk.
136
	 */
137
	return !sector && nr_sectors == get_capacity(bdev->bd_disk);
138 139
}

140
/**
141
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
142
 * @bdev:	Target block device
143 144 145 146
 * @op:		Operation to be performed on the zones
 * @sector:	Start sector of the first zone to operate on
 * @nr_sectors:	Number of sectors, should be at least the length of one zone and
 *		must be zone size aligned.
147 148 149
 * @gfp_mask:	Memory allocation flags (for bio_alloc)
 *
 * Description:
150
 *    Perform the specified operation on the range of zones specified by
151 152
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
153 154
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
155
 */
156 157 158
int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
		     sector_t sector, sector_t nr_sectors,
		     gfp_t gfp_mask)
159 160
{
	struct request_queue *q = bdev_get_queue(bdev);
161
	sector_t zone_sectors = blk_queue_zone_sectors(q);
162
	sector_t capacity = get_capacity(bdev->bd_disk);
163
	sector_t end_sector = sector + nr_sectors;
164
	struct bio *bio = NULL;
165 166 167 168 169
	int ret;

	if (!blk_queue_is_zoned(q))
		return -EOPNOTSUPP;

170 171 172
	if (bdev_read_only(bdev))
		return -EPERM;

173 174 175
	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

176
	if (!nr_sectors || end_sector > capacity)
177 178 179 180 181 182 183
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (sector & (zone_sectors - 1))
		return -EINVAL;

184
	if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
185 186 187
		return -EINVAL;

	while (sector < end_sector) {
188
		bio = blk_next_bio(bio, 0, gfp_mask);
189
		bio_set_dev(bio, bdev);
190

191 192 193 194
		/*
		 * Special case for the zone reset operation that reset all
		 * zones, this is useful for applications like mkfs.
		 */
195 196
		if (op == REQ_OP_ZONE_RESET &&
		    blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
197 198 199 200
			bio->bi_opf = REQ_OP_ZONE_RESET_ALL;
			break;
		}

201
		bio->bi_opf = op | REQ_SYNC;
202
		bio->bi_iter.bi_sector = sector;
203 204 205 206 207 208
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

209 210 211 212
	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
213
}
214
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
S
Shaun Tancheff 已提交
215

C
Christoph Hellwig 已提交
216 217 218 219 220 221 222 223 224 225 226 227 228 229
/*
 * Private data passed by blkdev_report_zones_ioctl() to the
 * blkdev_copy_zone_to_user() report zones callback.
 */
struct zone_report_args {
	/* User space array of zone descriptors filled by the callback */
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

230
/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 *
 * @arg is the user space address of a struct blk_zone_report header which is
 * immediately followed by the array of struct blk_zone to fill. On success,
 * the header is written back with nr_zones set to the number of zones
 * reported.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
			      unsigned int cmd, unsigned long arg)
{
	void __user *uptr = (void __user *)arg;
	struct zone_report_args cb_args;
	struct blk_zone_report hdr;
	struct request_queue *q;
	int nr_reported;

	if (!uptr)
		return -EINVAL;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;
	if (!blk_queue_is_zoned(q))
		return -ENOTTY;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	if (copy_from_user(&hdr, uptr, sizeof(hdr)))
		return -EFAULT;
	if (!hdr.nr_zones)
		return -EINVAL;

	/* The zone array directly follows the report header */
	cb_args.zones = uptr + sizeof(hdr);
	nr_reported = blkdev_report_zones(bdev, hdr.sector, hdr.nr_zones,
					  blkdev_copy_zone_to_user, &cb_args);
	if (nr_reported < 0)
		return nr_reported;

	/* Return the number of zones reported through the header */
	hdr.nr_zones = nr_reported;
	return copy_to_user(uptr, &hdr, sizeof(hdr)) ? -EFAULT : 0;
}

274
/*
275
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
S
Shaun Tancheff 已提交
276 277
 * Called from blkdev_ioctl.
 */
278 279
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
			   unsigned int cmd, unsigned long arg)
S
Shaun Tancheff 已提交
280 281 282 283
{
	void __user *argp = (void __user *)arg;
	struct request_queue *q;
	struct blk_zone_range zrange;
284
	enum req_opf op;
S
Shaun Tancheff 已提交
285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304

	if (!argp)
		return -EINVAL;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	if (!blk_queue_is_zoned(q))
		return -ENOTTY;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	if (!(mode & FMODE_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
				GFP_KERNEL);
S
Shaun Tancheff 已提交
324
}
325 326 327 328 329 330 331 332 333 334

static inline unsigned long *blk_alloc_zone_bitmap(int node,
						   unsigned int nr_zones)
{
	return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
			    GFP_NOIO, node);
}

/*
 * Free the sequential zones write lock bitmap and the conventional zones
 * bitmap of @q, and reset both queue pointers to NULL.
 */
void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
	kfree(q->seq_zones_wlock);
	q->seq_zones_wlock = NULL;

	kfree(q->conv_zones_bitmap);
	q->conv_zones_bitmap = NULL;
}

C
Christoph Hellwig 已提交
341 342
/*
 * State shared between blk_revalidate_disk_zones() and its report zones
 * callback blk_revalidate_zone_cb().
 *
 * disk:		disk being revalidated
 * conv_zones_bitmap:	bitmap with a bit set for each conventional zone,
 *			allocated when the first conventional zone is seen
 * seq_zones_wlock:	zeroed bitmap allocated when the first sequential
 *			zone is seen
 * nr_zones:		number of zones, computed from the disk capacity and
 *			the length of the zone starting at sector 0
 * zone_sectors:	length of the zone starting at sector 0
 * sector:		expected start sector of the next reported zone
 */
struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	unsigned long	*conv_zones_bitmap;
	unsigned long	*seq_zones_wlock;
	unsigned int	nr_zones;
	sector_t	zone_sectors;
	sector_t	sector;
};

350 351 352
/*
 * Helper function to check the validity of zones of a zoned block device.
 */
C
Christoph Hellwig 已提交
353 354
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
355
{
C
Christoph Hellwig 已提交
356 357
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
358 359 360 361 362 363 364
	struct request_queue *q = disk->queue;
	sector_t capacity = get_capacity(disk);

	/*
	 * All zones must have the same size, with the exception on an eventual
	 * smaller last zone.
	 */
365 366 367 368 369 370
	if (zone->start == 0) {
		if (zone->len == 0 || !is_power_of_2(zone->len)) {
			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
				disk->disk_name, zone->len);
			return -ENODEV;
		}
371

372 373 374 375 376 377 378 379 380 381 382 383 384 385
		args->zone_sectors = zone->len;
		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
	} else if (zone->start + args->zone_sectors < capacity) {
		if (zone->len != args->zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else {
		if (zone->len > args->zone_sectors) {
			pr_warn("%s: Invalid zoned device with larger last zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
386 387 388
	}

	/* Check for holes in the zone report */
C
Christoph Hellwig 已提交
389
	if (zone->start != args->sector) {
390
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
C
Christoph Hellwig 已提交
391 392
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
393 394 395 396 397
	}

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
398 399 400 401 402 403 404 405
		if (!args->conv_zones_bitmap) {
			args->conv_zones_bitmap =
				blk_alloc_zone_bitmap(q->node, args->nr_zones);
			if (!args->conv_zones_bitmap)
				return -ENOMEM;
		}
		set_bit(idx, args->conv_zones_bitmap);
		break;
406 407
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
408 409 410 411 412 413
		if (!args->seq_zones_wlock) {
			args->seq_zones_wlock =
				blk_alloc_zone_bitmap(q->node, args->nr_zones);
			if (!args->seq_zones_wlock)
				return -ENOMEM;
		}
414 415 416 417
		break;
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
C
Christoph Hellwig 已提交
418
		return -ENODEV;
419 420
	}

C
Christoph Hellwig 已提交
421 422 423 424
	args->sector += zone->len;
	return 0;
}

425 426 427 428 429 430
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
 * @disk:	Target disk
 *
 * Helper function for low-level device drivers to (re) allocate and initialize
 * a disk request queue zone bitmaps. This functions should normally be called
431 432 433
 * within the disk ->revalidate method for blk-mq based drivers.  For BIO based
 * drivers only q->nr_zones needs to be updated so that the sysfs exposed value
 * is correct.
434 435 436 437
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
438 439 440
	struct blk_revalidate_zone_args args = {
		.disk		= disk,
	};
441 442
	unsigned int noio_flag;
	int ret;
443

444 445
	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;
446 447
	if (WARN_ON_ONCE(!queue_is_mq(q)))
		return -EIO;
448

449
	/*
450 451
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
452
	 */
453 454 455 456
	noio_flag = memalloc_noio_save();
	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	memalloc_noio_restore(noio_flag);
457 458

	/*
459 460 461
	 * Install the new bitmaps and update nr_zones only once the queue is
	 * stopped and all I/Os are completed (i.e. a scheduler is not
	 * referencing the bitmaps).
462 463
	 */
	blk_mq_freeze_queue(q);
C
Christoph Hellwig 已提交
464
	if (ret >= 0) {
465
		blk_queue_chunk_sectors(q, args.zone_sectors);
466
		q->nr_zones = args.nr_zones;
C
Christoph Hellwig 已提交
467
		swap(q->seq_zones_wlock, args.seq_zones_wlock);
468
		swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
C
Christoph Hellwig 已提交
469 470
		ret = 0;
	} else {
471 472 473
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
		blk_queue_free_zone_bitmaps(q);
	}
C
Christoph Hellwig 已提交
474
	blk_mq_unfreeze_queue(q);
475

C
Christoph Hellwig 已提交
476
	kfree(args.seq_zones_wlock);
477
	kfree(args.conv_zones_bitmap);
478 479 480
	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);