diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index ff0841711fd524063e2acf0fa4644ec2fbb4b764..8ae1cf8e94daeb6cf142e24624d0e5f4d99d728b 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -206,6 +206,9 @@ Optional feature arguments are: in a separate btree, which improves speed of shutting down the cache. + no_discard_passdown : disable passing down discards from the cache + to the origin's data device. + A policy called 'default' is always registered. This is an alias for the policy we currently think is giving best all round performance. diff --git a/Documentation/device-mapper/dm-init.txt b/Documentation/device-mapper/dm-init.txt new file mode 100644 index 0000000000000000000000000000000000000000..8464ee7c01b82256d01559994d33cb6676c0c5a3 --- /dev/null +++ b/Documentation/device-mapper/dm-init.txt @@ -0,0 +1,114 @@ +Early creation of mapped devices +==================================== + +It is possible to configure a device-mapper device to act as the root device for +your system in two ways. + +The first is to build an initial ramdisk which boots to a minimal userspace +which configures the device, then pivot_root(8) in to it. + +The second is to create one or more device-mappers using the module parameter +"dm-mod.create=" through the kernel boot command line argument. + +The format is specified as a string of data separated by commas and optionally +semi-colons, where: + - a comma is used to separate fields like name, uuid, flags and table + (specifies one device) + - a semi-colon is used to separate devices. + +So the format will look like this: + + dm-mod.create=,,,,[,
+][;,,,,
[,
+]+] + +Where, + ::= The device name. + ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "" + ::= The device minor number | "" + ::= "ro" | "rw" +
::= + ::= "verity" | "linear" | ... (see list below) + +The dm line should be equivalent to the one used by the dmsetup tool with the +--concise argument. + +Target types +============ + +Not all target types are available as there are serious risks in allowing +activation of certain DM targets without first using userspace tools to check +the validity of associated metadata. + + "cache": constrained, userspace should verify cache device + "crypt": allowed + "delay": allowed + "era": constrained, userspace should verify metadata device + "flakey": constrained, meant for test + "linear": allowed + "log-writes": constrained, userspace should verify metadata device + "mirror": constrained, userspace should verify main/mirror device + "raid": constrained, userspace should verify metadata device + "snapshot": constrained, userspace should verify src/dst device + "snapshot-origin": allowed + "snapshot-merge": constrained, userspace should verify src/dst device + "striped": allowed + "switch": constrained, userspace should verify dev path + "thin": constrained, requires dm target message from userspace + "thin-pool": constrained, requires dm target message from userspace + "verity": allowed + "writecache": constrained, userspace should verify cache device + "zero": constrained, not meant for rootfs + +If the target is not listed above, it is constrained by default (not tested). + +Examples +======== +An example of booting to a linear array made up of user-mode linux block +devices: + + dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0 + +This will boot to a rw dm-linear target of 8192 sectors split across two block +devices identified by their major:minor numbers. After boot, udev will rename +this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned. + +An example of multiple device-mappers, with the dm-mod.create="..." contents is shown here +split on multiple lines for readability: + + vroot,,,ro, + 0 1740800 verity 254:0 254:0 1740800 sha1 + 76e9be054b15884a9fa85973e9cb274c93afadb6 + 5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe; + vram,,,rw, + 0 32768 linear 1:0 0, + 32768 32768 linear 1:1 0 + +Other examples (per target): + +"crypt": + dm-crypt,,8,ro, + 0 1048576 crypt aes-xts-plain64 + babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0 + /dev/sda 0 1 allow_discards + +"delay": + dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500 + +"linear": + dm-linear,,,rw, + 0 32768 linear /dev/sda1 0, + 32768 1024000 linear /dev/sda2 0, + 1056768 204800 linear /dev/sda3 0, + 1261568 512000 linear /dev/sda4 0 + +"snapshot-origin": + dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2 + +"striped": + dm-striped,,4,ro,0 1638400 striped 4 4096 + /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0 + +"verity": + dm-verity,,4,ro, + 0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256 + fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd + 51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584 diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3db222509e44b456acb46fd2defc1a7978299b8e..2557f198e1750002a1a2d7dacd3fefbb926a7fd7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -436,6 +436,18 @@ config DM_DELAY If unsure, say N. +config DM_INIT + bool "DM \"dm-mod.create=\" parameter support" + depends on BLK_DEV_DM=y + ---help--- + Enable "dm-mod.create=" parameter to create mapped devices at init time. + This option is useful to allow mounting rootfs without requiring an + initramfs. + See Documentation/device-mapper/dm-init.txt for dm-mod.create="..." + format. + + If unsure, say N. + config DM_UEVENT bool "DM uevents" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 822f4e8753bc4b197b93a90df936bd0eeeff464a..a52b703e588e23dfa3a240f8882acc2d1eb29212 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -69,6 +69,10 @@ obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o obj-$(CONFIG_DM_ZONED) += dm-zoned.o obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o +ifeq ($(CONFIG_DM_INIT),y) +dm-mod-objs += dm-init.o +endif + ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index b29a8327eed15641df9000e019c82ad5c1cffedc..d249cf8ac277e4246d8f398c8aa3b2e46fe90724 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -353,6 +353,7 @@ struct cache_features { enum cache_metadata_mode mode; enum cache_io_mode io_mode; unsigned metadata_version; + bool discard_passdown:1; }; struct cache_stats { @@ -1899,7 +1900,11 @@ static bool process_discard_bio(struct cache *cache, struct bio *bio) b = to_dblock(from_dblock(b) + 1); } - bio_endio(bio); + if (cache->features.discard_passdown) { + remap_to_origin(cache, bio); + generic_make_request(bio); + } else + bio_endio(bio); return false; } @@ -2233,13 +2238,14 @@ static void init_features(struct cache_features *cf) cf->mode = CM_WRITE; cf->io_mode = CM_IO_WRITEBACK; cf->metadata_version = 1; + cf->discard_passdown = true; } static int parse_features(struct cache_args *ca, struct dm_arg_set *as, char **error) { static const struct dm_arg _args[] = { - {0, 2, "Invalid number of cache feature arguments"}, + {0, 3, "Invalid number of cache feature arguments"}, }; int r, mode_ctr = 0; @@ -2274,6 +2280,9 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, else if (!strcasecmp(arg, "metadata2")) cf->metadata_version = 2; + else if (!strcasecmp(arg, "no_discard_passdown")) + cf->discard_passdown = false; + else { *error = "Unrecognised cache feature requested"; return -EINVAL; @@ -2496,7 +2505,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; - ti->split_discard_bios = false; ti->per_io_data_size = sizeof(struct per_bio_data); @@ -3120,6 +3128,39 @@ static void cache_resume(struct dm_target *ti) do_waker(&cache->waker.work); } +static void emit_flags(struct cache *cache, char *result, + unsigned maxlen, ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + struct cache_features *cf = &cache->features; + unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; + + DMEMIT("%u ", count); + + if (cf->metadata_version == 2) + DMEMIT("metadata2 "); + + if (writethrough_mode(cache)) + DMEMIT("writethrough "); + + else if (passthrough_mode(cache)) + DMEMIT("passthrough "); + + else if (writeback_mode(cache)) + DMEMIT("writeback "); + + else { + DMEMIT("unknown "); + DMERR("%s: internal error: unknown io mode: %d", + cache_device_name(cache), (int) cf->io_mode); + } + + if (!cf->discard_passdown) + DMEMIT("no_discard_passdown "); + + *sz_ptr = sz; +} + /* * Status format: * @@ -3186,25 +3227,7 @@ static void cache_status(struct dm_target *ti, status_type_t type, (unsigned) atomic_read(&cache->stats.promotion), (unsigned long) atomic_read(&cache->nr_dirty)); - if (cache->features.metadata_version == 2) - DMEMIT("2 metadata2 "); - else - DMEMIT("1 "); - - if (writethrough_mode(cache)) - DMEMIT("writethrough "); - - else if (passthrough_mode(cache)) - DMEMIT("passthrough "); - - else if (writeback_mode(cache)) - DMEMIT("writeback "); - - else { - DMERR("%s: internal error: unknown io mode: %d", - cache_device_name(cache), (int) cache->features.io_mode); - goto err; - } + emit_flags(cache, result, maxlen, &sz); DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); @@ -3433,14 +3456,62 @@ static int cache_iterate_devices(struct dm_target *ti, return r; } +static bool origin_dev_supports_discard(struct block_device *origin_bdev) +{ + struct request_queue *q = bdev_get_queue(origin_bdev); + + return q && blk_queue_discard(q); +} + +/* + * If discard_passdown was enabled verify that the origin device + * supports discards. Disable discard_passdown if not. + */ +static void disable_passdown_if_not_supported(struct cache *cache) +{ + struct block_device *origin_bdev = cache->origin_dev->bdev; + struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; + const char *reason = NULL; + char buf[BDEVNAME_SIZE]; + + if (!cache->features.discard_passdown) + return; + + if (!origin_dev_supports_discard(origin_bdev)) + reason = "discard unsupported"; + + else if (origin_limits->max_discard_sectors < cache->sectors_per_block) + reason = "max discard sectors smaller than a block"; + + if (reason) { + DMWARN("Origin device (%s) %s: Disabling discard passdown.", + bdevname(origin_bdev, buf), reason); + cache->features.discard_passdown = false; + } +} + static void set_discard_limits(struct cache *cache, struct queue_limits *limits) { + struct block_device *origin_bdev = cache->origin_dev->bdev; + struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; + + if (!cache->features.discard_passdown) { + /* No passdown is done so setting own virtual limits */ + limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, + cache->origin_sectors); + limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; + return; + } + /* - * FIXME: these limits may be incompatible with the cache device + * cache_iterate_devices() is stacking both origin and fast device limits + * but discards aren't passed to fast device, so inherit origin's limits. */ - limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, - cache->origin_sectors); - limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; + limits->max_discard_sectors = origin_limits->max_discard_sectors; + limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; + limits->discard_granularity = origin_limits->discard_granularity; + limits->discard_alignment = origin_limits->discard_alignment; + limits->discard_misaligned = origin_limits->discard_misaligned; } static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) @@ -3457,6 +3528,8 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); } + + disable_passdown_if_not_supported(cache); set_discard_limits(cache, limits); } @@ -3464,7 +3537,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 0, 0}, + .version = {2, 1, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c new file mode 100644 index 0000000000000000000000000000000000000000..b53f30f16b4d4f2c02bf9c15e5b801234b8cd9ae --- /dev/null +++ b/drivers/md/dm-init.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * dm-init.c + * Copyright (C) 2017 The Chromium OS Authors + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "init" +#define DM_MAX_DEVICES 256 +#define DM_MAX_TARGETS 256 +#define DM_MAX_STR_SIZE 4096 + +static char *create; + +/* + * Format: dm-mod.create=,,,,
[,
+][;,,,,
[,
+]+] + * Table format: + * + * See Documentation/device-mapper/dm-init.txt for dm-mod.create="..." format + * details. + */ + +struct dm_device { + struct dm_ioctl dmi; + struct dm_target_spec *table[DM_MAX_TARGETS]; + char *target_args_array[DM_MAX_TARGETS]; + struct list_head list; +}; + +const char *dm_allowed_targets[] __initconst = { + "crypt", + "delay", + "linear", + "snapshot-origin", + "striped", + "verity", +}; + +static int __init dm_verify_target_type(const char *target) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dm_allowed_targets); i++) { + if (!strcmp(dm_allowed_targets[i], target)) + return 0; + } + return -EINVAL; +} + +static void __init dm_setup_cleanup(struct list_head *devices) +{ + struct dm_device *dev, *tmp; + unsigned int i; + + list_for_each_entry_safe(dev, tmp, devices, list) { + list_del(&dev->list); + for (i = 0; i < dev->dmi.target_count; i++) { + kfree(dev->table[i]); + kfree(dev->target_args_array[i]); + } + kfree(dev); + } +} + +/** + * str_field_delimit - delimit a string based on a separator char. + * @str: the pointer to the string to delimit. + * @separator: char that delimits the field + * + * Find a @separator and replace it by '\0'. + * Remove leading and trailing spaces. + * Return the remainder string after the @separator. + */ +static char __init *str_field_delimit(char **str, char separator) +{ + char *s; + + /* TODO: add support for escaped characters */ + *str = skip_spaces(*str); + s = strchr(*str, separator); + /* Delimit the field and remove trailing spaces */ + if (s) + *s = '\0'; + *str = strim(*str); + return s ? ++s : NULL; +} + +/** + * dm_parse_table_entry - parse a table entry + * @dev: device to store the parsed information. + * @str: the pointer to a string with the format: + * [, ...] + * + * Return the remainder string after the table entry, i.e, after the comma which + * delimits the entry or NULL if reached the end of the string. + */ +static char __init *dm_parse_table_entry(struct dm_device *dev, char *str) +{ + const unsigned int n = dev->dmi.target_count - 1; + struct dm_target_spec *sp; + unsigned int i; + /* fields: */ + char *field[4]; + char *next; + + field[0] = str; + /* Delimit first 3 fields that are separated by space */ + for (i = 0; i < ARRAY_SIZE(field) - 1; i++) { + field[i + 1] = str_field_delimit(&field[i], ' '); + if (!field[i + 1]) + return ERR_PTR(-EINVAL); + } + /* Delimit last field that can be terminated by comma */ + next = str_field_delimit(&field[i], ','); + + sp = kzalloc(sizeof(*sp), GFP_KERNEL); + if (!sp) + return ERR_PTR(-ENOMEM); + dev->table[n] = sp; + + /* start_sector */ + if (kstrtoull(field[0], 0, &sp->sector_start)) + return ERR_PTR(-EINVAL); + /* num_sector */ + if (kstrtoull(field[1], 0, &sp->length)) + return ERR_PTR(-EINVAL); + /* target_type */ + strscpy(sp->target_type, field[2], sizeof(sp->target_type)); + if (dm_verify_target_type(sp->target_type)) { + DMERR("invalid type \"%s\"", sp->target_type); + return ERR_PTR(-EINVAL); + } + /* target_args */ + dev->target_args_array[n] = kstrndup(field[3], GFP_KERNEL, + DM_MAX_STR_SIZE); + if (!dev->target_args_array[n]) + return ERR_PTR(-ENOMEM); + + return next; +} + +/** + * dm_parse_table - parse "dm-mod.create=" table field + * @dev: device to store the parsed information. + * @str: the pointer to a string with the format: + *
[,
+] + */ +static int __init dm_parse_table(struct dm_device *dev, char *str) +{ + char *table_entry = str; + + while (table_entry) { + DMDEBUG("parsing table \"%s\"", str); + if (++dev->dmi.target_count >= DM_MAX_TARGETS) { + DMERR("too many targets %u > %d", + dev->dmi.target_count, DM_MAX_TARGETS); + return -EINVAL; + } + table_entry = dm_parse_table_entry(dev, table_entry); + if (IS_ERR(table_entry)) { + DMERR("couldn't parse table"); + return PTR_ERR(table_entry); + } + } + + return 0; +} + +/** + * dm_parse_device_entry - parse a device entry + * @dev: device to store the parsed information. + * @str: the pointer to a string with the format: + * name,uuid,minor,flags,table[; ...] + * + * Return the remainder string after the table entry, i.e, after the semi-colon + * which delimits the entry or NULL if reached the end of the string. + */ +static char __init *dm_parse_device_entry(struct dm_device *dev, char *str) +{ + /* There are 5 fields: name,uuid,minor,flags,table; */ + char *field[5]; + unsigned int i; + char *next; + + field[0] = str; + /* Delimit first 4 fields that are separated by comma */ + for (i = 0; i < ARRAY_SIZE(field) - 1; i++) { + field[i+1] = str_field_delimit(&field[i], ','); + if (!field[i+1]) + return ERR_PTR(-EINVAL); + } + /* Delimit last field that can be delimited by semi-colon */ + next = str_field_delimit(&field[i], ';'); + + /* name */ + strscpy(dev->dmi.name, field[0], sizeof(dev->dmi.name)); + /* uuid */ + strscpy(dev->dmi.uuid, field[1], sizeof(dev->dmi.uuid)); + /* minor */ + if (strlen(field[2])) { + if (kstrtoull(field[2], 0, &dev->dmi.dev)) + return ERR_PTR(-EINVAL); + dev->dmi.flags |= DM_PERSISTENT_DEV_FLAG; + } + /* flags */ + if (!strcmp(field[3], "ro")) + dev->dmi.flags |= DM_READONLY_FLAG; + else if (strcmp(field[3], "rw")) + return ERR_PTR(-EINVAL); + /* table */ + if (dm_parse_table(dev, field[4])) + return ERR_PTR(-EINVAL); + + return next; +} + +/** + * dm_parse_devices - parse "dm-mod.create=" argument + * @devices: list of struct dm_device to store the parsed information. + * @str: the pointer to a string with the format: + * [;+] + */ +static int __init dm_parse_devices(struct list_head *devices, char *str) +{ + unsigned long ndev = 0; + struct dm_device *dev; + char *device = str; + + DMDEBUG("parsing \"%s\"", str); + while (device) { + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + list_add_tail(&dev->list, devices); + + if (++ndev >= DM_MAX_DEVICES) { + DMERR("too many targets %u > %d", + dev->dmi.target_count, DM_MAX_TARGETS); + return -EINVAL; + } + + device = dm_parse_device_entry(dev, device); + if (IS_ERR(device)) { + DMERR("couldn't parse device"); + return PTR_ERR(device); + } + } + + return 0; +} + +/** + * dm_init_init - parse "dm-mod.create=" argument and configure drivers + */ +static int __init dm_init_init(void) +{ + struct dm_device *dev; + LIST_HEAD(devices); + char *str; + int r; + + if (!create) + return 0; + + if (strlen(create) >= DM_MAX_STR_SIZE) { + DMERR("Argument is too big. Limit is %d\n", DM_MAX_STR_SIZE); + return -EINVAL; + } + str = kstrndup(create, GFP_KERNEL, DM_MAX_STR_SIZE); + if (!str) + return -ENOMEM; + + r = dm_parse_devices(&devices, str); + if (r) + goto out; + + DMINFO("waiting for all devices to be available before creating mapped devices\n"); + wait_for_device_probe(); + + list_for_each_entry(dev, &devices, list) { + if (dm_early_create(&dev->dmi, dev->table, + dev->target_args_array)) + break; + } +out: + kfree(str); + dm_setup_cleanup(&devices); + return r; +} + +late_initcall(dm_init_init); + +module_param(create, charp, 0); +MODULE_PARM_DESC(create, "Create a mapped device in early boot"); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 457200ca62878ee87b0cd5127540597454e7bbd9..d57d997a52c81cfe6c68918520316f993aeebc44 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1122,7 +1122,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se return r; data = dm_bufio_read(ic->bufio, *metadata_block, &b); - if (unlikely(IS_ERR(data))) + if (IS_ERR(data)) return PTR_ERR(data); to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size); @@ -1368,8 +1368,8 @@ static void integrity_metadata(struct work_struct *w) checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE); if (unlikely(r)) { if (r > 0) { - DMERR("Checksum failed at sector 0x%llx", - (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); + DMERR_LIMIT("Checksum failed at sector 0x%llx", + (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); r = -EILSEQ; atomic64_inc(&ic->number_of_mismatches); } @@ -1561,8 +1561,8 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio, integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { - DMERR("Checksum failed when reading from journal, at sector 0x%llx", - (unsigned long long)logical_sector); + DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", + (unsigned long long)logical_sector); } } #endif diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index f666778ad23728cdc323f7570d837ebb30ede660..c740153b4e52df15ee7b4cab6c9b1e83d897143f 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -2018,3 +2018,106 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) return r; } + + +/** + * dm_early_create - create a mapped device in early boot. + * + * @dmi: Contains main information of the device mapping to be created. + * @spec_array: array of pointers to struct dm_target_spec. Describes the + * mapping table of the device. + * @target_params_array: array of strings with the parameters to a specific + * target. + * + * Instead of having the struct dm_target_spec and the parameters for every + * target embedded at the end of struct dm_ioctl (as performed in a normal + * ioctl), pass them as arguments, so the caller doesn't need to serialize them. + * The size of the spec_array and target_params_array is given by + * @dmi->target_count. + * This function is supposed to be called in early boot, so locking mechanisms + * to protect against concurrent loads are not required. + */ +int __init dm_early_create(struct dm_ioctl *dmi, + struct dm_target_spec **spec_array, + char **target_params_array) +{ + int r, m = DM_ANY_MINOR; + struct dm_table *t, *old_map; + struct mapped_device *md; + unsigned int i; + + if (!dmi->target_count) + return -EINVAL; + + r = check_name(dmi->name); + if (r) + return r; + + if (dmi->flags & DM_PERSISTENT_DEV_FLAG) + m = MINOR(huge_decode_dev(dmi->dev)); + + /* alloc dm device */ + r = dm_create(m, &md); + if (r) + return r; + + /* hash insert */ + r = dm_hash_insert(dmi->name, *dmi->uuid ? dmi->uuid : NULL, md); + if (r) + goto err_destroy_dm; + + /* alloc table */ + r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md); + if (r) + goto err_destroy_dm; + + /* add targets */ + for (i = 0; i < dmi->target_count; i++) { + r = dm_table_add_target(t, spec_array[i]->target_type, + (sector_t) spec_array[i]->sector_start, + (sector_t) spec_array[i]->length, + target_params_array[i]); + if (r) { + DMWARN("error adding target to table"); + goto err_destroy_table; + } + } + + /* finish table */ + r = dm_table_complete(t); + if (r) + goto err_destroy_table; + + md->type = dm_table_get_type(t); + /* setup md->queue to reflect md's type (may block) */ + r = dm_setup_md_queue(md, t); + if (r) { + DMWARN("unable to set up device queue for new table."); + goto err_destroy_table; + } + + /* Set new map */ + dm_suspend(md, 0); + old_map = dm_swap_table(md, t); + if (IS_ERR(old_map)) { + r = PTR_ERR(old_map); + goto err_destroy_table; + } + set_disk_ro(dm_disk(md), !!(dmi->flags & DM_READONLY_FLAG)); + + /* resume device */ + r = dm_resume(md); + if (r) + goto err_destroy_table; + + DMINFO("%s (%s) is ready", md->disk->disk_name, dmi->name); + dm_put(md); + return 0; + +err_destroy_table: + dm_table_destroy(t); +err_destroy_dm: + dm_put(md); + dm_destroy(md); + return r; +} diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index adcfe8ae10aa78c521e77bc4c71d077c43e284c3..9fdef6897316fe6ee2eab0d36f50c0e870c73304 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2986,11 +2986,6 @@ static void configure_discard_support(struct raid_set *rs) } } - /* - * RAID1 and RAID10 personalities require bio splitting, - * RAID0/4/5/6 don't and process large discard bios properly. - */ - ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs)); ti->num_discard_bios = 1; } @@ -3747,6 +3742,15 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, chunk_size); blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); + + /* + * RAID1 and RAID10 personalities require bio splitting, + * RAID0/4/5/6 don't and process large discard bios properly. + */ + if (rs_is_raid1(rs) || rs_is_raid10(rs)) { + limits->discard_granularity = chunk_size; + limits->max_discard_sectors = chunk_size; + } } static void raid_postsuspend(struct dm_target *ti) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b2c03c079a8db21eb19ec264a0760598e84624bf..09773636602d3d86728b127ddb67b2d674b67cae 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -12,6 +12,22 @@ #define DM_MSG_PREFIX "core-rq" +/* + * One of these is allocated per request. + */ +struct dm_rq_target_io { + struct mapped_device *md; + struct dm_target *ti; + struct request *orig, *clone; + struct kthread_work work; + blk_status_t error; + union map_info info; + struct dm_stats_aux stats_aux; + unsigned long duration_jiffies; + unsigned n_sectors; + unsigned completed; +}; + #define DM_MQ_NR_HW_QUEUES 1 #define DM_MQ_QUEUE_DEPTH 2048 static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES; diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h index b39245545229fbb19e51caf35d969c5633f65125..1eea0da641db55838912fc541f4b93506094fc9c 100644 --- a/drivers/md/dm-rq.h +++ b/drivers/md/dm-rq.h @@ -16,22 +16,6 @@ struct mapped_device; -/* - * One of these is allocated per request. - */ -struct dm_rq_target_io { - struct mapped_device *md; - struct dm_target *ti; - struct request *orig, *clone; - struct kthread_work work; - blk_status_t error; - union map_info info; - struct dm_stats_aux stats_aux; - unsigned long duration_jiffies; - unsigned n_sectors; - unsigned completed; -}; - /* * For request-based dm - the bio clones we allocate are embedded in these * structs. diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 36805b12661e173be5d2a845626f82baaf8de690..a168963b757df4fe12c885c48456a50d586851fe 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2338,13 +2338,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio) return do_origin(o->dev, bio); } -static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) -{ - DMWARN("device does not support dax."); - return -EIO; -} - /* * Set the target "max_io_len" field to the minimum of all the snapshots' * chunk sizes. @@ -2404,7 +2397,6 @@ static struct target_type origin_target = { .postsuspend = origin_postsuspend, .status = origin_status, .iterate_devices = origin_iterate_devices, - .direct_access = origin_dax_direct_access, }; static struct target_type snapshot_target = { diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index fae35caf367208ab22cbe6900010ee353f52d666..8a0f057b81220288afd70404ab83d8a5c10cefcf 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -61,8 +61,7 @@ static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_pat { struct switch_ctx *sctx; - sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path), - GFP_KERNEL); + sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL); if (!sctx) return NULL; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index e83b63608262ac46bf4fa0f8f869294de6bdf22f..fcd887703f953eebbfb7cc2e1aee9cd82113e5c9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3283,6 +3283,13 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) as.argc = argc; as.argv = argv; + /* make sure metadata and data are different devices */ + if (!strcmp(argv[0], argv[1])) { + ti->error = "Error setting metadata or data device"; + r = -EINVAL; + goto out_unlock; + } + /* * Set default pool features. */ @@ -4167,6 +4174,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) tc->sort_bio_list = RB_ROOT; if (argc == 3) { + if (!strcmp(argv[0], argv[2])) { + ti->error = "Error setting origin device"; + r = -EINVAL; + goto bad_origin_dev; + } + r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); if (r) { ti->error = "Error opening origin device"; @@ -4227,7 +4240,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; ti->num_discard_bios = 1; - ti->split_discard_bios = false; } mutex_unlock(&dm_thin_pool_table.mutex); diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 0ce04e5b4afbaba8b53a145ae20d1652f1c75de7..b634fa23f4c46374afb8a05636b90492d4ad2101 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -73,7 +73,7 @@ static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, *offset = (unsigned)(position - (block << v->data_dev_block_bits)); res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf); - if (unlikely(IS_ERR(res))) { + if (IS_ERR(res)) { DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", v->data_dev->name, (unsigned long long)rsb, (unsigned long long)(v->fec->start + block), @@ -163,7 +163,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, dm_bufio_release(buf); par = fec_read_parity(v, rsb, block_offset, &offset, &buf); - if (unlikely(IS_ERR(par))) + if (IS_ERR(par)) return PTR_ERR(par); } } @@ -253,7 +253,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, } bbuf = dm_bufio_read(bufio, block, &buf); - if (unlikely(IS_ERR(bbuf))) { + if (IS_ERR(bbuf)) { DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", v->data_dev->name, (unsigned long long)rsb, diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 2b8cee35e4d5a90daff8cc3e17a9cc3d0c90af83..f7822875589ea8439764d72a488adc2846a28f6f 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1859,7 +1859,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - wc->writeback_wq = alloc_workqueue("writecache-writeabck", WQ_MEM_RECLAIM, 1); + wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1); if (!wc->writeback_wq) { r = -ENOMEM; ti->error = "Could not allocate writeback workqueue"; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 6af5babe6837605c7a25983ef61fb7d83e73af30..8865c1709e16357178ede5284c4477536bf2369d 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -727,7 +727,6 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->per_io_data_size = sizeof(struct dmz_bioctx); ti->flush_supported = true; ti->discards_supported = true; - ti->split_discard_bios = true; /* The exposed capacity is the number of chunks that can be mapped */ ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 515e6af9bed2c7ab5e94e356215020422316a3ce..68d24056d0b1c17d7fa0c271d1d5582d7eb72c89 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -158,9 +158,6 @@ struct table_device { struct dm_dev dm_dev; }; -static struct kmem_cache *_rq_tio_cache; -static struct kmem_cache *_rq_cache; - /* * Bio-based DM's mempools' reserved IOs set by the user. */ @@ -222,20 +219,11 @@ static unsigned dm_get_numa_node(void) static int __init local_init(void) { - int r = -ENOMEM; - - _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); - if (!_rq_tio_cache) - return r; - - _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request), - __alignof__(struct request), 0, NULL); - if (!_rq_cache) - goto out_free_rq_tio_cache; + int r; r = dm_uevent_init(); if (r) - goto out_free_rq_cache; + return r; deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); if (!deferred_remove_workqueue) { @@ -257,10 +245,6 @@ static int __init local_init(void) destroy_workqueue(deferred_remove_workqueue); out_uevent_exit: dm_uevent_exit(); -out_free_rq_cache: - kmem_cache_destroy(_rq_cache); -out_free_rq_tio_cache: - kmem_cache_destroy(_rq_tio_cache); return r; } @@ -270,8 +254,6 @@ static void local_exit(void) flush_scheduled_work(); destroy_workqueue(deferred_remove_workqueue); - kmem_cache_destroy(_rq_cache); - kmem_cache_destroy(_rq_tio_cache); unregister_blkdev(_major, _name); dm_uevent_exit(); @@ -1478,17 +1460,10 @@ static unsigned get_num_write_zeroes_bios(struct dm_target *ti) return ti->num_write_zeroes_bios; } -typedef bool (*is_split_required_fn)(struct dm_target *ti); - -static bool is_split_required_for_discard(struct dm_target *ti) -{ - return ti->split_discard_bios; -} - static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, bool is_split_required) + unsigned num_bios) { - unsigned len; + unsigned len = ci->sector_count; /* * Even though the device advertised support for this type of @@ -1499,11 +1474,6 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target * if (!num_bios) return -EOPNOTSUPP; - if (!is_split_required) - len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); - else - len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); - __send_duplicate_bios(ci, ti, num_bios, &len); ci->sector += len; @@ -1514,23 +1484,38 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target * static int __send_discard(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti), - is_split_required_for_discard(ti)); + return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti)); } static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti), false); + return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti)); } static int __send_write_same(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti), false); + return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti)); } static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false); + return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti)); +} + +static bool is_abnormal_io(struct bio *bio) +{ + bool r = false; + + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: + r = true; + break; + } + + return r; } static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, @@ -1565,7 +1550,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) if (!dm_target_is_valid(ti)) return -EIO; - if (unlikely(__process_abnormal_io(ci, ti, &r))) + if (__process_abnormal_io(ci, ti, &r)) return r; len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); @@ -1601,13 +1586,6 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, blk_qc_t ret = BLK_QC_T_NONE; int error = 0; - if (unlikely(!map)) { - bio_io_error(bio); - return ret; - } - - blk_queue_split(md->queue, &bio); - init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { @@ -1675,18 +1653,13 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * Optimized variant of __split_and_process_bio that leverages the * fact that targets that use it do _not_ have a need to split bios. */ -static blk_qc_t __process_bio(struct mapped_device *md, - struct dm_table *map, struct bio *bio) +static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map, + struct bio *bio, struct dm_target *ti) { struct clone_info ci; blk_qc_t ret = BLK_QC_T_NONE; int error = 0; - if (unlikely(!map)) { - bio_io_error(bio); - return ret; - } - init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { @@ -1704,21 +1677,11 @@ static blk_qc_t __process_bio(struct mapped_device *md, error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ } else { - struct dm_target *ti = md->immutable_target; struct dm_target_io *tio; - /* - * Defend against IO still getting in during teardown - * - as was seen for a time with nvme-fcloop - */ - if (WARN_ON_ONCE(!ti || !dm_target_is_valid(ti))) { - error = -EIO; - goto out; - } - ci.bio = bio; ci.sector_count = bio_sectors(bio); - if (unlikely(__process_abnormal_io(&ci, ti, &error))) + if (__process_abnormal_io(&ci, ti, &error)) goto out; tio = alloc_tio(&ci, ti, 0, GFP_NOIO); @@ -1730,11 +1693,55 @@ static blk_qc_t __process_bio(struct mapped_device *md, return ret; } +static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struct bio **bio) +{ + unsigned len, sector_count; + + sector_count = bio_sectors(*bio); + len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti), sector_count); + + if (sector_count > len) { + struct bio *split = bio_split(*bio, len, GFP_NOIO, &md->queue->bio_split); + + bio_chain(split, *bio); + trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector); + generic_make_request(*bio); + *bio = split; + } +} + static blk_qc_t dm_process_bio(struct mapped_device *md, struct dm_table *map, struct bio *bio) { + blk_qc_t ret = BLK_QC_T_NONE; + struct dm_target *ti = md->immutable_target; + + if (unlikely(!map)) { + bio_io_error(bio); + return ret; + } + + if (!ti) { + ti = dm_table_find_target(map, bio->bi_iter.bi_sector); + if (unlikely(!ti || !dm_target_is_valid(ti))) { + bio_io_error(bio); + return ret; + } + } + + /* + * If in ->make_request_fn we need to use blk_queue_split(), otherwise + * queue_limits for abnormal requests (e.g. discard, writesame, etc) + * won't be imposed. + */ + if (current->bio_list) { + blk_queue_split(md->queue, &bio); + if (!is_abnormal_io(bio)) + dm_queue_split(md, ti, &bio); + } + if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED) - return __process_bio(md, map, bio); + return __process_bio(md, map, bio, ti); else return __split_and_process_bio(md, map, bio); } diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 492a3f8ac1199b841215758119e4167b46d9b71e..3972232b80378fa855513ac9f3b088ff7049c35c 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -462,7 +462,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, int r; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) + if (IS_ERR(p)) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -498,7 +498,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) + if (IS_ERR(p)) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -531,7 +531,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, int r; p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) + if (IS_ERR(p)) return PTR_ERR(p); if (unlikely(!p)) return -EWOULDBLOCK; @@ -567,7 +567,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) + if (IS_ERR(p)) return PTR_ERR(p); memset(p, 0, dm_bm_block_size(bm)); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index e528baebad6964124e6287a890cd12f81bffa561..b0672756d0562697238cb9cf53775c3c48d87a4a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -315,12 +316,6 @@ struct dm_target { * whether or not its underlying devices have support. */ bool discards_supported:1; - - /* - * Set if the target required discard bios to be split - * on max_io_len boundary. - */ - bool split_discard_bios:1; }; /* Each target can link one of these into the table */ @@ -431,6 +426,14 @@ void dm_remap_zone_report(struct dm_target *ti, sector_t start, struct blk_zone *zones, unsigned int *nr_zones); union map_info *dm_get_rq_mapinfo(struct request *rq); +/* + * Device mapper functions to parse and create devices specified by the + * parameter "dm-mod.create=" + */ +int __init dm_early_create(struct dm_ioctl *dmi, + struct dm_target_spec **spec_array, + char **target_params_array); + struct queue_limits *dm_get_queue_limits(struct mapped_device *md); /* @@ -609,7 +612,7 @@ do { \ */ #define dm_target_offset(ti, sector) ((sector) - (ti)->begin) -static inline sector_t to_sector(unsigned long n) +static inline sector_t to_sector(unsigned long long n) { return (n >> SECTOR_SHIFT); } diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index d1e49514977bc03732922e37272c408c2665c8f3..f396a82dfd3e61bb1810f4cacf6fcb9b9eefa37b 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -270,9 +270,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 39 +#define DM_VERSION_MINOR 40 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2018-04-03)" +#define DM_VERSION_EXTRA "-ioctl (2019-01-18)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */