From 80d6ee240ea37f36ecd6dda5c8a6d7723cba73d3 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Fri, 11 Oct 2019 17:14:52 +0800 Subject: [PATCH] alinux: blk: add iohang check function Background: We do not have a dependable block layer interface to determine whether block device has io requests which have not been completed for somewhat long time. Currently we have 'in_flight' interface, it counts the number of I/O requests that have been issued to the device driver but have not yet completed, and it does not include I/O requests that are in the queue but not yet issued to the device driver, which means it will not count io requests that have been stucked in block layer. Also say that there are steady io requests issued to device driver, 'in_flight' maybe always non-zero, but you could not determine whether there is one io request which has not been completed for too long. Solution: To find io requests which have not been completed for too long, here add 3 new inferfaces: /sys/block/vdb/queue/hang_threshold If one io request's running time has been greater than this value, count this io as hang. /sys/block/vdb/hang Show read/write io requests' hang counter. /sys/kernel/debug/block/vdb/rq_hang Show all hang io requests's detailed info, like below: ffff97db96301200 {.op=WRITE, .cmd_flags=SYNC, .rq_flags=STARTED| ELVPRIV|IO_STAT|STATS, .state=in_flight, .tag=30, .internal_tag=169, .start_time_ns=140634088407, .io_start_time_ns=140634102958, .current_time=146497371953, .bio = ffff97db91e8e000, .bio_pages = { ffffd096a0602540 }, .bio = ffff97db91e8ec00, .bio_pages = { ffffd096a070eec0 }, .bio = ffff97db91e8f600, .bio_pages = { ffffd096a0424cc0 }, .bio = ffff97db91e8f300, .bio_pages = { ffffd096a0600a80 }} With above info, we can easily see this request's latency distribution, and see next patch for bio_pages's usage. Note, /sys/kernel/debug/block/vdb/rq_hang only exists in blk-mq device driver and needs CONFIG_BLK_DEBUG_FS enabled. Signed-off-by: Xiaoguang Wang Reviewed-by: Joseph Qi --- block/blk-core.c | 1 + block/blk-mq-debugfs.c | 84 +++++++++++++++++++++++++++++++++++++++ block/blk-mq.c | 31 ++++++++++++++- block/blk-mq.h | 2 + block/blk-settings.c | 7 ++++ block/blk-sysfs.c | 28 +++++++++++++ block/genhd.c | 35 ++++++++++++++++ block/partition-generic.c | 13 ++++++ include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 9 +++++ include/linux/genhd.h | 4 ++ 11 files changed, 214 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index a7747ed9c767..130709b4fc08 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1017,6 +1017,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, q->last_merge = NULL; q->end_sector = 0; q->boundary_rq = NULL; + q->rq_hang_threshold = BLK_REQ_HANG_THRESHOLD; q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index a5ea86835fcb..ba701e654465 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -213,6 +213,39 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf, return count; } +static void blk_mq_debugfs_rq_hang_show(struct seq_file *m, struct request *rq); + +static void blk_mq_check_rq_hang(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + struct seq_file *m = priv; + u64 now = ktime_get_ns(); + u64 duration; + + duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC); + if (duration < rq->q->rq_hang_threshold) + return; + + /* See comments in blk_mq_check_expired() */ + if (!refcount_inc_not_zero(&rq->ref)) + return; + + duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC); + if (duration >= rq->q->rq_hang_threshold) + blk_mq_debugfs_rq_hang_show(m, rq); + + if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); +} + +static int queue_rq_hang_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + + blk_mq_queue_tag_busy_iter(q, blk_mq_check_rq_hang, m); + return 0; +} + static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, @@ -220,6 +253,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "state", 0600, queue_state_show, queue_state_write }, { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, + { "rq_hang", 0400, queue_rq_hang_show, NULL }, { }, }; @@ -381,6 +415,56 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) } EXPORT_SYMBOL_GPL(__blk_mq_debugfs_rq_show); +static void blk_mq_debugfs_rq_hang_show(struct seq_file *m, struct request *rq) +{ + const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; + const unsigned int op = rq->cmd_flags & REQ_OP_MASK; + struct bio *bio; + struct bio_vec *bvec; + int i; + + seq_printf(m, "%px {.op=", rq); + if (op < ARRAY_SIZE(op_name) && op_name[op]) + seq_printf(m, "%s", op_name[op]); + else + seq_printf(m, "%d", op); + seq_puts(m, ", .cmd_flags="); + blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, + ARRAY_SIZE(cmd_flag_name)); + seq_puts(m, ", .rq_flags="); + blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, + ARRAY_SIZE(rqf_name)); + seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); + seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, + rq->internal_tag); + seq_printf(m, ", .start_time_ns=%llu", rq->start_time_ns); + seq_printf(m, ", .io_start_time_ns=%llu", rq->io_start_time_ns); + seq_printf(m, ", .current_time=%llu", ktime_get_ns()); + + bio = rq->bio; + if (bio) { + while (1) { + seq_printf(m, ", .bio = %px", bio); + seq_puts(m, ", .bio_pages = { "); + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (!page) + continue; + seq_printf(m, "%px ", page); + } + seq_puts(m, "}"); + bio = bio->bi_next; + if (bio == NULL) + break; + } + } + if (mq_ops->show_rq) + mq_ops->show_rq(m, rq); + seq_puts(m, "}\n"); + return; +} + int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) { return __blk_mq_debugfs_rq_show(m, list_entry_rq(v)); diff --git a/block/blk-mq.c b/block/blk-mq.c index 43a29e9886c1..53b42883ad6a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -88,6 +88,11 @@ struct mq_inflight { unsigned int *inflight; }; +struct mq_hang { + struct hd_struct *part; + unsigned int *hang; +}; + static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) @@ -133,6 +138,30 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); } +static void blk_mq_check_hang_rw(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + struct mq_hang *mh = priv; + u64 now = ktime_get_ns(); + u64 duration; + + duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC); + if (duration < rq->q->rq_hang_threshold) + return; + + if (!mh->part->partno || rq->part == mh->part) + mh->hang[rq_data_dir(rq)]++; +} + +void blk_mq_in_hang_rw(struct request_queue *q, struct hd_struct *part, + unsigned int hang[2]) +{ + struct mq_hang mh = { .part = part, .hang = hang, }; + + hang[0] = hang[1] = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_hang_rw, &mh); +} + void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; @@ -476,7 +505,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -static void __blk_mq_free_request(struct request *rq) +void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; diff --git a/block/blk-mq.h b/block/blk-mq.h index 5ad9251627f8..23fc5e3dea43 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -154,6 +154,8 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +void blk_mq_in_hang_rw(struct request_queue *q, struct hd_struct *part, + unsigned int hang[2]); static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-settings.c b/block/blk-settings.c index 49fac8749a42..fab05b0e69c7 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -66,6 +66,13 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); +void blk_queue_rq_hang_threshold(struct request_queue *q, + unsigned int hang_threshold) +{ + q->rq_hang_threshold = hang_threshold; +} +EXPORT_SYMBOL_GPL(blk_queue_rq_hang_threshold); + void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn) { WARN_ON_ONCE(q->mq_ops); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8286640d4d66..474c84de0ca8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -516,6 +516,27 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } +static ssize_t queue_hang_threshold_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%u\n", q->rq_hang_threshold); +} + +static ssize_t queue_hang_threshold_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned int hang_threshold; + int err; + + err = kstrtou32(page, 10, &hang_threshold); + if (err || hang_threshold == 0) + return -EINVAL; + + blk_queue_rq_hang_threshold(q, hang_threshold); + + return count; +} + + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = 0644 }, .show = queue_requests_show, @@ -695,6 +716,12 @@ static struct queue_sysfs_entry queue_wb_lat_entry = { .store = queue_wb_lat_store, }; +static struct queue_sysfs_entry queue_hang_entry = { + .attr = {.name = "hang_threshold", .mode = 0644 }, + .show = queue_hang_threshold_show, + .store = queue_hang_threshold_store, +}; + #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static struct queue_sysfs_entry throtl_sample_time_entry = { .attr = {.name = "throttle_sample_time", .mode = 0644 }, @@ -737,6 +764,7 @@ static struct attribute *default_attrs[] = { &queue_dax_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, + &queue_hang_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &throtl_sample_time_entry.attr, #endif diff --git a/block/genhd.c b/block/genhd.c index ee4da1ba5afe..81640b7bf85a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -94,6 +94,39 @@ void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, inflight[1] = atomic_read(&part->in_flight[1]); } +static void blk_in_hang_rw(struct request_queue *q, struct hd_struct *part, + unsigned int hang[2]) +{ + u64 now, duration; + unsigned long flags; + struct request *rq, *tmp; + + now = ktime_get_ns(); + spin_lock_irqsave(q->queue_lock, flags); + + list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { + duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC); + if (duration < rq->q->rq_hang_threshold) + continue; + + if (!part->partno || rq->part == part) + hang[rq_data_dir(rq)]++; + } + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +void part_in_hang_rw(struct request_queue *q, struct hd_struct *part, + unsigned int hang[2]) +{ + if (q->mq_ops) { + blk_mq_in_hang_rw(q, part, hang); + return; + } + + blk_in_hang_rw(q, part, hang); +} + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); @@ -1157,6 +1190,7 @@ static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL); static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); +static DEVICE_ATTR(hang, 0444, part_hang_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = @@ -1179,6 +1213,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_hang.attr, &dev_attr_badblocks.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, diff --git a/block/partition-generic.c b/block/partition-generic.c index ed4bc5f7967e..d9d9b656a1f0 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -164,6 +164,17 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } +ssize_t part_hang_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int hang[2]; + + part_in_hang_rw(q, p, hang); + return sprintf(buf, "%8u %8u\n", hang[0], hang[1]); +} + #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -195,6 +206,7 @@ static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); +static DEVICE_ATTR(hang, 0444, part_hang_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); @@ -209,6 +221,7 @@ static struct attribute *part_attrs[] = { &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_hang.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 75086cb24b36..4d72b4ebfbc4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -233,6 +233,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); +void __blk_mq_free_request(struct request *rq); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cf883af19589..545a39090984 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -436,6 +436,12 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, #endif /* CONFIG_BLK_DEV_ZONED */ +/* + * default request hang threshold, unit is millisecond. If one request does + * not complete in this threashold time, consider this request as hang. + */ +#define BLK_REQ_HANG_THRESHOLD 5000 + struct request_queue { /* * Together with queue_head for cacheline sharing @@ -581,6 +587,7 @@ struct request_queue { unsigned int request_fn_active; unsigned int rq_timeout; + unsigned int rq_hang_threshold; int poll_nsec; struct blk_stat_callback *poll_cb; @@ -1264,6 +1271,8 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); +extern void blk_queue_rq_hang_threshold(struct request_queue *q, + unsigned int threshold); /* * Number of physical segments as sent to the device. diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 75ccb2848c11..bd2bcd296193 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -398,6 +398,8 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw); +void part_in_hang_rw(struct request_queue *q, struct hd_struct *part, + unsigned int hang[2]); static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) { @@ -643,6 +645,8 @@ extern ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t part_hang_show(struct device *dev, + struct device_attribute *attr, char *buf); #ifdef CONFIG_FAIL_MAKE_REQUEST extern ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); -- GitLab