提交 39a9f97e 编写于 作者: J Jens Axboe

Merge branch 'for-3.16/blk-mq-tagging' into for-3.16/core

Signed-off-by: NJens Axboe <axboe@fb.com>

Conflicts:
	block/blk-mq-tag.c
......@@ -208,6 +208,11 @@ static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
return blk_mq_tag_sysfs_show(hctx->tags, page);
}
static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
}
static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
{
unsigned int i, first = 1;
......@@ -267,6 +272,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_dispatched_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
.attr = {.name = "active", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_active_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
.attr = {.name = "pending", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_rq_list_show,
......@@ -287,6 +296,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_pending.attr,
&blk_mq_hw_sysfs_tags.attr,
&blk_mq_hw_sysfs_cpus.attr,
&blk_mq_hw_sysfs_active.attr,
NULL,
};
......
......@@ -7,13 +7,12 @@
#include "blk-mq.h"
#include "blk-mq-tag.h"
void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx,
bool reserved)
void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved)
{
int tag, zero = 0;
tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved);
blk_mq_put_tag(tags, tag, &zero);
tag = blk_mq_get_tag(hctx, &zero, __GFP_WAIT, reserved);
blk_mq_put_tag(hctx, tag, &zero);
}
static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
......@@ -40,6 +39,84 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
return bt_has_free_tags(&tags->bitmap_tags);
}
static inline void bt_index_inc(unsigned int *index)
{
*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
}
/*
* If a previously inactive queue goes active, bump the active user count.
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
atomic_inc(&hctx->tags->active_queues);
return true;
}
/*
* If a previously busy queue goes inactive, potential waiters could now
* be allowed to queue. Wake them up and check.
*/
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
struct blk_mq_bitmap_tags *bt;
int i, wake_index;
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
atomic_dec(&tags->active_queues);
/*
* Will only throttle depth on non-reserved tags
*/
bt = &tags->bitmap_tags;
wake_index = bt->wake_index;
for (i = 0; i < BT_WAIT_QUEUES; i++) {
struct bt_wait_state *bs = &bt->bs[wake_index];
if (waitqueue_active(&bs->wait))
wake_up(&bs->wait);
bt_index_inc(&wake_index);
}
}
/*
* For shared tag users, we track the number of currently active users
* and attempt to provide a fair share of the tag depth for each of them.
*/
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
struct blk_mq_bitmap_tags *bt)
{
unsigned int depth, users;
if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
return true;
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;
/*
* Don't try dividing an ant
*/
if (bt->depth == 1)
return true;
users = atomic_read(&hctx->tags->active_queues);
if (!users)
return true;
/*
* Allow at least some tags
*/
depth = max((bt->depth + users - 1) / users, 4U);
return atomic_read(&hctx->nr_active) < depth;
}
static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
{
int tag, org_last_tag, end;
......@@ -78,11 +155,15 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
* multiple users will tend to stick to different cachelines, at least
* until the map is exhausted.
*/
static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache)
static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
unsigned int *tag_cache)
{
unsigned int last_tag, org_last_tag;
int index, i, tag;
if (!hctx_may_queue(hctx, bt))
return -1;
last_tag = org_last_tag = *tag_cache;
index = TAG_TO_INDEX(bt, last_tag);
......@@ -117,11 +198,6 @@ static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache)
return tag;
}
static inline void bt_index_inc(unsigned int *index)
{
*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
}
static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
struct blk_mq_hw_ctx *hctx)
{
......@@ -142,7 +218,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
DEFINE_WAIT(wait);
int tag;
tag = __bt_get(bt, last_tag);
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
return tag;
......@@ -156,7 +232,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
was_empty = list_empty(&wait.task_list);
prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
tag = __bt_get(bt, last_tag);
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
break;
......@@ -200,14 +276,13 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
return tag;
}
unsigned int blk_mq_get_tag(struct blk_mq_tags *tags,
struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
gfp_t gfp, bool reserved)
{
if (!reserved)
return __blk_mq_get_tag(tags, hctx, last_tag, gfp);
return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
return __blk_mq_get_reserved_tag(tags, gfp);
return __blk_mq_get_reserved_tag(hctx->tags, gfp);
}
static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
......@@ -265,9 +340,11 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
bt_clear_tag(&tags->breserved_tags, tag);
}
void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag,
void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
unsigned int *last_tag)
{
struct blk_mq_tags *tags = hctx->tags;
if (tag >= tags->nr_reserved_tags) {
const int real_tag = tag - tags->nr_reserved_tags;
......@@ -465,6 +542,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
res = bt_unused_tags(&tags->breserved_tags);
page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
return page - orig_page;
}
......@@ -35,6 +35,8 @@ struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
atomic_t active_queues;
struct blk_mq_bitmap_tags bitmap_tags;
struct blk_mq_bitmap_tags breserved_tags;
......@@ -46,9 +48,9 @@ struct blk_mq_tags {
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag);
extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved);
extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
......@@ -65,4 +67,23 @@ enum {
BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
};
extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
return false;
return __blk_mq_tag_busy(hctx);
}
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
return;
__blk_mq_tag_idle(hctx);
}
#endif
......@@ -99,9 +99,16 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
struct request *rq;
unsigned int tag;
tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved);
tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
if (tag != BLK_MQ_TAG_FAIL) {
rq = hctx->tags->rqs[tag];
rq->cmd_flags = 0;
if (blk_mq_tag_busy(hctx)) {
rq->cmd_flags = REQ_MQ_INFLIGHT;
atomic_inc(&hctx->nr_active);
}
rq->tag = tag;
return rq;
}
......@@ -209,7 +216,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = q;
rq->mq_ctx = ctx;
rq->cmd_flags = rw_flags;
rq->cmd_flags |= rw_flags;
rq->cmd_type = 0;
/* do not touch atomic flags, it needs atomic ops against the timer */
rq->cpu = -1;
......@@ -281,7 +288,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
break;
}
blk_mq_wait_for_tags(hctx->tags, hctx, reserved);
blk_mq_wait_for_tags(hctx, reserved);
} while (1);
return rq;
......@@ -322,8 +329,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
const int tag = rq->tag;
struct request_queue *q = rq->q;
if (rq->cmd_flags & REQ_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag);
blk_mq_put_tag(hctx, tag, &ctx->last_tag);
blk_mq_queue_exit(q);
}
......@@ -590,8 +600,13 @@ static void blk_mq_rq_timer(unsigned long data)
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
if (next_set)
mod_timer(&q->timeout, round_jiffies_up(next));
if (next_set) {
next = blk_rq_timeout(round_jiffies_up(next));
mod_timer(&q->timeout, next);
} else {
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_tag_idle(hctx);
}
}
/*
......@@ -1501,6 +1516,56 @@ static void blk_mq_map_swqueue(struct request_queue *q)
}
}
static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
{
struct blk_mq_hw_ctx *hctx;
struct request_queue *q;
bool shared;
int i;
if (set->tag_list.next == set->tag_list.prev)
shared = false;
else
shared = true;
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_freeze_queue(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (shared)
hctx->flags |= BLK_MQ_F_TAG_SHARED;
else
hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
}
blk_mq_unfreeze_queue(q);
}
}
static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
struct blk_mq_tag_set *set = q->tag_set;
blk_mq_freeze_queue(q);
mutex_lock(&set->tag_list_lock);
list_del_init(&q->tag_set_list);
blk_mq_update_tag_set_depth(set);
mutex_unlock(&set->tag_list_lock);
blk_mq_unfreeze_queue(q);
}
static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
struct request_queue *q)
{
q->tag_set = set;
mutex_lock(&set->tag_list_lock);
list_add_tail(&q->tag_set_list, &set->tag_list);
blk_mq_update_tag_set_depth(set);
mutex_unlock(&set->tag_list_lock);
}
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
struct blk_mq_hw_ctx **hctxs;
......@@ -1526,6 +1591,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
goto err_hctxs;
atomic_set(&hctxs[i]->nr_active, 0);
hctxs[i]->numa_node = NUMA_NO_NODE;
hctxs[i]->queue_num = i;
}
......@@ -1578,6 +1644,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
list_add_tail(&q->all_q_node, &all_q_list);
mutex_unlock(&all_q_mutex);
blk_mq_add_queue_tag_set(set, q);
return q;
err_flush_rq:
......@@ -1605,6 +1673,8 @@ void blk_mq_free_queue(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
blk_mq_del_queue_tag_set(q);
queue_for_each_hw_ctx(q, hctx, i) {
kfree(hctx->ctxs);
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
......@@ -1696,6 +1766,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
goto out_unwind;
}
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
out_unwind:
......
......@@ -166,6 +166,17 @@ void blk_abort_request(struct request *req)
}
EXPORT_SYMBOL_GPL(blk_abort_request);
unsigned long blk_rq_timeout(unsigned long timeout)
{
unsigned long maxt;
maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
if (time_after(timeout, maxt))
timeout = maxt;
return timeout;
}
/**
* blk_add_timer - Start timeout timer for a single request
* @req: request that is about to start running.
......@@ -200,7 +211,7 @@ void blk_add_timer(struct request *req)
* than an existing one, modify the timer. Round up to next nearest
* second.
*/
expiry = round_jiffies_up(req->deadline);
expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) {
......
......@@ -9,6 +9,9 @@
/* Number of requests a "batching" process may submit */
#define BLK_BATCH_REQ 32
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
extern struct kmem_cache *blk_requestq_cachep;
extern struct kmem_cache *request_cachep;
extern struct kobj_type blk_queue_ktype;
......@@ -37,6 +40,7 @@ bool __blk_end_bidi_request(struct request *rq, int error,
void blk_rq_timed_out_timer(unsigned long data);
void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
unsigned int *next_set);
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
void blk_delete_timer(struct request *);
......
......@@ -54,6 +54,8 @@ struct blk_mq_hw_ctx {
unsigned int numa_node;
unsigned int cmd_size; /* per-request extra data */
atomic_t nr_active;
struct blk_mq_cpu_notifier cpu_notifier;
struct kobject kobj;
};
......@@ -70,6 +72,9 @@ struct blk_mq_tag_set {
void *driver_data;
struct blk_mq_tags **tags;
struct mutex tag_list_lock;
struct list_head tag_list;
};
typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
......@@ -132,8 +137,10 @@ enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_SHOULD_SORT = 1 << 1,
BLK_MQ_F_TAG_SHARED = 1 << 2,
BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_MAX_DEPTH = 2048,
......
......@@ -190,6 +190,7 @@ enum rq_flag_bits {
__REQ_PM, /* runtime pm request */
__REQ_END, /* last of chain of requests */
__REQ_HASHED, /* on IO scheduler merge hash */
__REQ_MQ_INFLIGHT, /* track inflight for MQ */
__REQ_NR_BITS, /* stops here */
};
......@@ -243,5 +244,6 @@ enum rq_flag_bits {
#define REQ_PM (1ULL << __REQ_PM)
#define REQ_END (1ULL << __REQ_END)
#define REQ_HASHED (1ULL << __REQ_HASHED)
#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
#endif /* __LINUX_BLK_TYPES_H */
......@@ -481,6 +481,9 @@ struct request_queue {
wait_queue_head_t mq_freeze_wq;
struct percpu_counter mq_usage_counter;
struct list_head all_q_node;
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
};
#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册