提交 06a67773 编写于 作者: X Xiaoguang Wang 提交者: Joseph Qi

alios: blk-throttle: limit bios to fix amount of pages entering writeback prematurely

Currently in blk_throtl_bio(), if a bio exceeds its throtl_grp's bps
or iops limit, the bio is queued on the throtl_grp's throtl_service_queue.
The mm subsystem will then keep submitting more pages even though the
underlying device cannot handle these io requests. This causes a large
amount of pages to enter writeback prematurely; if some process later
writes to some of these pages, it will wait for a long time.

I have done a test: one process does buffered writes on a 1GB file,
with this process's blkcg max bps limit set to 10MB/s. I observe this:
	#cat /proc/meminfo  | grep -i back
	Writeback:        900024 kB
	WritebackTmp:          0 kB

I think this Writeback value is just too big. Indeed, many bios have been
queued in the throtl_grp's throtl_service_queue. If a process tries to write
a page belonging to the last bio in this queue, it will call
wait_on_page_writeback(page), which must wait for the previous bios to
finish and will take a long time. We have also seen 120s hung task warnings
on our servers:

 INFO: task kworker/u128:0:30072 blocked for more than 120 seconds.
       Tainted: G            E 4.9.147-013.ali3000_015_test.alios7.x86_64 #1
 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 kworker/u128:0  D    0 30072      2 0x00000000
 Workqueue: writeback wb_workfn (flush-8:16)
  ffff882ddd066b40 0000000000000000 ffff882e5cad3400 ffff882fbe959e80
  ffff882fa50b1a00 ffffc9003a5a3768 ffffffff8173325d ffffc9003a5a3780
  00ff882e5cad3400 ffff882fbe959e80 ffffffff81360b49 ffff882e5cad3400
 Call Trace:
  [<ffffffff8173325d>] ? __schedule+0x23d/0x6d0
  [<ffffffff81360b49>] ? alloc_request_struct+0x19/0x20
  [<ffffffff81733726>] schedule+0x36/0x80
  [<ffffffff81736c56>] schedule_timeout+0x206/0x4b0
  [<ffffffff81036c69>] ? sched_clock+0x9/0x10
  [<ffffffff81363073>] ? get_request+0x403/0x810
  [<ffffffff8110ca10>] ? ktime_get+0x40/0xb0
  [<ffffffff81732f8a>] io_schedule_timeout+0xda/0x170
  [<ffffffff81733f90>] ? bit_wait+0x60/0x60
  [<ffffffff81733fab>] bit_wait_io+0x1b/0x60
  [<ffffffff81733b28>] __wait_on_bit+0x58/0x90
  [<ffffffff811b0d91>] ? find_get_pages_tag+0x161/0x2e0
  [<ffffffff811aff62>] wait_on_page_bit+0x82/0xa0
  [<ffffffff810d47f0>] ? wake_atomic_t_function+0x60/0x60
  [<ffffffffa02fc181>] mpage_prepare_extent_to_map+0x2d1/0x310 [ext4]
  [<ffffffff8121ff65>] ? kmem_cache_alloc+0x185/0x1a0
  [<ffffffffa0305a2f>] ? ext4_init_io_end+0x1f/0x40 [ext4]
  [<ffffffffa0300294>] ext4_writepages+0x404/0xef0 [ext4]
  [<ffffffff81508c64>] ? scsi_init_io+0x44/0x200
  [<ffffffff81398a0f>] ? fprop_fraction_percpu+0x2f/0x80
  [<ffffffff811c139e>] do_writepages+0x1e/0x30
  [<ffffffff8127c0f5>] __writeback_single_inode+0x45/0x320
  [<ffffffff8127c942>] writeback_sb_inodes+0x272/0x600
  [<ffffffff8127cf6b>] wb_writeback+0x10b/0x300
  [<ffffffff8127d884>] wb_workfn+0xb4/0x380
  [<ffffffff810b85e9>] ? try_to_wake_up+0x59/0x3e0
  [<ffffffff810a5759>] process_one_work+0x189/0x420
  [<ffffffff810a5a3e>] worker_thread+0x4e/0x4b0
  [<ffffffff810a59f0>] ? process_one_work+0x420/0x420
  [<ffffffff810ac026>] kthread+0xe6/0x100
  [<ffffffff810abf40>] ? kthread_park+0x60/0x60
  [<ffffffff81738499>] ret_from_fork+0x39/0x50

To fix this issue, we can simply limit the throtl_service_queue's maximum
queued bios. Currently we limit it to the throtl_grp's bps_limit or iops
limit; if it still exceeds the limit, we just sleep for a while.
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Reviewed-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Acked-by: Caspar Zhang <caspar@linux.alibaba.com>
上级 6bb5d410
...@@ -79,6 +79,8 @@ struct throtl_service_queue { ...@@ -79,6 +79,8 @@ struct throtl_service_queue {
*/ */
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
unsigned int nr_queued[2]; /* number of queued bios */ unsigned int nr_queued[2]; /* number of queued bios */
long nr_queued_bytes[2]; /* number of queued bytes */
wait_queue_head_t wait[2];
/* /*
* RB tree of active children throtl_grp's, which are sorted by * RB tree of active children throtl_grp's, which are sorted by
...@@ -486,6 +488,10 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq) ...@@ -486,6 +488,10 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
{ {
INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[0]);
INIT_LIST_HEAD(&sq->queued[1]); INIT_LIST_HEAD(&sq->queued[1]);
sq->nr_queued_bytes[0] = 0;
sq->nr_queued_bytes[1] = 0;
init_waitqueue_head(&sq->wait[0]);
init_waitqueue_head(&sq->wait[1]);
sq->pending_tree = RB_ROOT; sq->pending_tree = RB_ROOT;
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
} }
...@@ -1201,6 +1207,7 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, ...@@ -1201,6 +1207,7 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
sq->nr_queued[rw]++; sq->nr_queued[rw]++;
sq->nr_queued_bytes[rw] += throtl_bio_data_size(bio);
blkg_rwstat_add(&tg->total_bytes_queued, bio_op(bio), blkg_rwstat_add(&tg->total_bytes_queued, bio_op(bio),
throtl_bio_data_size(bio)); throtl_bio_data_size(bio));
blkg_rwstat_add(&tg->total_io_queued, bio_op(bio), 1); blkg_rwstat_add(&tg->total_io_queued, bio_op(bio), 1);
...@@ -1259,6 +1266,15 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) ...@@ -1259,6 +1266,15 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
*/ */
bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
sq->nr_queued[rw]--; sq->nr_queued[rw]--;
sq->nr_queued_bytes[rw] -= throtl_bio_data_size(bio);
WARN_ON_ONCE(sq->nr_queued_bytes[rw] < 0);
if (wq_has_sleeper(&sq->wait[rw])) {
if (sq->nr_queued_bytes[rw] > 0)
wake_up(&sq->wait[rw]);
else
wake_up_all(&sq->wait[rw]);
}
throtl_charge_bio(tg, bio); throtl_charge_bio(tg, bio);
...@@ -2301,7 +2317,7 @@ static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) ...@@ -2301,7 +2317,7 @@ static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
} }
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio) struct bio *bio, wait_queue_head_t **wait)
{ {
struct throtl_qnode *qn = NULL; struct throtl_qnode *qn = NULL;
struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
...@@ -2392,6 +2408,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, ...@@ -2392,6 +2408,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
tg->last_low_overflow_time[rw] = jiffies; tg->last_low_overflow_time[rw] = jiffies;
td->nr_queued[rw]++; td->nr_queued[rw]++;
if (rw == WRITE) {
u64 bps_limit = tg_bps_limit(tg, rw);
if (bps_limit != U64_MAX &&
(wq_has_sleeper(&sq->wait[rw]) ||
sq->nr_queued_bytes[rw] > div_u64(bps_limit, 2)))
*wait = &sq->wait[rw];
}
throtl_add_bio_tg(bio, qn, tg); throtl_add_bio_tg(bio, qn, tg);
throttled = true; throttled = true;
......
...@@ -793,10 +793,13 @@ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, ...@@ -793,10 +793,13 @@ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
#ifdef CONFIG_BLK_DEV_THROTTLING #ifdef CONFIG_BLK_DEV_THROTTLING
extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio); struct bio *bio, wait_queue_head_t **wait);
#else #else
static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio) { return false; } struct bio *bio, wait_queue_head_t **wait)
{
return false;
}
#endif #endif
static inline bool blkcg_bio_issue_check(struct request_queue *q, static inline bool blkcg_bio_issue_check(struct request_queue *q,
...@@ -805,6 +808,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, ...@@ -805,6 +808,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
struct blkcg *blkcg; struct blkcg *blkcg;
struct blkcg_gq *blkg; struct blkcg_gq *blkg;
bool throtl = false; bool throtl = false;
DEFINE_WAIT(wait);
wait_queue_head_t *wait_head = NULL;
rcu_read_lock(); rcu_read_lock();
blkcg = bio_blkcg(bio); blkcg = bio_blkcg(bio);
...@@ -821,7 +826,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, ...@@ -821,7 +826,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
spin_unlock_irq(q->queue_lock); spin_unlock_irq(q->queue_lock);
} }
throtl = blk_throtl_bio(q, blkg, bio); throtl = blk_throtl_bio(q, blkg, bio, &wait_head);
if (!throtl) { if (!throtl) {
blkg = blkg ?: q->root_blkg; blkg = blkg ?: q->root_blkg;
...@@ -837,6 +842,12 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, ...@@ -837,6 +842,12 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
} }
rcu_read_unlock(); rcu_read_unlock();
if (wait_head) {
prepare_to_wait_exclusive(wait_head, &wait, TASK_UNINTERRUPTIBLE);
io_schedule();
finish_wait(wait_head, &wait);
}
return !throtl; return !throtl;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册