提交 c9d0c6eb 编写于 作者: D David S. Miller

Merge branch 'pie-next'

Leslie Monis says:

====================
net: sched: pie: align PIE implementation with RFC 8033

The current implementation of the PIE queuing discipline is according to the
IETF draft [http://tools.ietf.org/html/draft-pan-aqm-pie-00] and the paper
[PIE: A Lightweight Control Scheme to Address the Bufferbloat Problem].
However, a lot of necessary modifications and enhancements have been proposed
in RFC 8033, which have not yet been incorporated in the source code of Linux.
This patch series helps in achieving the same.

Performance tests were carried out using Flent [https://flent.org/].

Changes from v2 to v3:
  - Used div_u64() instead of direct division after explicit type casting as
    recommended by David

Changes from v1 to v2:
  - Excluded the patch setting PIE dynamically active/inactive as the test
    results were unsatisfactory
  - Fixed a scaling issue when adding more auto-tuning cases which caused
    local variables to underflow
  - Changed the long if/else chain to a loop as suggested by Stephen
  - Changed the position of the accu_prob variable in the pie_vars
    structure as recommended by Stephen
====================
Acked-by: Dave Taht <dave.taht@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
......@@ -954,7 +954,7 @@ enum {
#define TCA_PIE_MAX (__TCA_PIE_MAX - 1)
struct tc_pie_xstats {
__u32 prob; /* current probability */
__u64 prob; /* current probability */
__u32 delay; /* current delay in ms */
__u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */
__u32 packets_in; /* total number of packets enqueued */
......
......@@ -17,9 +17,7 @@
* University of Oslo, Norway.
*
* References:
* IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
* IEEE Conference on High Performance Switching and Routing 2013 :
* "PIE: A Lightweight Control Scheme to Address the Bufferbloat Problem"
* RFC 8033: https://tools.ietf.org/html/rfc8033
*/
#include <linux/module.h>
......@@ -31,9 +29,9 @@
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#define QUEUE_THRESHOLD 10000
#define QUEUE_THRESHOLD 16384
#define DQCOUNT_INVALID -1
#define MAX_PROB 0xffffffff
#define MAX_PROB 0xffffffffffffffff
#define PIE_SCALE 8
/* parameters used */
......@@ -49,14 +47,16 @@ struct pie_params {
/* variables used */
struct pie_vars {
u32 prob; /* probability but scaled by u32 limit. */
u64 prob; /* probability but scaled by u64 limit. */
psched_time_t burst_time;
psched_time_t qdelay;
psched_time_t qdelay_old;
u64 dq_count; /* measured in bytes */
psched_time_t dq_tstamp; /* drain rate */
u64 accu_prob; /* accumulated drop probability */
u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
u32 qlen_old; /* in bytes */
u8 accu_prob_overflows; /* overflows of accu_prob */
};
/* statistics gathering */
......@@ -81,9 +81,9 @@ static void pie_params_init(struct pie_params *params)
{
params->alpha = 2;
params->beta = 20;
params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */
params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */
params->limit = 1000; /* default of 1000 packets */
params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */
params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */
params->ecn = false;
params->bytemode = false;
}
......@@ -91,16 +91,18 @@ static void pie_params_init(struct pie_params *params)
static void pie_vars_init(struct pie_vars *vars)
{
vars->dq_count = DQCOUNT_INVALID;
vars->accu_prob = 0;
vars->avg_dq_rate = 0;
/* default of 100 ms in pschedtime */
vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
/* default of 150 ms in pschedtime */
vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC);
vars->accu_prob_overflows = 0;
}
static bool drop_early(struct Qdisc *sch, u32 packet_size)
{
struct pie_sched_data *q = qdisc_priv(sch);
u32 rnd;
u32 local_prob = q->vars.prob;
u64 rnd;
u64 local_prob = q->vars.prob;
u32 mtu = psched_mtu(qdisc_dev(sch));
/* If there is still burst allowance left skip random early drop */
......@@ -124,13 +126,33 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
* probability. Smaller packets will have lower drop prob in this case
*/
if (q->params.bytemode && packet_size <= mtu)
local_prob = (local_prob / mtu) * packet_size;
local_prob = (u64)packet_size * div_u64(local_prob, mtu);
else
local_prob = q->vars.prob;
rnd = prandom_u32();
if (rnd < local_prob)
if (local_prob == 0) {
q->vars.accu_prob = 0;
q->vars.accu_prob_overflows = 0;
}
if (local_prob > MAX_PROB - q->vars.accu_prob)
q->vars.accu_prob_overflows++;
q->vars.accu_prob += local_prob;
if (q->vars.accu_prob_overflows == 0 &&
q->vars.accu_prob < (MAX_PROB / 100) * 85)
return false;
if (q->vars.accu_prob_overflows == 8 &&
q->vars.accu_prob >= MAX_PROB / 2)
return true;
prandom_bytes(&rnd, 8);
if (rnd < local_prob) {
q->vars.accu_prob = 0;
q->vars.accu_prob_overflows = 0;
return true;
}
return false;
}
......@@ -168,6 +190,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
out:
q->stats.dropped++;
q->vars.accu_prob = 0;
q->vars.accu_prob_overflows = 0;
return qdisc_drop(skb, sch, to_free);
}
......@@ -317,9 +341,10 @@ static void calculate_probability(struct Qdisc *sch)
u32 qlen = sch->qstats.backlog; /* queue size in bytes */
psched_time_t qdelay = 0; /* in pschedtime */
psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
s32 delta = 0; /* determines the change in probability */
u32 oldprob;
u32 alpha, beta;
s64 delta = 0; /* determines the change in probability */
u64 oldprob;
u64 alpha, beta;
u32 power;
bool update_prob = true;
q->vars.qdelay_old = q->vars.qdelay;
......@@ -339,38 +364,36 @@ static void calculate_probability(struct Qdisc *sch)
* value for alpha as 0.125. In this implementation, we use values 0-32
* passed from user space to represent this. Also, alpha and beta have
* unit of HZ and need to be scaled before they can be used to update
* probability. alpha/beta are updated locally below by 1) scaling them
* appropriately 2) scaling down by 16 to come to 0-2 range.
* Please see paper for details.
*
* We scale alpha and beta differently depending on whether we are in
* light, medium or high dropping mode.
* probability. alpha/beta are updated locally below by scaling down
* by 16 to come to 0-2 range.
*/
if (q->vars.prob < MAX_PROB / 100) {
alpha =
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
beta =
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
} else if (q->vars.prob < MAX_PROB / 10) {
alpha =
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
beta =
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
} else {
alpha =
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
beta =
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
/* We scale alpha and beta differently depending on how heavy the
* congestion is. Please see RFC 8033 for details.
*/
if (q->vars.prob < MAX_PROB / 10) {
alpha >>= 1;
beta >>= 1;
power = 100;
while (q->vars.prob < div_u64(MAX_PROB, power) &&
power <= 1000000) {
alpha >>= 2;
beta >>= 2;
power *= 10;
}
}
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
delta += alpha * ((qdelay - q->params.target));
delta += beta * ((qdelay - qdelay_old));
delta += alpha * (u64)(qdelay - q->params.target);
delta += beta * (u64)(qdelay - qdelay_old);
oldprob = q->vars.prob;
/* to ensure we increase probability in steps of no more than 2% */
if (delta > (s32)(MAX_PROB / (100 / 2)) &&
if (delta > (s64)(MAX_PROB / (100 / 2)) &&
q->vars.prob >= MAX_PROB / 10)
delta = (MAX_PROB / 100) * 2;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册