diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 0d5b79365d0350cc73f185ae85f11235d56ac7cf..410b33d014d2dfc3fb0c8ad0c6bb0c20a68ddbc2 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -127,6 +127,27 @@ struct tc_multiq_qopt { __u16 max_bands; /* Maximum number of queues */ }; +/* PLUG section */ + +#define TCQ_PLUG_BUFFER 0 +#define TCQ_PLUG_RELEASE_ONE 1 +#define TCQ_PLUG_RELEASE_INDEFINITE 2 +#define TCQ_PLUG_LIMIT 3 + +struct tc_plug_qopt { + /* TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ + int action; + __u32 limit; +}; + /* TBF section */ struct tc_tbf_qopt { diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2590e91b3289f604b551b799a2d723874878ab3b..75b58f81d53d316d3959ba71909adacd9010aee3 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -260,6 +260,32 @@ config NET_SCH_INGRESS To compile this code as a module, choose M here: the module will be called sch_ingress. +config NET_SCH_PLUG + tristate "Plug network traffic until release (PLUG)" + ---help--- + + This queuing discipline allows userspace to plug/unplug a network + output queue, using the netlink interface. When it receives an + enqueue command it inserts a plug into the outbound queue that + causes following packets to enqueue until a dequeue command arrives + over netlink, causing the plug to be removed and resuming the normal + packet flow. + + This module also provides a generic "network output buffering" + functionality (aka output commit), wherein upon arrival of a dequeue + command, only packets up to the first plug are released for delivery. + The Remus HA project uses this module to enable speculative execution + of virtual machines by allowing the generated network output to be rolled + back if needed. + + For more information, please refer to http://wiki.xensource.com/xenwiki/Remus + + Say Y here if you are using this kernel for Xen dom0 and + want to protect Xen guests with Remus. + + To compile this code as a module, choose M here: the + module will be called sch_plug. + comment "Classification" config NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index dc5889c0a15a341c302a881a36a6607f219a72bf..8cdf4e2b51d3756edbcff81895dad81b16a5741e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o +obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c new file mode 100644 index 0000000000000000000000000000000000000000..ba7b737e40551d140c01217f946358b2022d706c --- /dev/null +++ b/net/sched/sch_plug.c @@ -0,0 +1,233 @@ +/* + * sch_plug.c Queue traffic until an explicit release command + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * There are two ways to use this qdisc: + * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating + * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. + * + * 2. For network output buffering (a.k.a output commit) functionality. + * Output commit property is commonly used by applications using checkpoint + * based fault-tolerance to ensure that the checkpoint from which a system + * is being restored is consistent w.r.t outside world. + * + * Consider for e.g. Remus - a Virtual Machine checkpointing system, + * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated + * asynchronously to the backup host, while the VM continues executing the + * next epoch speculatively. + * + * The following is a typical sequence of output buffer operations: + * 1.At epoch i, start_buffer(i) + * 2. At end of epoch i (i.e. after 50ms): + * 2.1 Stop VM and take checkpoint(i). + * 2.2 start_buffer(i+1) and Resume VM + * 3. While speculatively executing epoch(i+1), asynchronously replicate + * checkpoint(i) to backup host. + * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) + * Thus, this Qdisc would receive the following sequence of commands: + * TCQ_PLUG_BUFFER (epoch i) + * .. TCQ_PLUG_BUFFER (epoch i+1) + * ....TCQ_PLUG_RELEASE_ONE (epoch i) + * ......TCQ_PLUG_BUFFER (epoch i+2) + * ........ + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * State of the queue, when used for network output buffering: + * + * plug(i+1) plug(i) head + * ------------------+--------------------+----------------> + * | | + * | | + * pkts_current_epoch| pkts_last_epoch |pkts_to_release + * ----------------->|<--------+--------->|+---------------> + * v v + * + */ + +struct plug_sched_data { + /* If true, the dequeue function releases all packets + * from head to end of the queue. The queue turns into + * a pass-through queue for newly arriving packets. + */ + bool unplug_indefinite; + + /* Queue Limit in bytes */ + u32 limit; + + /* Number of packets (output) from the current speculatively + * executing epoch. + */ + u32 pkts_current_epoch; + + /* Number of packets corresponding to the recently finished + * epoch. These will be released when we receive a + * TCQ_PLUG_RELEASE_ONE command. This command is typically + * issued after committing a checkpoint at the target. + */ + u32 pkts_last_epoch; + + /* + * Number of packets from the head of the queue, that can + * be released (committed checkpoint). + */ + u32 pkts_to_release; +}; + +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (likely(sch->qstats.backlog + skb->len <= q->limit)) { + if (!q->unplug_indefinite) + q->pkts_current_epoch++; + return qdisc_enqueue_tail(skb, sch); + } + + return qdisc_reshape_fail(skb, sch); +} + +static struct sk_buff *plug_dequeue(struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (qdisc_is_throttled(sch)) + return NULL; + + if (!q->unplug_indefinite) { + if (!q->pkts_to_release) { + /* No more packets to dequeue. Block the queue + * and wait for the next release command. + */ + qdisc_throttled(sch); + return NULL; + } + q->pkts_to_release--; + } + + return qdisc_dequeue_head(sch); +} + +static int plug_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + q->pkts_current_epoch = 0; + q->pkts_last_epoch = 0; + q->pkts_to_release = 0; + q->unplug_indefinite = false; + + if (opt == NULL) { + /* We will set a default limit of 100 pkts (~150kB) + * in case tx_queue_len is not available. The + * default value is completely arbitrary. + */ + u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; + q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); + } else { + struct tc_plug_qopt *ctl = nla_data(opt); + + if (nla_len(opt) < sizeof(*ctl)) + return -EINVAL; + + q->limit = ctl->limit; + } + + qdisc_throttled(sch); + return 0; +} + +/* Receives 4 types of messages: + * TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ +static int plug_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + struct tc_plug_qopt *msg; + + if (opt == NULL) + return -EINVAL; + + msg = nla_data(opt); + if (nla_len(opt) < sizeof(*msg)) + return -EINVAL; + + switch (msg->action) { + case TCQ_PLUG_BUFFER: + /* Save size of the current buffer */ + q->pkts_last_epoch = q->pkts_current_epoch; + q->pkts_current_epoch = 0; + if (q->unplug_indefinite) + qdisc_throttled(sch); + q->unplug_indefinite = false; + break; + case TCQ_PLUG_RELEASE_ONE: + /* Add packets from the last complete buffer to the + * packets to be released set. + */ + q->pkts_to_release += q->pkts_last_epoch; + q->pkts_last_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_RELEASE_INDEFINITE: + q->unplug_indefinite = true; + q->pkts_to_release = 0; + q->pkts_last_epoch = 0; + q->pkts_current_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_LIMIT: + /* Limit is supplied in bytes */ + q->limit = msg->limit; + break; + default: + return -EINVAL; + } + + return 0; +} + +struct Qdisc_ops plug_qdisc_ops = { + .id = "plug", + .priv_size = sizeof(struct plug_sched_data), + .enqueue = plug_enqueue, + .dequeue = plug_dequeue, + .peek = qdisc_peek_head, + .init = plug_init, + .change = plug_change, + .owner = THIS_MODULE, +}; + +static int __init plug_module_init(void) +{ + return register_qdisc(&plug_qdisc_ops); +} + +static void __exit plug_module_exit(void) +{ + unregister_qdisc(&plug_qdisc_ops); +} +module_init(plug_module_init) +module_exit(plug_module_exit) +MODULE_LICENSE("GPL");