diff --git a/src/lxc/lxc_process.c b/src/lxc/lxc_process.c index 50c61c5f9c1ddf14dccdd588b69a064f0747f541..3e7fcb85505f0bfba90e15b61aa78e01e3a442e0 100644 --- a/src/lxc/lxc_process.c +++ b/src/lxc/lxc_process.c @@ -341,7 +341,8 @@ static int virLXCProcessSetupInterfaceBridged(virConnectPtr conn, goto cleanup; if (virNetDevBandwidthSet(net->ifname, - virDomainNetGetActualBandwidth(net)) < 0) { + virDomainNetGetActualBandwidth(net), + false) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("cannot set bandwidth limits on %s"), net->ifname); diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c index 09680ff51751243dc65d0f23f7f614091a1ff432..7ad6020a12a9ae9e914a35507b1b79eb39475b62 100644 --- a/src/network/bridge_driver.c +++ b/src/network/bridge_driver.c @@ -2459,7 +2459,8 @@ networkStartNetworkVirtual(struct network_driver *driver, VIR_FORCE_CLOSE(tapfd); } - if (virNetDevBandwidthSet(network->def->bridge, network->def->bandwidth) < 0) { + if (virNetDevBandwidthSet(network->def->bridge, + network->def->bandwidth, true) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("cannot set bandwidth limits on %s"), network->def->bridge); diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 9009bd296c1d83fca5cdaf718cf1ba0b14f4682e..e10eb09533b36ccc786ca7e1c9feaa845a6e7770 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -292,7 +292,8 @@ qemuNetworkIfaceConnect(virDomainDefPtr def, if (tapfd >= 0 && virNetDevBandwidthSet(net->ifname, - virDomainNetGetActualBandwidth(net)) < 0) { + virDomainNetGetActualBandwidth(net), + false) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("cannot set bandwidth limits on %s"), net->ifname); diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index d035bc317178123c61d7492e59d0095f75af51ba..93d9b69b498024023322c6971cc9360255806825 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -9034,7 +9034,7 @@ qemuDomainSetInterfaceParameters(virDomainPtr dom, sizeof(*newBandwidth->out)); } - if (virNetDevBandwidthSet(net->ifname, newBandwidth) < 0) { + if (virNetDevBandwidthSet(net->ifname, newBandwidth, false) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("cannot set bandwidth limits on %s"), device); diff --git a/src/util/virnetdevbandwidth.c b/src/util/virnetdevbandwidth.c index 49fc4259a77a6ca47100caac8838f91550425115..71c272e330f8d00ac2530cfd20222ee94635144c 100644 --- a/src/util/virnetdevbandwidth.c +++ b/src/util/virnetdevbandwidth.c @@ -45,17 +45,21 @@ virNetDevBandwidthFree(virNetDevBandwidthPtr def) * virNetDevBandwidthSet: * @ifname: on which interface * @bandwidth: rates to set (may be NULL) + * @hierarchical_class: whether to create hierarchical class * * This function enables QoS on specified interface * and set given traffic limits for both, incoming * and outgoing traffic. Any previous setting get - * overwritten. + * overwritten. If @hierarchical_class is TRUE, create + * hierarchical class. It is used to guarantee minimal + * throughput ('floor' attribute in NIC). * * Return 0 on success, -1 otherwise. */ int virNetDevBandwidthSet(const char *ifname, - virNetDevBandwidthPtr bandwidth) + virNetDevBandwidthPtr bandwidth, + bool hierarchical_class) { int ret = -1; virCommandPtr cmd = NULL; @@ -71,7 +75,7 @@ virNetDevBandwidthSet(const char *ifname, virNetDevBandwidthClear(ifname); - if (bandwidth->in) { + if (bandwidth->in && bandwidth->in->average) { if (virAsprintf(&average, "%llukbps", bandwidth->in->average) < 0) goto cleanup; if (bandwidth->in->peak && @@ -83,15 +87,89 @@ virNetDevBandwidthSet(const char *ifname, cmd = virCommandNew(TC); virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "root", - "handle", "1:", "htb", "default", "1", NULL); + "handle", "1:", "htb", "default", + hierarchical_class ? "2" : "1", NULL); if (virCommandRun(cmd, NULL) < 0) goto cleanup; + /* If we are creating a hierarchical class, all non guaranteed traffic + * goes to the 1:2 class which will adjust 'rate' dynamically as NICs + * with guaranteed throughput are plugged and unplugged. Class 1:1 + * exists so we don't exceed the maximum limit for the network. For each + * NIC with guaranteed throughput a separate classid will be created. + * NB '1:' is just a shorter notation of '1:0'. + * + * To get a picture how this works: + * + * +-----+ +---------+ +-----------+ +-----------+ +-----+ + * | | | qdisc | | class 1:1 | | class 1:2 | | | + * | NIC | | def 1:2 | | rate | | rate | | sfq | + * | | --> | | --> | peak | -+-> | peak | --> | | + * +-----+ +---------+ +-----------+ | +-----------+ +-----+ + * | + * | +-----------+ +-----+ + * | | class 1:3 | | | + * | | rate | | sfq | + * +-> | peak | --> | | + * | +-----------+ +-----+ + * ... + * | +-----------+ +-----+ + * | | class 1:n | | | + * | | rate | | sfq | + * +-> | peak | --> | | + * +-----------+ +-----+ + * + * After the routing decision, when is it clear a packet is to be sent + * via a particular NIC, it is sent to the root qdisc (queueing + * discipline). In this case HTB (Hierarchical Token Bucket). It has + * only one direct child class (with id 1:1) which shapes the overall + * rate that is sent through the NIC. This class has at least one child + * (1:2) which is meant for all non-privileged (non guaranteed) traffic + * from all domains. Then, for each interface with guaranteed + * throughput, a separate class (1:n) is created. Imagine a class is a + * box. Whenever a packet ends up in a class it is stored in this box + * until the kernel sends it, then it is removed from box. Packets are + * placed into boxes based on rules (filters) - e.g. depending on + * destination IP/MAC address. If there is no rule to be applied, the + * root qdisc has a default where such packets go (1:2 in this case). + * Packets come in over and over again and boxes get filled more and + * more. Imagine that kernel sends packets just once a second. So it + * starts to traverse through this tree. It starts with the root qdisc + * and through 1:1 it gets to 1:2. It sends packets up to 1:2's 'rate'. + * Then it moves to 1:3 and again sends packets up to 1:3's 'rate'. The + * whole process is repeated until 1:n is processed. So now we have + * ensured each class its guaranteed bandwidth. If the sum of sent data + * doesn't exceed the 'rate' in 1:1 class, we can go further and send + * more packets. The rest of available bandwidth is distributed to the + * 1:2,1:3...1:n classes by ratio of their 'rate'. As soon as the root + * 'rate' limit is reached or there are no more packets to send, we stop + * sending and wait another second. Each class has an SFQ qdisc which + * shuffles packets in boxes stochastically, so one sender cannot + * starve others. + * + * Therefore, whenever we want to plug in a new guaranteed interface, we + * need to create a new class and adjust the 'rate' of the 1:2 class. + * When unplugging we do the exact opposite - remove the associated + * class, and adjust the 'rate'. + * + * This description is rather long, but it is still a good idea to read + * it before you dig into the code. + */ + if (hierarchical_class) { + virCommandFree(cmd); + cmd = virCommandNew(TC); + virCommandAddArgList(cmd, "class", "add", "dev", ifname, "parent", + "1:", "classid", "1:1", "htb", "rate", average, + "ceil", peak ? peak : average, NULL); + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + } virCommandFree(cmd); cmd = virCommandNew(TC); virCommandAddArgList(cmd,"class", "add", "dev", ifname, "parent", - "1:", "classid", "1:1", "htb", NULL); - virCommandAddArgList(cmd, "rate", average, NULL); + hierarchical_class ? "1:1" : "1:", "classid", + hierarchical_class ? "1:2" : "1:1", "htb", + "rate", average, NULL); if (peak) virCommandAddArgList(cmd, "ceil", peak, NULL); @@ -104,7 +182,8 @@ virNetDevBandwidthSet(const char *ifname, virCommandFree(cmd); cmd = virCommandNew(TC); virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "parent", - "1:1", "handle", "2:", "sfq", "perturb", + hierarchical_class ? "1:2" : "1:1", + "handle", "2:", "sfq", "perturb", "10", NULL); if (virCommandRun(cmd, NULL) < 0) diff --git a/src/util/virnetdevbandwidth.h b/src/util/virnetdevbandwidth.h index 35f8b89cb2e80bc6135c6e213fdf423f913428a8..d308ab27ecb073329013872a509abadec9344dc2 100644 --- a/src/util/virnetdevbandwidth.h +++ b/src/util/virnetdevbandwidth.h @@ -42,7 +42,9 @@ struct _virNetDevBandwidth { void virNetDevBandwidthFree(virNetDevBandwidthPtr def); -int virNetDevBandwidthSet(const char *ifname, virNetDevBandwidthPtr bandwidth) +int virNetDevBandwidthSet(const char *ifname, + virNetDevBandwidthPtr bandwidth, + bool hierarchical_class) ATTRIBUTE_NONNULL(1) ATTRIBUTE_RETURN_CHECK; int virNetDevBandwidthClear(const char *ifname) ATTRIBUTE_NONNULL(1); diff --git a/src/util/virnetdevmacvlan.c b/src/util/virnetdevmacvlan.c index d8e646ad935429e05fbe60d6917668e566fe6c5a..657c484158db9c03cf255a289247b915c43933d5 100644 --- a/src/util/virnetdevmacvlan.c +++ b/src/util/virnetdevmacvlan.c @@ -925,7 +925,7 @@ create_name: rc = 0; } - if (virNetDevBandwidthSet(cr_ifname, bandwidth) < 0) { + if (virNetDevBandwidthSet(cr_ifname, bandwidth, false) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("cannot set bandwidth limits on %s"), cr_ifname);