/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2014 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37 38 39

#define VIR_FROM_THIS VIR_FROM_QEMU

40 41
VIR_LOG_INIT("qemu.qemu_cgroup");

42 43 44 45
/*
 * NULL-terminated list of device nodes that qemuSetupDevicesCgroup()
 * allows read-write when the driver configuration does not provide its
 * own cgroupDeviceACL list.
 */
static const char *const defaultDeviceACL[] = {
    "/dev/null",
    "/dev/full",
    "/dev/zero",
    "/dev/random",
    "/dev/urandom",
    "/dev/ptmx",
    "/dev/kvm",
    "/dev/kqemu",
    "/dev/rtc",
    "/dev/hpet",
    "/dev/vfio/vfio",
    NULL,
};

/* Character-device major numbers allowed wholesale (PTYs and sound). */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

52 53 54 55 56
/*
 * qemuSetImageCgroupInternal:
 * @vm: domain object; its private data carries the cgroup
 * @src: storage source whose path is allowed or denied
 * @deny: true to revoke access to @src, false to grant it
 * @forceReadonly: when granting, never add write permission
 *
 * Updates the devices cgroup ACL for one storage source and records the
 * attempt in the audit log.  Nothing is done when the devices controller
 * is not present, when @src has no path, or when @src is not local
 * storage.
 *
 * Returns 0 on success or when nothing had to be done, otherwise the
 * negative result of the cgroup call.  An EACCES failure is ignored and
 * reported as success.
 */
static int
qemuSetImageCgroupInternal(virDomainObjPtr vm,
                           virStorageSourcePtr src,
                           bool deny,
                           bool forceReadonly)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int perms = VIR_CGROUP_DEVICE_READ;
    int ret;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (!src->path || !virStorageSourceIsLocalStorage(src)) {
        VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
                  NULLSTR(src->path), virStorageTypeToString(src->type));
        return 0;
    }

    if (deny) {
        /* Revoke everything: read, write and mknod. */
        perms |= VIR_CGROUP_DEVICE_WRITE | VIR_CGROUP_DEVICE_MKNOD;

        VIR_DEBUG("Deny path %s", src->path);

        ret = virCgroupDenyDevicePath(priv->cgroup, src->path, perms);
    } else {
        /* Write access only for writable sources the caller did not
         * force to read-only. */
        if (!src->readonly && !forceReadonly)
            perms |= VIR_CGROUP_DEVICE_WRITE;

        VIR_DEBUG("Allow path %s, perms: %s",
                  src->path, virCgroupGetDevicePermsString(perms));

        ret = virCgroupAllowDevicePath(priv->cgroup, src->path, perms);
    }

    /* The audit record reflects the raw result, before EACCES is
     * forgiven below. */
    virDomainAuditCgroupPath(vm, priv->cgroup,
                             deny ? "deny" : "allow",
                             src->path,
                             virCgroupGetDevicePermsString(perms),
                             ret == 0);

    /* Get this for root squash NFS */
    if (ret < 0 &&
        virLastErrorIsSystemErrno(EACCES)) {
        VIR_DEBUG("Ignoring EACCES for %s", src->path);
        virResetLastError();
        ret = 0;
    }

    return ret;
}


106 107 108 109 110 111 112 113 114
/*
 * qemuSetImageCgroup:
 * @vm: domain object
 * @src: storage source to allow or deny
 * @deny: true to revoke access, false to grant it
 *
 * Public wrapper around qemuSetImageCgroupInternal() that never forces
 * the source to read-only.
 *
 * Returns whatever qemuSetImageCgroupInternal() returns.
 */
int
qemuSetImageCgroup(virDomainObjPtr vm,
                   virStorageSourcePtr src,
                   bool deny)
{
    const bool forceReadonly = false;

    return qemuSetImageCgroupInternal(vm, src, deny, forceReadonly);
}


115 116 117
/*
 * qemuSetupDiskCgroup:
 * @vm: domain object
 * @disk: disk definition whose image chain is to be allowed
 *
 * Allows every image in the backing chain of @disk in the devices
 * cgroup.  Only the top-level image may receive write permission; all
 * backing images are forced to read-only.
 *
 * Returns 0 on success, -1 as soon as one image fails.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    virStorageSourcePtr next;
    bool forceReadonly = false;

    for (next = disk->src; next; next = next->backingStore) {
        if (qemuSetImageCgroupInternal(vm, next, false, forceReadonly) < 0)
            return -1;

        /* setup only the top level image for read-write */
        forceReadonly = true;
    }

    return 0;
}


134 135 136
/*
 * qemuTeardownDiskCgroup:
 * @vm: domain object
 * @disk: disk definition whose image chain is to be denied
 *
 * Denies every image in the backing chain of @disk in the devices
 * cgroup, walking from the top-level image down.
 *
 * Returns 0 on success, -1 as soon as one image fails (remaining images
 * in the chain are then left untouched).
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    virStorageSourcePtr next;

    for (next = disk->src; next; next = next->backingStore) {
        if (qemuSetImageCgroup(vm, next, true) < 0)
            return -1;
    }

    return 0;
}

148

149
static int
150
qemuSetupChrSourceCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
151
                         virDomainChrSourceDefPtr dev,
152
                         void *opaque)
153
{
154 155
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
156
    int ret;
157

158
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
159 160
        return 0;

161
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
162

163 164
    ret = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
                                   VIR_CGROUP_DEVICE_RW);
165
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
166
                             dev->data.file.path, "rw", ret == 0);
167

168
    return ret;
169 170
}

171 172 173 174 175
/*
 * qemuSetupChardevCgroup:
 * @def: domain definition
 * @dev: character device definition
 * @opaque: the virDomainObjPtr of the domain
 *
 * Callback passed to virDomainChrDefForeach(); forwards the source of
 * @dev to qemuSetupChrSourceCgroup().
 *
 * Returns whatever qemuSetupChrSourceCgroup() returns.
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    return qemuSetupChrSourceCgroup(def, &dev->source, opaque);
}


/*
 * qemuSetupTPMCgroup:
 * @def: domain definition
 * @dev: TPM device definition
 * @opaque: the virDomainObjPtr of the domain
 *
 * Allows the character device source of a passthrough TPM in the
 * devices cgroup.  Other TPM types need nothing.
 *
 * Returns 0 when nothing is done, otherwise the result of
 * qemuSetupChrSourceCgroup().
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    int ret = 0;

    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
        ret = qemuSetupChrSourceCgroup(def, &dev->data.passthrough.source,
                                       opaque);
        break;
    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

    return ret;
}

199

200
static int
201
qemuSetupHostUSBDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
202 203
                             const char *path,
                             void *opaque)
204
{
205 206
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
207
    int ret;
208 209

    VIR_DEBUG("Process path '%s' for USB device", path);
210 211 212
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", ret == 0);
213

214
    return ret;
215 216
}

217
static int
218
qemuSetupHostSCSIDeviceCgroup(virSCSIDevicePtr dev ATTRIBUTE_UNUSED,
219 220 221 222 223
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
224
    int ret;
225 226 227

    VIR_DEBUG("Process path '%s' for SCSI device", path);

228 229 230 231
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   virSCSIDeviceGetReadonly(dev) ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);
232 233

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
234
                             virSCSIDeviceGetReadonly(dev) ? "r" : "rw", ret == 0);
235

236
    return ret;
237
}
238

239 240 241 242 243 244
/*
 * qemuSetupHostdevCGroup:
 * @vm: domain object
 * @dev: host device definition
 *
 * Allows the device nodes backing a host device in the domain's devices
 * cgroup.  It is called for *all* hostdev devices, but only acts on
 * subsystem-mode devices of these types:
 *
 *  - PCI using the VFIO backend: the IOMMU group device node (rw)
 *  - USB, unless marked missing: every path reported by
 *    virUSBDeviceFileIterate() (rw)
 *  - SCSI host devices: every path reported by
 *    virSCSIDeviceFileIterate() (r or rw); iSCSI sources are skipped
 *
 * Returns 0 on success or when there is nothing to do (including when
 * the devices controller is absent), -1 on failure.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi;
    virPCIDevicePtr pci = NULL;
    virUSBDevicePtr usb = NULL;
    virSCSIDevicePtr scsi = NULL;
    char *path = NULL;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
            /* Only VFIO assignment needs a device node opened. */
            if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
                int rv;

                pci = virPCIDeviceNew(pcisrc->addr.domain,
                                      pcisrc->addr.bus,
                                      pcisrc->addr.slot,
                                      pcisrc->addr.function);
                if (!pci)
                    goto cleanup;

                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
                    goto cleanup;

                VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
                rv = virCgroupAllowDevicePath(priv->cgroup, path,
                                              VIR_CGROUP_DEVICE_RW);
                virDomainAuditCgroupPath(vm, priv->cgroup,
                                         "allow", path, "rw", rv == 0);
                if (rv < 0)
                    goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* NB: hostdev->missing wasn't previously checked in the
             * case of hotplug, only when starting a domain. Now it is
             * always checked, and the cgroup setup skipped if true.
             */
            if (dev->missing)
                break;
            if ((usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device,
                                       NULL)) == NULL) {
                goto cleanup;
            }

            /* oddly, qemuSetupHostUSBDeviceCgroup doesn't ever
             * reference the usb object we just created
             */
            if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup,
                                        vm) < 0) {
                goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: {
            if (scsisrc->protocol ==
                VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) {
                virDomainHostdevSubsysSCSIiSCSIPtr iscsisrc = &scsisrc->u.iscsi;
                /* Follow qemuSetupDiskCgroup() and qemuSetImageCgroupInternal()
                 * which does nothing for non local storage
                 */
                VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'",
                          iscsisrc->path);
            } else {
                virDomainHostdevSubsysSCSIHostPtr scsihostsrc =
                    &scsisrc->u.host;
                if ((scsi = virSCSIDeviceNew(NULL,
                                             scsihostsrc->adapter,
                                             scsihostsrc->bus,
                                             scsihostsrc->target,
                                             scsihostsrc->unit,
                                             dev->readonly,
                                             dev->shareable)) == NULL)
                    goto cleanup;

                if (virSCSIDeviceFileIterate(scsi,
                                             qemuSetupHostSCSIDeviceCgroup,
                                             vm) < 0)
                    goto cleanup;
            }
            break;
        }

        default:
            break;
        }
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    virUSBDeviceFree(usb);
    virSCSIDeviceFree(scsi);
    VIR_FREE(path);
    return ret;
}

/*
 * qemuTeardownHostdevCgroup:
 * @vm: domain object
 * @dev: host device definition
 *
 * Revokes (rwm) the IOMMU group device node of a PCI host device that
 * uses the VFIO backend.  It is called for *all* hostdev devices, but
 * nothing is torn down for any other kind of device.
 *
 * Returns 0 on success or when there is nothing to do (including when
 * the devices controller is absent), -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                          virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virPCIDevicePtr pci = NULL;
    char *path = NULL;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
            if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
                int rv;

                pci = virPCIDeviceNew(pcisrc->addr.domain,
                                      pcisrc->addr.bus,
                                      pcisrc->addr.slot,
                                      pcisrc->addr.function);
                if (!pci)
                    goto cleanup;

                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
                    goto cleanup;

                VIR_DEBUG("Cgroup deny %s for PCI device assignment", path);
                rv = virCgroupDenyDevicePath(priv->cgroup, path,
                                             VIR_CGROUP_DEVICE_RWM);
                virDomainAuditCgroupPath(vm, priv->cgroup,
                                         "deny", path, "rwm", rv == 0);
                if (rv < 0)
                    goto cleanup;
            }
            break;
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* nothing to tear down for USB */
            break;
        default:
            break;
        }
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(path);
    return ret;
}

411 412 413 414
/*
 * qemuSetupBlkioCgroup:
 * @vm: domain object
 *
 * Applies the block I/O tuning from the domain definition to the blkio
 * cgroup: the global weight and, per configured device, weight, read and
 * write IOPS limits and read and write bytes-per-second limits.  Values
 * of zero are treated as "not set" and skipped.
 *
 * Returns 0 on success, -1 on failure.  When the blkio controller is
 * absent it is an error only if the definition asks for tuning.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    size_t i;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (vm->def->blkio.weight || vm->def->blkio.ndevices) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("Block I/O tuning is not available on this host"));
            return -1;
        }

        return 0;
    }

    if (vm->def->blkio.weight != 0 &&
        virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight) < 0)
        return -1;

    /* The loop body is simply never entered when ndevices is 0. */
    for (i = 0; i < vm->def->blkio.ndevices; i++) {
        virBlkioDevicePtr dev = &vm->def->blkio.devices[i];

        if (dev->weight &&
            virCgroupSetBlkioDeviceWeight(priv->cgroup, dev->path,
                                          dev->weight) < 0)
            return -1;

        if (dev->riops &&
            virCgroupSetBlkioDeviceReadIops(priv->cgroup, dev->path,
                                            dev->riops) < 0)
            return -1;

        if (dev->wiops &&
            virCgroupSetBlkioDeviceWriteIops(priv->cgroup, dev->path,
                                             dev->wiops) < 0)
            return -1;

        if (dev->rbps &&
            virCgroupSetBlkioDeviceReadBps(priv->cgroup, dev->path,
                                           dev->rbps) < 0)
            return -1;

        if (dev->wbps &&
            virCgroupSetBlkioDeviceWriteBps(priv->cgroup, dev->path,
                                            dev->wbps) < 0)
            return -1;
    }

    return 0;
}

465

466 467 468 469 470
/*
 * qemuSetupMemoryCgroup:
 * @vm: domain object
 *
 * Applies the hard, soft and swap hard memory limits from the domain
 * definition to the memory cgroup.  Limits of zero are treated as "not
 * set" and skipped.
 *
 * Returns 0 on success, -1 on failure.  When the memory controller is
 * absent it is an error only if the definition sets any limit.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (vm->def->mem.hard_limit != 0 ||
            vm->def->mem.soft_limit != 0 ||
            vm->def->mem.swap_hard_limit != 0) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("Memory cgroup is not available on this host"));
            return -1;
        }

        return 0;
    }

    if (vm->def->mem.hard_limit != 0 &&
        virCgroupSetMemoryHardLimit(priv->cgroup, vm->def->mem.hard_limit) < 0)
        return -1;

    if (vm->def->mem.soft_limit != 0 &&
        virCgroupSetMemorySoftLimit(priv->cgroup, vm->def->mem.soft_limit) < 0)
        return -1;

    if (vm->def->mem.swap_hard_limit != 0 &&
        virCgroupSetMemSwapHardLimit(priv->cgroup, vm->def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


499 500 501 502 503 504 505
/*
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver, used to fetch the driver configuration
 * @vm: domain object
 *
 * Builds the devices cgroup whitelist for a domain.  After denying all
 * devices it allows, in this order: the disks' image chains, the PTY
 * major, optionally the sound major, the device ACL list (configured or
 * default), character devices, a TPM if present, host devices and
 * RNG devices that use the "random" backend.
 *
 * Returns 0 on success or when the devices controller is absent, -1 on
 * failure.  If the initial deny-all fails with EPERM, whitelisting is
 * abandoned with a warning and 0 is returned.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *deviceACL = NULL;
    int rv = -1;
    int ret = -1;
    size_t i;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rv = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rv == 0);
    if (rv < 0) {
        if (virLastErrorIsSystemErrno(EPERM)) {
            virResetLastError();
            VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
            /* cfg has not been acquired yet, so a plain return is safe. */
            return 0;
        }

        goto cleanup;
    }

    for (i = 0; i < vm->def->ndisks; i++) {
        if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0)
            goto cleanup;
    }

    rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rv == 0);
    if (rv < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    deviceACL = cfg->cgroupDeviceACL ?
                (const char *const *)cfg->cgroupDeviceACL :
                defaultDeviceACL;

    /* Host sound devices are allowed only when the domain has a sound
     * device and either it has no graphics and the configuration permits
     * host audio in that case, or its first graphics device is VNC with
     * host audio permitted, or SDL. */
    if (vm->def->nsounds &&
        ((!vm->def->ngraphics && cfg->nogfxAllowHostAudio) ||
         (vm->def->graphics &&
          ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
           cfg->vncAllowHostAudio) ||
           (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL))))) {
        rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rv == 0);
        if (rv < 0)
            goto cleanup;
    }

    for (i = 0; deviceACL[i] != NULL; i++) {
        if (!virFileExists(deviceACL[i])) {
            VIR_DEBUG("Ignoring non-existent device %s", deviceACL[i]);
            continue;
        }

        rv = virCgroupAllowDevicePath(priv->cgroup, deviceACL[i],
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", deviceACL[i], "rw", rv == 0);
        /* A node that vanished after the existence check is tolerated. */
        if (rv < 0 &&
            !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(vm->def,
                               true,
                               qemuSetupChardevCgroup,
                               vm) < 0)
        goto cleanup;

    if (vm->def->tpm &&
        (qemuSetupTPMCgroup(vm->def,
                            vm->def->tpm,
                            vm) < 0))
        goto cleanup;

    for (i = 0; i < vm->def->nhostdevs; i++) {
        if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[i]) < 0)
            goto cleanup;
    }

    for (i = 0; i < vm->def->nrngs; i++) {
        if (vm->def->rngs[i]->backend == VIR_DOMAIN_RNG_BACKEND_RANDOM) {
            VIR_DEBUG("Setting Cgroup ACL for RNG device");
            rv = virCgroupAllowDevicePath(priv->cgroup,
                                          vm->def->rngs[i]->source.file,
                                          VIR_CGROUP_DEVICE_RW);
            virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                                     vm->def->rngs[i]->source.file,
                                     "rw", rv == 0);
            if (rv < 0 &&
                !virLastErrorIsSystemErrno(ENOENT))
                goto cleanup;
        }
    }

    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}


609
static int
610 611
qemuSetupCpusetMems(virDomainObjPtr vm,
                    virBitmapPtr nodemask)
612 613
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
614
    char *mem_mask = NULL;
615 616 617 618 619
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

620 621
    if (virDomainNumatuneMaybeFormatNodeset(vm->def->numatune,
                                            nodemask,
622
                                            &mem_mask, -1) < 0)
623
        goto cleanup;
624

625 626 627
    if (mem_mask &&
        virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
        goto cleanup;
628

629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647
    ret = 0;
 cleanup:
    VIR_FREE(mem_mask);
    return ret;
}


/*
 * qemuSetupCpusetCgroup:
 * @vm: domain object
 * @nodemask: node mask used for automatic placement
 * @caps: capabilities, used to map @nodemask to CPUs
 *
 * Restricts the cpuset cgroup's CPUs.  With automatic placement the CPU
 * set is derived from @nodemask; otherwise the domain's cpumask is used
 * if it has one.  Without either, nothing is changed.
 *
 * Returns 0 on success or when the cpuset controller is absent, -1 on
 * failure.
 */
static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
                      virBitmapPtr nodemask,
                      virCapsPtr caps)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    char *cpu_mask = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    if (vm->def->cpumask ||
        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {

        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
            virBitmapPtr cpumap;

            if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask)))
                goto cleanup;
            cpu_mask = virBitmapFormat(cpumap);
            /* The temporary bitmap is only needed for formatting. */
            virBitmapFree(cpumap);
        } else {
            cpu_mask = virBitmapFormat(vm->def->cpumask);
        }

        if (!cpu_mask)
            goto cleanup;

        if (virCgroupSetCpusetCpus(priv->cgroup, cpu_mask) < 0)
            goto cleanup;
    }

    ret = 0;
 cleanup:
    VIR_FREE(cpu_mask);
    return ret;
}


675 676 677 678 679 680
/*
 * qemuSetupCpuCgroup:
 * @vm: domain object
 *
 * Applies the CPU shares from the domain definition to the cpu cgroup,
 * then reads the value back from the cgroup and stores it in the
 * definition.
 *
 * Returns 0 on success, -1 on failure.  When the cpu controller is
 * absent it is an error only if shares were specified.
 */
static int
qemuSetupCpuCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        if (vm->def->cputune.sharesSpecified) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("CPU tuning is not available on this host"));
            return -1;
        }

        return 0;
    }

    if (vm->def->cputune.sharesSpecified) {
        unsigned long long val;

        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
            return -1;

        /* Keep the definition in sync with what the cgroup reports. */
        if (virCgroupGetCpuShares(priv->cgroup, &val) < 0)
            return -1;
        vm->def->cputune.shares = val;
    }

    return 0;
}


704
static int
705
qemuInitCgroup(virQEMUDriverPtr driver,
706
               virDomainObjPtr vm)
707
{
708
    int ret = -1;
709 710 711
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

712 713 714
    if (!cfg->privileged)
        goto done;

715 716 717
    if (!virCgroupAvailable())
        goto done;

718 719
    virCgroupFree(&priv->cgroup);

720
    if (!vm->def->resource) {
721 722
        virDomainResourceDefPtr res;

723
        if (VIR_ALLOC(res) < 0)
724
            goto cleanup;
725

726
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
727 728 729 730 731
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
732 733
    }

734 735 736 737 738 739
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
740 741 742 743 744 745 746 747 748 749 750

    if (virCgroupNewMachine(vm->def->name,
                            "qemu",
                            cfg->privileged,
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
751 752
        if (virCgroupNewIgnoreError())
            goto done;
753

754 755
        goto cleanup;
    }
756

757
 done:
758
    ret = 0;
759
 cleanup:
760 761 762
    virObjectUnref(cfg);
    return ret;
}
763

764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780

/*
 * qemuConnectCgroup:
 * @driver: QEMU driver, used to fetch the driver configuration
 * @vm: domain object
 *
 * Detects the machine cgroup of an existing domain process (vm->pid)
 * and stores it in the domain's private data, replacing any cgroup held
 * before.  Nothing is done when the driver is unprivileged or cgroups
 * are unavailable.
 *
 * Returns 0 on success or when nothing is done, -1 on failure.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int ret = -1;

    if (!cfg->privileged)
        goto done;

    if (!virCgroupAvailable())
        goto done;

    virCgroupFree(&priv->cgroup);

    if (virCgroupNewDetectMachine(vm->def->name,
                                  "qemu",
                                  vm->pid,
                                  vm->def->resource ?
                                  vm->def->resource->partition :
                                  NULL,
                                  cfg->cgroupControllers,
                                  &priv->cgroup) < 0)
        goto cleanup;

 done:
    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}

798 799 800 801
/*
 * qemuSetupCgroup:
 * @driver: QEMU driver
 * @vm: domain object; its process must already have been started
 * @nodemask: node mask used for automatic CPU placement
 *
 * Creates the domain's cgroup and configures, in order, the devices,
 * blkio, memory, cpu and cpuset controllers.
 *
 * Returns 0 on success or when no cgroup was created for the domain,
 * -1 on failure (including when vm->pid is not set yet).
 */
int
qemuSetupCgroup(virQEMUDriverPtr driver,
                virDomainObjPtr vm,
                virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virCapsPtr caps = NULL;
    int ret = -1;

    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

    if (qemuInitCgroup(driver, vm) < 0)
        return -1;

    /* qemuInitCgroup() can succeed without creating a cgroup. */
    if (!priv->cgroup)
        return 0;

    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
        goto cleanup;

    if (qemuSetupDevicesCgroup(driver, vm) < 0)
        goto cleanup;

    if (qemuSetupBlkioCgroup(vm) < 0)
        goto cleanup;

    if (qemuSetupMemoryCgroup(vm) < 0)
        goto cleanup;

    if (qemuSetupCpuCgroup(vm) < 0)
        goto cleanup;

    if (qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    virObjectUnref(caps);
    return ret;
}

843 844 845 846 847 848 849
/*
 * qemuSetupCgroupPostInit:
 * @vm: domain object
 * @nodemask: node mask passed on to qemuSetupCpusetMems()
 *
 * Applies the cpuset memory-node restriction for the domain.
 *
 * Returns whatever qemuSetupCpusetMems() returns.
 */
int
qemuSetupCgroupPostInit(virDomainObjPtr vm,
                        virBitmapPtr nodemask)
{
    int rc = qemuSetupCpusetMems(vm, nodemask);

    return rc;
}

850 851 852 853
/*
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to configure
 * @period: CFS period to set, 0 to leave unchanged
 * @quota: CFS quota to set, 0 to leave unchanged
 *
 * Sets the CFS bandwidth parameters of @cgroup.  If a new period was set
 * and setting the quota then fails, the previous period is restored
 * (best effort) and the quota error is kept as the reported error.
 *
 * Returns 0 on success or when both values are 0, -1 on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    unsigned long long old_period;

    if (period == 0 && quota == 0)
        return 0;

    if (period) {
        /* get old period, and we can rollback if set quota failed */
        if (virCgroupGetCpuCfsPeriod(cgroup, &old_period) < 0 ||
            virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (!quota || virCgroupSetCpuCfsQuota(cgroup, quota) >= 0)
        return 0;

    /* Setting the quota failed: undo the period change, preserving the
     * original error across the rollback call. */
    if (period) {
        virErrorPtr saved = virSaveLastError();

        ignore_value(virCgroupSetCpuCfsPeriod(cgroup, old_period));
        if (saved) {
            virSetError(saved);
            virFreeError(saved);
        }
    }

    return -1;
}

888 889 890 891 892
/*
 * qemuSetupCgroupVcpuPin:
 * @cgroup: cgroup to pin
 * @vcpupin: array of vcpu pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid: vcpu whose pinning should be applied
 *
 * Looks up the pinning entry for @vcpuid and applies its CPU mask to
 * @cgroup.
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 when no entry matches (including when @nvcpupin is not
 * positive).
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    size_t i;

    /* Reject a non-positive count before it is compared against the
     * unsigned index: a negative value would otherwise convert to a huge
     * bound and walk off the array. */
    if (nvcpupin <= 0)
        return -1;

    for (i = 0; i < (size_t)nvcpupin; i++) {
        if (vcpuid == vcpupin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, vcpupin[i]->cpumask);
    }

    return -1;
}

905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921
/*
 * qemuSetupCgroupIOThreadsPin:
 * @cgroup: cgroup to pin
 * @iothreadspin: array of I/O thread pinning definitions
 * @niothreadspin: number of entries in @iothreadspin
 * @iothreadid: I/O thread whose pinning should be applied
 *
 * Looks up the pinning entry whose vcpuid field equals @iothreadid and
 * applies its CPU mask to @cgroup.
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 when no entry matches (including when @niothreadspin is
 * not positive).
 */
int
qemuSetupCgroupIOThreadsPin(virCgroupPtr cgroup,
                            virDomainVcpuPinDefPtr *iothreadspin,
                            int niothreadspin,
                            int iothreadid)
{
    size_t i;

    /* Reject a non-positive count before it is compared against the
     * unsigned index: a negative value would otherwise convert to a huge
     * bound and walk off the array. */
    if (niothreadspin <= 0)
        return -1;

    for (i = 0; i < (size_t)niothreadspin; i++) {
        if (iothreadid == iothreadspin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, iothreadspin[i]->cpumask);
    }

    return -1;
}

922 923 924
/*
 * qemuSetupCgroupEmulatorPin:
 * @cgroup: cgroup to pin
 * @cpumask: CPUs the cgroup should be restricted to
 *
 * Formats @cpumask as a string and writes it to the cpuset CPUs of
 * @cgroup.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    int ret = -1;
    char *new_cpus = NULL;

    if (!(new_cpus = virBitmapFormat(cpumask)))
        goto cleanup;

    if (virCgroupSetCpusetCpus(cgroup, new_cpus) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    VIR_FREE(new_cpus);
    return ret;
}

941 942
/**
 * qemuSetupCgroupForVcpu:
 * @vm: domain object
 *
 * For each entry in priv->vcpupids, creates a per-vCPU cgroup below
 * priv->cgroup, adds the vCPU thread to it, applies the cputune
 * period/quota when either is non-zero and, if the CPUSET controller
 * is available, applies the vcpupin entry whose vcpuid matches.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 * On failure the per-vCPU cgroup being set up at that point is removed.
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = vm->def->cputune.period;
    long long quota = vm->def->cputune.quota;
    virCgroupPtr vcpu_group = NULL;
    size_t vcpu;
    size_t pin;

    /* Bandwidth tuning was requested but cannot be honoured. */
    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * With neither the CPU nor the CPUSET controller there are no
     * period/quota settings to apply and no pinning to do.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    /* CPU pinning can also be done with virProcessSetAffinity, so
     * having no cgroup at all is not an error here.
     */
    if (priv->cgroup == NULL)
        return 0;

    /* Without a VCPU<->PID mapping, or when all vcpus run in the main
     * thread, individual vcpus cannot be controlled.
     */
    if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) {
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < priv->nvcpupids; vcpu++) {
        if (virCgroupNewVcpu(priv->cgroup, vcpu, true, &vcpu_group) < 0)
            goto cleanup;

        /* Move the vcpu thread into its own sub-directory. */
        if (virCgroupAddTask(vcpu_group, priv->vcpupids[vcpu]) < 0)
            goto cleanup;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(vcpu_group, period, quota) < 0)
            goto cleanup;

        /* Apply vcpupin only if the XML provides one for this vcpu. */
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Only call qemuSetupCgroupVcpuPin when a matching entry
             * exists, otherwise it would fail. */
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(vcpu_group,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto cleanup;

                    break;
                }
            }
        }

        virCgroupFree(&vcpu_group);
    }

    return 0;

 cleanup:
    if (vcpu_group) {
        virCgroupRemove(vcpu_group);
        virCgroupFree(&vcpu_group);
    }

    return -1;
}

1026 1027 1028 1029
/**
 * qemuSetupCgroupForEmulator:
 * @driver: QEMU driver, passed to qemuPrepareCpumap()
 * @vm: domain object
 * @nodemask: node mask passed to qemuPrepareCpumap() for automatic
 *            placement
 *
 * Creates the emulator cgroup below priv->cgroup, moves the tasks of
 * priv->cgroup into it, then applies CPU pinning (when a mask can be
 * determined and the CPUSET controller is available) and the emulator
 * period/quota (when either is non-zero).
 *
 * The pinning mask is chosen in this order: a map built by
 * qemuPrepareCpumap() for automatic placement, the emulatorpin mask,
 * the domain-wide cpumask.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 * On failure the emulator cgroup, if it was created, is removed.
 */
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
                           virDomainObjPtr vm,
                           virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = vm->def->cputune.emulator_period;
    long long quota = vm->def->cputune.emulator_quota;
    virCgroupPtr emulator_group = NULL;
    virBitmapPtr auto_map = NULL;   /* owned here, freed on every exit */
    virBitmapPtr pin_mask = NULL;   /* borrowed, never freed directly */

    /* Bandwidth tuning was requested but cannot be honoured. */
    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * With neither the CPU nor the CPUSET controller there are no
     * period/quota settings to apply and no pinning to do.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    if (priv->cgroup == NULL)
        return 0; /* Not supported, so claim success */

    if (virCgroupNewEmulator(priv->cgroup, true, &emulator_group) < 0)
        goto cleanup;

    if (virCgroupMoveTask(priv->cgroup, emulator_group) < 0)
        goto cleanup;

    /* Work out which CPU mask, if any, applies to the emulator. */
    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        auto_map = qemuPrepareCpumap(driver, nodemask);
        if (!auto_map)
            goto cleanup;
        pin_mask = auto_map;
    } else if (def->cputune.emulatorpin) {
        pin_mask = def->cputune.emulatorpin->cpumask;
    } else if (def->cpumask) {
        pin_mask = def->cpumask;
    }

    if (pin_mask &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
        qemuSetupCgroupEmulatorPin(emulator_group, pin_mask) < 0)
        goto cleanup;

    if ((period || quota) &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        qemuSetupCgroupVcpuBW(emulator_group, period, quota) < 0)
        goto cleanup;

    virCgroupFree(&emulator_group);
    virBitmapFree(auto_map);
    return 0;

 cleanup:
    virBitmapFree(auto_map);

    if (emulator_group) {
        virCgroupRemove(emulator_group);
        virCgroupFree(&emulator_group);
    }

    return -1;
}
1101

1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133
/**
 * qemuSetupCgroupForIOThreads:
 * @vm: domain object
 *
 * For each entry in priv->iothreadpids, creates a per-IOThread cgroup
 * below priv->cgroup, adds the IOThread to it, applies the cputune
 * period/quota when either is non-zero and, if the CPUSET controller
 * is available, applies the iothreadspin entry whose vcpuid matches
 * the IOThread's 1-based id.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 * On failure the per-IOThread cgroup being set up at that point is
 * removed.
 */
int
qemuSetupCgroupForIOThreads(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = vm->def->cputune.period;
    long long quota = vm->def->cputune.quota;
    virCgroupPtr iothread_group = NULL;
    size_t idx;
    size_t pin;

    /* Bandwidth tuning was requested but cannot be honoured. */
    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * With neither the CPU nor the CPUSET controller there are no
     * period/quota settings to apply and no pinning to do.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    /* CPU pinning can also be done with virProcessSetAffinity, so
     * having no cgroup at all is not an error here.
     */
    if (priv->cgroup == NULL)
        return 0;

    if (def->iothreads && priv->niothreadpids == 0) {
        VIR_WARN("Unable to get iothreads' pids.");
        return 0;
    }

    for (idx = 0; idx < priv->niothreadpids; idx++) {
        /* IOThreads are numbered 1..n while the pid array is indexed
         * 0..n-1, hence the offset by one.
         */
        size_t iothread_id = idx + 1;

        if (virCgroupNewIOThread(priv->cgroup, iothread_id, true,
                                 &iothread_group) < 0)
            goto cleanup;

        /* Move the IOThread into its own sub-directory. */
        if (virCgroupAddTask(iothread_group, priv->iothreadpids[idx]) < 0)
            goto cleanup;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(iothread_group, period, quota) < 0)
            goto cleanup;

        /* Apply iothreadpin only if the XML provides one for this thread. */
        if (virCgroupHasController(priv->cgroup,
                                   VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Only call qemuSetupCgroupIOThreadsPin when a matching
             * entry exists, otherwise it would fail. */
            for (pin = 0; pin < def->cputune.niothreadspin; pin++) {
                if (def->cputune.iothreadspin[pin]->vcpuid == iothread_id) {
                    if (qemuSetupCgroupIOThreadsPin(iothread_group,
                                                    def->cputune.iothreadspin,
                                                    def->cputune.niothreadspin,
                                                    iothread_id) < 0)
                        goto cleanup;

                    break;
                }
            }
        }

        virCgroupFree(&iothread_group);
    }

    return 0;

 cleanup:
    if (iothread_group) {
        virCgroupRemove(iothread_group);
        virCgroupFree(&iothread_group);
    }

    return -1;
}

1190 1191
/**
 * qemuRemoveCgroup:
 * @vm: domain object
 *
 * Removes the domain's cgroup with virCgroupRemove().
 *
 * Returns 0 if the domain has no cgroup (not supported, so success is
 * claimed), otherwise the return value of virCgroupRemove().
 */
int
qemuRemoveCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr domain_priv = vm->privateData;

    return domain_priv->cgroup ? virCgroupRemove(domain_priv->cgroup) : 0;
}

1201 1202
/**
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Returns 0 unconditionally: both the "no cgroup" path and the normal
 * path report success without performing any further action here.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr domain_priv = vm->privateData;

    if (!domain_priv->cgroup) {
        /* Not supported, so claim success */
        return 0;
    }

    /* NOTE(review): nothing is attached on this path either; confirm
     * whether callers still need this function. */
    return 0;
}