/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2014 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37 38 39

#define VIR_FROM_THIS VIR_FROM_QEMU

40 41
VIR_LOG_INIT("qemu.qemu_cgroup");

42 43 44 45
/* NULL-terminated list of device nodes whitelisted for a domain when the
 * driver configuration does not supply its own device ACL. */
static const char *const defaultDeviceACL[] = {
    "/dev/null",
    "/dev/full",
    "/dev/zero",
    "/dev/random",
    "/dev/urandom",
    "/dev/ptmx",
    "/dev/kvm",
    "/dev/kqemu",
    "/dev/rtc",
    "/dev/hpet",
    "/dev/vfio/vfio",
    NULL,
};

/* Character-device major numbers whitelisted as a whole ("pty" / "sound") */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

52
static int
53
qemuSetupDiskPathAllow(virDomainDiskDefPtr disk,
54 55 56
                       const char *path,
                       size_t depth ATTRIBUTE_UNUSED,
                       void *opaque)
57
{
58 59
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
60
    int ret;
61 62

    VIR_DEBUG("Process path %s for disk", path);
63 64 65
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   (disk->readonly ? VIR_CGROUP_DEVICE_READ
                                    : VIR_CGROUP_DEVICE_RW));
66
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
67 68 69 70 71 72 73 74
                             disk->readonly ? "r" : "rw", ret == 0);

    /* Get this for root squash NFS */
    if (ret < 0 &&
        virLastErrorIsSystemErrno(EACCES)) {
        VIR_DEBUG("Ignoring EACCES for %s", path);
        virResetLastError();
        ret = 0;
75
    }
76
    return ret;
77 78 79
}


80 81 82
/*
 * Allow every path reported by virDomainDiskDefForeachPath() for @disk
 * in the devices cgroup of @vm.
 *
 * Does nothing and returns 0 when the devices controller is not
 * available; otherwise returns the result of the path iteration.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;

    if (virCgroupHasController(domPriv->cgroup,
                               VIR_CGROUP_CONTROLLER_DEVICES)) {
        return virDomainDiskDefForeachPath(disk, true,
                                           qemuSetupDiskPathAllow, vm);
    }

    return 0;
}


94 95 96 97 98
/*
 * Per-path callback: deny @path ("rwm") in the domain's devices cgroup.
 *
 * @disk:   unused
 * @path:   path to remove from the whitelist
 * @depth:  unused
 * @opaque: the virDomainObjPtr owning the cgroup
 *
 * The attempt is audited whether or not it succeeded.  A failure caused
 * by EACCES is deliberately ignored (seen with root squash NFS).
 *
 * Returns 0 on success or ignored failure, a negative value otherwise.
 */
static int
qemuTeardownDiskPathDeny(virDomainDiskDefPtr disk ATTRIBUTE_UNUSED,
                         const char *path,
                         size_t depth ATTRIBUTE_UNUSED,
                         void *opaque)
{
    virDomainObjPtr dom = opaque;
    qemuDomainObjPrivatePtr domPriv = dom->privateData;
    int rc;

    VIR_DEBUG("Process path %s for disk", path);

    rc = virCgroupDenyDevicePath(domPriv->cgroup, path,
                                 VIR_CGROUP_DEVICE_RWM);
    virDomainAuditCgroupPath(dom, domPriv->cgroup, "deny", path, "rwm",
                             rc == 0);

    if (rc >= 0)
        return rc;

    /* Get this for root squash NFS */
    if (!virLastErrorIsSystemErrno(EACCES))
        return rc;

    VIR_DEBUG("Ignoring EACCES for %s", path);
    virResetLastError();
    return 0;
}


120 121 122
/*
 * Deny every path reported by virDomainDiskDefForeachPath() for @disk
 * in the devices cgroup of @vm.
 *
 * Does nothing and returns 0 when the devices controller is not
 * available; otherwise returns the result of the path iteration.
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;

    if (virCgroupHasController(domPriv->cgroup,
                               VIR_CGROUP_CONTROLLER_DEVICES)) {
        return virDomainDiskDefForeachPath(disk, true,
                                           qemuTeardownDiskPathDeny, vm);
    }

    return 0;
}

136
static int
137
qemuSetupChrSourceCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
138
                         virDomainChrSourceDefPtr dev,
139
                         void *opaque)
140
{
141 142
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
143
    int ret;
144

145
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
146 147
        return 0;

148
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
149

150 151
    ret = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
                                   VIR_CGROUP_DEVICE_RW);
152
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
153
                             dev->data.file.path, "rw", ret == 0);
154

155
    return ret;
156 157
}

158 159 160 161 162
/*
 * Character device callback: forwards the source of @dev to
 * qemuSetupChrSourceCgroup() and returns its result.
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    virDomainChrSourceDefPtr chrSource = &dev->source;

    return qemuSetupChrSourceCgroup(def, chrSource, opaque);
}


/*
 * Whitelist the host device behind a TPM device.
 *
 * For a passthrough TPM the character source is handed to
 * qemuSetupChrSourceCgroup() and its result returned; any other
 * type needs no cgroup setup and yields 0.
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
        return qemuSetupChrSourceCgroup(def,
                                        &dev->data.passthrough.source,
                                        opaque);

    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

    return 0;
}

186

187 188 189 190
/*
 * USB device file callback: allow read/write access to @path in the
 * devices cgroup of the domain passed via @opaque, and audit the attempt.
 *
 * @dev is unused.  Returns the result of the cgroup call.
 */
static int
qemuSetupHostUsbDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
                             const char *path,
                             void *opaque)
{
    virDomainObjPtr dom = opaque;
    qemuDomainObjPrivatePtr domPriv = dom->privateData;
    int rc;

    VIR_DEBUG("Process path '%s' for USB device", path);

    rc = virCgroupAllowDevicePath(domPriv->cgroup, path,
                                  VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(dom, domPriv->cgroup, "allow", path, "rw",
                             rc == 0);

    return rc;
}

204 205 206 207 208 209 210
/*
 * SCSI device file callback: allow access to @path in the devices cgroup
 * of the domain passed via @opaque, and audit the attempt.
 *
 * @dev:    SCSI device; a read-only device is granted read access only,
 *          any other device read/write
 * @path:   device node to whitelist
 * @opaque: the virDomainObjPtr owning the cgroup
 *
 * Returns the result of the cgroup call.
 */
static int
qemuSetupHostScsiDeviceCgroup(virSCSIDevicePtr dev,
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    bool readonly;
    int ret;

    VIR_DEBUG("Process path '%s' for SCSI device", path);

    /* Queried once so that the permission granted and the permission
     * string written to the audit log cannot disagree. */
    readonly = virSCSIDeviceGetReadonly(dev);

    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   readonly ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
                             readonly ? "r" : "rw", ret == 0);

    return ret;
}
225

226 227 228 229 230 231 232
/*
 * qemuSetupHostdevCGroup:
 * @vm: domain object whose devices cgroup is updated
 * @dev: host device being assigned to the domain
 *
 * Whitelist the host device node(s) backing @dev.  This is called for
 * *all* hostdev devices, but only subsystem-mode devices are acted on:
 *
 *  - PCI using the VFIO backend: the IOMMU group device is allowed "rw"
 *  - USB, unless flagged as missing: every file reported by
 *    virUSBDeviceFileIterate() goes through qemuSetupHostUsbDeviceCgroup()
 *  - SCSI: every file reported by virSCSIDeviceFileIterate() goes through
 *    qemuSetupHostScsiDeviceCgroup()
 *
 * Anything else is left untouched.
 *
 * Returns 0 on success (also when the devices controller is unavailable
 * or nothing needed to be done), -1 on failure.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pci = NULL;
    virUSBDevicePtr usb = NULL;
    virSCSIDevicePtr scsi = NULL;
    char *path = NULL;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
            /* Only the VFIO backend needs a cgroup entry */
            if (dev->source.subsys.u.pci.backend
                == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
                int rv;

                pci = virPCIDeviceNew(dev->source.subsys.u.pci.addr.domain,
                                      dev->source.subsys.u.pci.addr.bus,
                                      dev->source.subsys.u.pci.addr.slot,
                                      dev->source.subsys.u.pci.addr.function);
                if (!pci)
                    goto cleanup;

                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
                    goto cleanup;

                VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
                rv = virCgroupAllowDevicePath(priv->cgroup, path,
                                              VIR_CGROUP_DEVICE_RW);
                virDomainAuditCgroupPath(vm, priv->cgroup,
                                         "allow", path, "rw", rv == 0);
                if (rv < 0)
                    goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* NB: hostdev->missing wasn't previously checked in the
             * case of hotplug, only when starting a domain. Now it is
             * always checked, and the cgroup setup skipped if true.
             */
            if (dev->missing)
                break;
            if ((usb = virUSBDeviceNew(dev->source.subsys.u.usb.bus,
                                       dev->source.subsys.u.usb.device,
                                       NULL)) == NULL) {
                goto cleanup;
            }

            /* oddly, qemuSetupHostUsbDeviceCgroup doesn't ever
             * reference the usb object we just created
             */
            if (virUSBDeviceFileIterate(usb, qemuSetupHostUsbDeviceCgroup,
                                        vm) < 0) {
                goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI:
            if ((scsi = virSCSIDeviceNew(NULL,
                                         dev->source.subsys.u.scsi.adapter,
                                         dev->source.subsys.u.scsi.bus,
                                         dev->source.subsys.u.scsi.target,
                                         dev->source.subsys.u.scsi.unit,
                                         dev->readonly,
                                         dev->shareable)) == NULL)
                goto cleanup;

            if (virSCSIDeviceFileIterate(scsi,
                                         qemuSetupHostScsiDeviceCgroup,
                                         vm) < 0)
                goto cleanup;
            /* explicit break: do not rely on falling into 'default' */
            break;

        default:
            break;
        }
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    virUSBDeviceFree(usb);
    virSCSIDeviceFree(scsi);
    VIR_FREE(path);
    return ret;
}

/*
 * qemuTeardownHostdevCgroup:
 * @vm: domain object whose devices cgroup is updated
 * @dev: host device being removed from the domain
 *
 * Called for all hostdev devices, but only a subsystem-mode PCI device
 * using the VFIO backend has anything to undo: its IOMMU group device
 * is denied ("rwm") and the attempt audited.
 *
 * Returns 0 on success (also when the devices controller is unavailable
 * or there was nothing to do), -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                          virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;
    virPCIDevicePtr pciDev = NULL;
    char *groupDev = NULL;
    int rc;
    int ret = -1;

    if (!virCgroupHasController(domPriv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    /* nothing to tear down for anything but VFIO-assigned PCI devices */
    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
        dev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI ||
        dev->source.subsys.u.pci.backend
        != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
        ret = 0;
        goto cleanup;
    }

    pciDev = virPCIDeviceNew(dev->source.subsys.u.pci.addr.domain,
                             dev->source.subsys.u.pci.addr.bus,
                             dev->source.subsys.u.pci.addr.slot,
                             dev->source.subsys.u.pci.addr.function);
    if (!pciDev)
        goto cleanup;

    groupDev = virPCIDeviceGetIOMMUGroupDev(pciDev);
    if (!groupDev)
        goto cleanup;

    VIR_DEBUG("Cgroup deny %s for PCI device assignment", groupDev);
    rc = virCgroupDenyDevicePath(domPriv->cgroup, groupDev,
                                 VIR_CGROUP_DEVICE_RWM);
    virDomainAuditCgroupPath(vm, domPriv->cgroup,
                             "deny", groupDev, "rwm", rc == 0);
    if (rc < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    virPCIDeviceFree(pciDev);
    VIR_FREE(groupDev);
    return ret;
}

383 384 385 386
/*
 * Apply the block I/O tuning from the domain definition to its cgroup.
 *
 * Without the blkio controller this fails only if the definition asks
 * for a weight or per-device tuning.  With the controller, the global
 * weight and each non-zero per-device setting (weight, read/write IOPS,
 * read/write bytes per second) are written.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;
    size_t idx;

    if (!virCgroupHasController(domPriv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (!vm->def->blkio.weight && !vm->def->blkio.ndevices)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Block I/O tuning is not available on this host"));
        return -1;
    }

    if (vm->def->blkio.weight != 0 &&
        virCgroupSetBlkioWeight(domPriv->cgroup, vm->def->blkio.weight) < 0)
        return -1;

    for (idx = 0; idx < vm->def->blkio.ndevices; idx++) {
        virBlkioDevicePtr tune = &vm->def->blkio.devices[idx];

        if (tune->weight &&
            virCgroupSetBlkioDeviceWeight(domPriv->cgroup, tune->path,
                                          tune->weight) < 0)
            return -1;

        if (tune->riops &&
            virCgroupSetBlkioDeviceReadIops(domPriv->cgroup, tune->path,
                                            tune->riops) < 0)
            return -1;

        if (tune->wiops &&
            virCgroupSetBlkioDeviceWriteIops(domPriv->cgroup, tune->path,
                                             tune->wiops) < 0)
            return -1;

        if (tune->rbps &&
            virCgroupSetBlkioDeviceReadBps(domPriv->cgroup, tune->path,
                                           tune->rbps) < 0)
            return -1;

        if (tune->wbps &&
            virCgroupSetBlkioDeviceWriteBps(domPriv->cgroup, tune->path,
                                            tune->wbps) < 0)
            return -1;
    }

    return 0;
}

437

438 439 440 441 442
/*
 * Apply the memory limits from the domain definition to its cgroup.
 *
 * Without the memory controller this fails only if the definition sets
 * a hard, soft or swap hard limit.  With the controller, each non-zero
 * limit is written in that order.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;

    if (!virCgroupHasController(domPriv->cgroup,
                                VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (vm->def->mem.hard_limit == 0 &&
            vm->def->mem.soft_limit == 0 &&
            vm->def->mem.swap_hard_limit == 0)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Memory cgroup is not available on this host"));
        return -1;
    }

    if (vm->def->mem.hard_limit != 0 &&
        virCgroupSetMemoryHardLimit(domPriv->cgroup,
                                    vm->def->mem.hard_limit) < 0)
        return -1;

    if (vm->def->mem.soft_limit != 0 &&
        virCgroupSetMemorySoftLimit(domPriv->cgroup,
                                    vm->def->mem.soft_limit) < 0)
        return -1;

    if (vm->def->mem.swap_hard_limit != 0 &&
        virCgroupSetMemSwapHardLimit(domPriv->cgroup,
                                     vm->def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


471 472 473 474 475 476 477
/*
 * Build the device whitelist of a domain's cgroup.
 *
 * All devices are denied first; then, in order, access is granted for:
 * the domain's disks, the pty major, the sound major (when host audio is
 * permitted for the graphics configuration), the configured or default
 * device ACL, character devices, the TPM, host devices and a
 * /dev/random-style RNG backend.
 *
 * If denying all devices fails with EPERM the whitelist is abandoned
 * with a warning and 0 is returned.  ENOENT failures for ACL entries
 * and for the RNG source are tolerated.
 *
 * Returns 0 on success (or when the devices controller is unavailable),
 * -1 on failure.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *aclEntry = NULL;
    bool hostAudio = false;
    int rc = -1;
    int ret = -1;
    size_t idx;

    if (!virCgroupHasController(domPriv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rc = virCgroupDenyAllDevices(domPriv->cgroup);
    virDomainAuditCgroup(vm, domPriv->cgroup, "deny", "all", rc == 0);
    if (rc < 0) {
        if (!virLastErrorIsSystemErrno(EPERM))
            goto cleanup;

        virResetLastError();
        VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
        return 0;
    }

    for (idx = 0; idx < vm->def->ndisks; idx++) {
        if (qemuSetupDiskCgroup(vm, vm->def->disks[idx]) < 0)
            goto cleanup;
    }

    rc = virCgroupAllowDeviceMajor(domPriv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, domPriv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rc == 0);
    if (rc < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    if (cfg->cgroupDeviceACL)
        aclEntry = (const char *const *)cfg->cgroupDeviceACL;
    else
        aclEntry = defaultDeviceACL;

    /* Host audio: allowed without graphics if so configured, always with
     * SDL, and with VNC only if so configured (first graphics device). */
    if (vm->def->nsounds) {
        if (!vm->def->ngraphics && cfg->nogfxAllowHostAudio) {
            hostAudio = true;
        } else if (vm->def->graphics) {
            if (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
                cfg->vncAllowHostAudio)
                hostAudio = true;
            else if (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL)
                hostAudio = true;
        }
    }

    if (hostAudio) {
        rc = virCgroupAllowDeviceMajor(domPriv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, domPriv->cgroup, "allow",
                                  DEVICE_SND_MAJOR, "sound", "rw", rc == 0);
        if (rc < 0)
            goto cleanup;
    }

    for (; *aclEntry != NULL; aclEntry++) {
        if (!virFileExists(*aclEntry)) {
            VIR_DEBUG("Ignoring non-existant device %s", *aclEntry);
            continue;
        }

        rc = virCgroupAllowDevicePath(domPriv->cgroup, *aclEntry,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, domPriv->cgroup, "allow", *aclEntry,
                                 "rw", rc == 0);
        if (rc < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(vm->def, true,
                               qemuSetupChardevCgroup, vm) < 0)
        goto cleanup;

    if (vm->def->tpm &&
        qemuSetupTPMCgroup(vm->def, vm->def->tpm, vm) < 0)
        goto cleanup;

    for (idx = 0; idx < vm->def->nhostdevs; idx++) {
        if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[idx]) < 0)
            goto cleanup;
    }

    if (vm->def->rng &&
        vm->def->rng->backend == VIR_DOMAIN_RNG_BACKEND_RANDOM) {
        VIR_DEBUG("Setting Cgroup ACL for RNG device");
        rc = virCgroupAllowDevicePath(domPriv->cgroup,
                                      vm->def->rng->source.file,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, domPriv->cgroup, "allow",
                                 vm->def->rng->source.file, "rw", rc == 0);
        if (rc < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    ret = 0;

 cleanup:
    virObjectUnref(cfg);
    return ret;
}


578 579
static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
580 581
                      virBitmapPtr nodemask,
                      virCapsPtr caps)
582 583
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
584 585
    char *mem_mask = NULL;
    char *cpu_mask = NULL;
586 587 588 589 590 591 592 593 594 595 596 597
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    if ((vm->def->numatune.memory.nodemask ||
         (vm->def->numatune.memory.placement_mode ==
          VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) &&
        vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {

        if (vm->def->numatune.memory.placement_mode ==
            VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)
598
            mem_mask = virBitmapFormat(nodemask);
599
        else
600
            mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
601

602
        if (!mem_mask) {
603 604 605 606 607
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("failed to convert memory nodemask"));
            goto cleanup;
        }

608
        if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
609 610 611
            goto cleanup;
    }

612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630
    if (vm->def->cpumask ||
        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {

        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
            virBitmapPtr cpumap;
            if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask)))
                goto cleanup;
            cpu_mask = virBitmapFormat(cpumap);
            virBitmapFree(cpumap);
        } else {
            cpu_mask = virBitmapFormat(vm->def->cpumask);
        }

        if (!cpu_mask) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("failed to convert cpu mask"));
            goto cleanup;
        }

631
        if (virCgroupSetCpusetCpus(priv->cgroup, cpu_mask) < 0)
632 633 634
            goto cleanup;
    }

635
    ret = 0;
636
 cleanup:
637 638
    VIR_FREE(mem_mask);
    VIR_FREE(cpu_mask);
639 640 641 642
    return ret;
}


643 644 645 646 647 648 649 650 651 652 653 654 655 656 657
/*
 * Apply the CPU shares from the domain definition to its cgroup.
 *
 * Without the cpu controller this fails only if shares were requested.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupCpuCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;

    if (!virCgroupHasController(domPriv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        if (!vm->def->cputune.shares)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("CPU tuning is not available on this host"));
        return -1;
    }

    if (!vm->def->cputune.shares)
        return 0;

    if (virCgroupSetCpuShares(domPriv->cgroup, vm->def->cputune.shares) < 0)
        return -1;

    return 0;
}


666
static int
667
qemuInitCgroup(virQEMUDriverPtr driver,
668
               virDomainObjPtr vm)
669
{
670
    int ret = -1;
671 672 673
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

674 675 676
    if (!cfg->privileged)
        goto done;

677 678 679
    if (!virCgroupAvailable())
        goto done;

680 681
    virCgroupFree(&priv->cgroup);

682
    if (!vm->def->resource) {
683 684
        virDomainResourceDefPtr res;

685
        if (VIR_ALLOC(res) < 0)
686
            goto cleanup;
687

688
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
689 690 691 692 693
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
694 695
    }

696 697 698 699 700 701
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
702 703 704 705 706 707 708 709 710 711 712

    if (virCgroupNewMachine(vm->def->name,
                            "qemu",
                            cfg->privileged,
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
713 714
        if (virCgroupNewIgnoreError())
            goto done;
715

716 717
        goto cleanup;
    }
718

719
 done:
720
    ret = 0;
721
 cleanup:
722 723 724
    virObjectUnref(cfg);
    return ret;
}
725

726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742

/*
 * Detect the existing machine cgroup of a domain and store it in the
 * domain's private data, replacing any previous one.
 *
 * Nothing is done (and 0 returned) for an unprivileged driver or when
 * cgroups are unavailable.  The resource partition from the definition
 * is passed to the detection when present, NULL otherwise.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr domPriv = vm->privateData;
    int ret = -1;

    if (cfg->privileged && virCgroupAvailable()) {
        virCgroupFree(&domPriv->cgroup);

        if (virCgroupNewDetectMachine(vm->def->name,
                                      "qemu",
                                      vm->pid,
                                      vm->def->resource ?
                                      vm->def->resource->partition :
                                      NULL,
                                      cfg->cgroupControllers,
                                      &domPriv->cgroup) < 0)
            goto cleanup;
    }

    ret = 0;

 cleanup:
    virObjectUnref(cfg);
    return ret;
}

760 761 762 763
/*
 * Create the cgroup for a running domain and apply all controller
 * settings: devices, blkio, memory, cpu and cpuset, in that order.
 *
 * @driver:   QEMU driver
 * @vm:       domain object; its process must already have a pid
 * @nodemask: node set handed to the cpuset setup
 *
 * Returns 0 on success, or when no cgroup ended up being created by
 * qemuInitCgroup(); -1 on failure.
 */
int
qemuSetupCgroup(virQEMUDriverPtr driver,
                virDomainObjPtr vm,
                virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;
    virCapsPtr caps = NULL;
    int ret = -1;

    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

    if (qemuInitCgroup(driver, vm) < 0)
        return -1;

    if (!domPriv->cgroup)
        return 0;

    caps = virQEMUDriverGetCapabilities(driver, false);
    if (!caps)
        goto cleanup;

    /* short-circuit evaluation stops at the first controller that fails */
    if (qemuSetupDevicesCgroup(driver, vm) < 0 ||
        qemuSetupBlkioCgroup(vm) < 0 ||
        qemuSetupMemoryCgroup(vm) < 0 ||
        qemuSetupCpuCgroup(vm) < 0 ||
        qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    virObjectUnref(caps);
    return ret;
}

805 806 807 808
/*
 * Set the CFS bandwidth parameters of @cgroup.
 *
 * @period: new CFS period, 0 to leave it untouched
 * @quota:  new CFS quota, 0 to leave it untouched
 *
 * When a period was applied and setting the quota then fails, the
 * previous period is restored (best effort) and the error of the quota
 * failure is kept as the reported one.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    unsigned long long prevPeriod = 0;

    if (!period && !quota)
        return 0;

    if (period) {
        /* remember the old period so it can be put back if the quota fails */
        if (virCgroupGetCpuCfsPeriod(cgroup, &prevPeriod) < 0 ||
            virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (!quota || virCgroupSetCpuCfsQuota(cgroup, quota) >= 0)
        return 0;

    /* quota could not be set: roll the period back */
    if (period) {
        virErrorPtr quotaErr = virSaveLastError();

        ignore_value(virCgroupSetCpuCfsPeriod(cgroup, prevPeriod));
        if (quotaErr) {
            virSetError(quotaErr);
            virFreeError(quotaErr);
        }
    }

    return -1;
}

843 844 845 846 847
/*
 * Apply the pinning of one vcpu to @cgroup.
 *
 * @cgroup:   cgroup to configure
 * @vcpupin:  array of vcpu pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid:   vcpu whose pinning should be applied
 *
 * The first entry whose vcpuid matches @vcpuid supplies the cpumask,
 * which is applied through qemuSetupCgroupEmulatorPin().
 *
 * Returns the result of applying the mask, or -1 when no entry matches
 * (including an empty, negative-sized or NULL array).
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    size_t i;

    /* A negative count would turn into a huge unsigned loop bound in the
     * comparison below and walk off the end of the array. */
    if (!vcpupin || nvcpupin <= 0)
        return -1;

    for (i = 0; i < (size_t)nvcpupin; i++) {
        if (vcpuid == vcpupin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, vcpupin[i]->cpumask);
    }

    return -1;
}

860 861 862
/*
 * Restrict @cgroup to the CPUs in @cpumask by formatting the bitmap
 * and writing it as the cgroup's cpuset CPUs.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    char *cpuList = virBitmapFormat(cpumask);
    int ret = -1;

    if (!cpuList) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("failed to convert cpu mask"));
        goto cleanup;
    }

    if (virCgroupSetCpusetCpus(cgroup, cpuList) >= 0)
        ret = 0;

 cleanup:
    VIR_FREE(cpuList);
    return ret;
}

883 884
/*
 * Set up one cgroup per vcpu thread of a domain.
 *
 * For every known vcpu thread a vcpu cgroup is obtained through
 * virCgroupNewVcpu(), the thread is added to it, the CFS period/quota
 * from the definition are applied when set, and - if the cpuset
 * controller is present - the matching vcpupin entry is applied.
 *
 * Requesting a period or quota without the cpu controller is an error.
 * A missing domain cgroup, or an unknown vcpu<->thread mapping, is not:
 * 0 is returned without doing anything.
 *
 * Returns 0 on success, -1 on failure (the vcpu cgroup being worked on
 * is removed again).
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    virCgroupPtr vcpuGroup = NULL;
    qemuDomainObjPrivatePtr domPriv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = vm->def->cputune.period;
    long long quota = vm->def->cputune.quota;
    size_t vcpu;
    size_t pin;

    if ((period || quota) &&
        !virCgroupHasController(domPriv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /* We are trying to setup cgroups for CPU pinning, which can also be done
     * with virProcessInfoSetAffinity, thus the lack of cgroups is not fatal
     * here.
     */
    if (!domPriv->cgroup)
        return 0;

    /* If we don't know VCPU<->PID mapping or all vcpu runs in the same
     * thread, we cannot control each vcpu.
     */
    if (domPriv->nvcpupids == 0 || domPriv->vcpupids[0] == vm->pid) {
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < domPriv->nvcpupids; vcpu++) {
        if (virCgroupNewVcpu(domPriv->cgroup, vcpu, true, &vcpuGroup) < 0)
            goto cleanup;

        /* move the thread for vcpu to sub dir */
        if (virCgroupAddTask(vcpuGroup, domPriv->vcpupids[vcpu]) < 0)
            goto cleanup;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(vcpuGroup, period, quota) < 0)
            goto cleanup;

        /* Set vcpupin in cgroup if vcpupin xml is provided */
        if (virCgroupHasController(domPriv->cgroup,
                                   VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* find the right CPU to pin, otherwise
             * qemuSetupCgroupVcpuPin will fail. */
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(vcpuGroup,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto cleanup;

                    break;
                }
            }
        }

        virCgroupFree(&vcpuGroup);
    }

    return 0;

 cleanup:
    if (vcpuGroup) {
        virCgroupRemove(vcpuGroup);
        virCgroupFree(&vcpuGroup);
    }

    return -1;
}

960 961 962 963
/*
 * Set up the emulator cgroup of a domain.
 *
 * An emulator cgroup is obtained through virCgroupNewEmulator() and the
 * tasks of the domain cgroup are moved into it.  The CPU mask applied is,
 * in order of preference: the CPUs derived from @nodemask for automatic
 * placement, the emulatorpin mask, or the domain-wide cpumask.  The
 * emulator period/quota from the definition are applied when set.
 *
 * Requesting a period or quota without the cpu controller is an error;
 * a missing domain cgroup is not (0 is returned).
 *
 * Returns 0 on success, -1 on failure (the emulator cgroup is removed
 * again).
 */
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
                           virDomainObjPtr vm,
                           virBitmapPtr nodemask)
{
    virBitmapPtr pinMask = NULL;
    virBitmapPtr autoMap = NULL;
    virCgroupPtr emuGroup = NULL;
    virDomainDefPtr def = vm->def;
    qemuDomainObjPrivatePtr domPriv = vm->privateData;
    unsigned long long period = vm->def->cputune.emulator_period;
    long long quota = vm->def->cputune.emulator_quota;

    if ((period || quota) &&
        !virCgroupHasController(domPriv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    if (!domPriv->cgroup)
        return 0; /* Not supported, so claim success */

    if (virCgroupNewEmulator(domPriv->cgroup, true, &emuGroup) < 0)
        goto cleanup;

    if (virCgroupMoveTask(domPriv->cgroup, emuGroup) < 0)
        goto cleanup;

    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        autoMap = qemuPrepareCpumap(driver, nodemask);
        if (!autoMap)
            goto cleanup;
        pinMask = autoMap;
    } else if (def->cputune.emulatorpin) {
        pinMask = def->cputune.emulatorpin->cpumask;
    } else if (def->cpumask) {
        pinMask = def->cpumask;
    }

    if (pinMask &&
        virCgroupHasController(domPriv->cgroup,
                               VIR_CGROUP_CONTROLLER_CPUSET) &&
        qemuSetupCgroupEmulatorPin(emuGroup, pinMask) < 0)
        goto cleanup;

    if ((period || quota) &&
        virCgroupHasController(domPriv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        qemuSetupCgroupVcpuBW(emuGroup, period, quota) < 0)
        goto cleanup;

    virCgroupFree(&emuGroup);
    virBitmapFree(autoMap);
    return 0;

 cleanup:
    virBitmapFree(autoMap);

    if (emuGroup) {
        virCgroupRemove(emuGroup);
        virCgroupFree(&emuGroup);
    }

    return -1;
}
1026

1027 1028
/*
 * Remove the cgroup of a domain.
 *
 * Returns 0 when the domain has no cgroup, otherwise the result of
 * virCgroupRemove().
 */
int
qemuRemoveCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;

    if (domPriv->cgroup != NULL)
        return virCgroupRemove(domPriv->cgroup);

    /* Not supported, so claim success */
    return 0;
}

1038 1039
/*
 * Currently performs no cgroup operation: 0 is returned both when the
 * domain has no cgroup and when it has one.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr domPriv = vm->privateData;

    if (!domPriv->cgroup) {
        /* Not supported, so claim success */
        return 0;
    }

    /* nothing to do for an existing cgroup either */
    return 0;
}