/*
 * vircgroup.c: methods for managing control cgroups
 *
 * Copyright (C) 2010-2013 Red Hat, Inc.
 * Copyright IBM Corp. 2008
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Authors:
 *  Dan Smith <danms@us.ibm.com>
 */
#include <config.h>

#include <stdio.h>
27
#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
28
# include <mntent.h>
D
Daniel P. Berrange 已提交
29
#endif
30 31 32
#if defined HAVE_SYS_MOUNT_H
# include <sys/mount.h>
#endif
33 34 35 36 37 38
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
39
#include <signal.h>
40
#include <dirent.h>
41

42 43 44
#define __VIR_CGROUP_ALLOW_INCLUDE_PRIV_H__
#include "vircgrouppriv.h"

45
#include "virutil.h"
46
#include "viralloc.h"
47
#include "virerror.h"
48
#include "virlog.h"
E
Eric Blake 已提交
49
#include "virfile.h"
50
#include "virhash.h"
51
#include "virhashcode.h"
52
#include "virstring.h"
53
#include "virsystemd.h"
54 55 56

#define CGROUP_MAX_VAL 512

57 58
#define VIR_FROM_THIS VIR_FROM_CGROUP

59
VIR_ENUM_IMPL(virCgroupController, VIR_CGROUP_CONTROLLER_LAST,
R
Ryota Ozaki 已提交
60
              "cpu", "cpuacct", "cpuset", "memory", "devices",
61 62
              "freezer", "blkio", "net_cls", "perf_event",
              "name=systemd");
63

64 65 66 67 68 69 70 71
/* Flags consumed by virCgroupMakeGroup(). */
typedef enum {
    /* create subdir under each cgroup if possible. */
    VIR_CGROUP_NONE = 0,

    /* call virCgroupSetMemoryUseHierarchy before creating
     * subcgroups and attaching tasks */
    VIR_CGROUP_MEM_HIERACHY = (1 << 0),
} virCgroupFlags;

72 73
/*
 * virCgroupAvailable:
 *
 * Returns true when /proc/cgroups exists and /proc/mounts lists at
 * least one filesystem of type "cgroup". Always returns false when
 * built without getmntent_r().
 */
bool virCgroupAvailable(void)
{
    bool found = false;
#ifdef HAVE_GETMNTENT_R
    char scratch[CGROUP_MAX_VAL];
    struct mntent mnt;
    FILE *fp = NULL;

    if (!virFileExists("/proc/cgroups"))
        return found;

    fp = fopen("/proc/mounts", "r");
    if (fp == NULL)
        return found;

    /* Stop scanning as soon as one cgroup mount has been seen */
    while (!found &&
           getmntent_r(fp, &mnt, scratch, sizeof(scratch)) != NULL)
        found = STREQ(mnt.mnt_type, "cgroup");

    VIR_FORCE_FCLOSE(fp);
#endif
    return found;
}

98
#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
E
Eric Blake 已提交
99 100 101

static int virCgroupPartitionEscape(char **path);

102
static bool
103 104 105
virCgroupValidateMachineGroup(virCgroupPtr group,
                              const char *name,
                              const char *drivername,
106
                              const char *partition,
107
                              bool stripEmulatorSuffix)
108 109 110 111
{
    size_t i;
    bool valid = false;
    char *partname;
112
    char *scopename;
113 114 115 116 117 118 119 120

    if (virAsprintf(&partname, "%s.libvirt-%s",
                    name, drivername) < 0)
        goto cleanup;

    if (virCgroupPartitionEscape(&partname) < 0)
        goto cleanup;

121 122 123 124 125 126 127 128 129
    if (!partition)
        partition = "/machine";

    if (!(scopename = virSystemdMakeScopeName(name, drivername, partition)))
        goto cleanup;

    if (virCgroupPartitionEscape(&scopename) < 0)
        goto cleanup;

130 131 132
    for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
        char *tmp;

133 134 135
        if (i == VIR_CGROUP_CONTROLLER_SYSTEMD)
            continue;

136 137 138 139 140 141
        if (!group->controllers[i].placement)
            continue;

        tmp = strrchr(group->controllers[i].placement, '/');
        if (!tmp)
            goto cleanup;
142 143 144 145 146 147 148 149 150 151 152 153

        if (stripEmulatorSuffix &&
            (i == VIR_CGROUP_CONTROLLER_CPU ||
             i == VIR_CGROUP_CONTROLLER_CPUACCT ||
             i == VIR_CGROUP_CONTROLLER_CPUSET)) {
            if (STREQ(tmp, "/emulator"))
                *tmp = '\0';
            tmp = strrchr(group->controllers[i].placement, '/');
            if (!tmp)
                goto cleanup;
        }

154 155 156
        tmp++;

        if (STRNEQ(tmp, name) &&
157 158 159 160
            STRNEQ(tmp, partname) &&
            STRNEQ(tmp, scopename)) {
            VIR_DEBUG("Name '%s' for controller '%s' does not match '%s', '%s' or '%s'",
                      tmp, virCgroupControllerTypeToString(i), name, partname, scopename);
161
            goto cleanup;
162
        }
163 164 165 166 167 168
    }

    valid = true;

 cleanup:
    VIR_FREE(partname);
169
    VIR_FREE(scopename);
170 171
    return valid;
}
172 173 174 175 176 177 178 179 180 181
#else
/* Fallback used when mount-table parsing is unavailable: no validation
 * is possible, so every group is accepted. The parameter list must
 * match the real implementation so callers compile on both branches. */
static bool
virCgroupValidateMachineGroup(virCgroupPtr group ATTRIBUTE_UNUSED,
                              const char *name ATTRIBUTE_UNUSED,
                              const char *drivername ATTRIBUTE_UNUSED,
                              const char *partition ATTRIBUTE_UNUSED,
                              bool stripEmulatorSuffix ATTRIBUTE_UNUSED)
{
    return true;
}
#endif
182

183 184 185 186 187 188 189
/**
 * virCgroupFree:
 *
 * @group: The group structure to free
 *
 * Releases the mount point, link point and placement strings of every
 * controller, then the group path and the structure itself. A pointer
 * to a NULL group is accepted and ignored.
 */
void virCgroupFree(virCgroupPtr *group)
{
    virCgroupPtr grp = *group;
    size_t idx;

    if (!grp)
        return;

    for (idx = 0; idx < VIR_CGROUP_CONTROLLER_LAST; idx++) {
        VIR_FREE(grp->controllers[idx].mountPoint);
        VIR_FREE(grp->controllers[idx].linkPoint);
        VIR_FREE(grp->controllers[idx].placement);
    }

    VIR_FREE(grp->path);
    VIR_FREE(*group);
}

L
Lai Jiangshan 已提交
205
/**
206
 * virCgroupHasController: query whether a cgroup controller is present
L
Lai Jiangshan 已提交
207
 *
208
 * @cgroup: The group structure to be queried, or NULL
L
Lai Jiangshan 已提交
209 210
 * @controller: cgroup subsystem id
 *
211 212
 * Returns true if a cgroup controller is mounted and is associated
 * with this cgroup object.
L
Lai Jiangshan 已提交
213
 */
214
bool virCgroupHasController(virCgroupPtr cgroup, int controller)
L
Lai Jiangshan 已提交
215
{
216 217 218 219
    if (!cgroup)
        return false;
    if (controller < 0 || controller >= VIR_CGROUP_CONTROLLER_LAST)
        return false;
L
Lai Jiangshan 已提交
220 221 222
    return cgroup->controllers[controller].mountPoint != NULL;
}

223
#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
224 225 226
/*
 * Duplicate the mount point and link point of every controller that
 * is mounted in @parent into @group.
 *
 * Returns 0 on success, -1 if a string copy fails.
 */
static int virCgroupCopyMounts(virCgroupPtr group,
                               virCgroupPtr parent)
{
    size_t n;

    for (n = 0; n < VIR_CGROUP_CONTROLLER_LAST; n++) {
        if (parent->controllers[n].mountPoint == NULL)
            continue;

        if (VIR_STRDUP(group->controllers[n].mountPoint,
                       parent->controllers[n].mountPoint) < 0 ||
            VIR_STRDUP(group->controllers[n].linkPoint,
                       parent->controllers[n].linkPoint) < 0)
            return -1;
    }

    return 0;
}

243 244 245 246 247
/*
 * Process /proc/mounts figuring out what controllers are
 * mounted and where.
 *
 * For every "cgroup" mount entry the mount options are matched against
 * the known controller names. The first mount directory seen for a
 * controller is stored as its mountPoint. When the directory name
 * contains a ',' (a co-mount such as "cpu,cpuacct") the sibling path
 * named after the controller is checked and, if it is a symlink,
 * stored as the controller's linkPoint.
 *
 * Returns 0 on success, -1 on error.
 */
static int virCgroupDetectMounts(virCgroupPtr group)
{
    size_t i;
    FILE *mounts = NULL;
    struct mntent entry;
    char buf[CGROUP_MAX_VAL];
    /* Candidate symlink path; owned here until handed to @group */
    char *linksrc = NULL;

    mounts = fopen("/proc/mounts", "r");
    if (mounts == NULL) {
        virReportSystemError(errno, "%s",
                             _("Unable to open /proc/mounts"));
        return -1;
    }

    while (getmntent_r(mounts, &entry, buf, sizeof(buf)) != NULL) {
        if (STRNEQ(entry.mnt_type, "cgroup"))
            continue;

        for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
            const char *typestr = virCgroupControllerTypeToString(i);
            int typelen = strlen(typestr);
            char *tmp = entry.mnt_opts;
            while (tmp) {
                char *next = strchr(tmp, ',');
                int len;
                if (next) {
                    len = next - tmp;
                    next++;
                } else {
                    len = strlen(tmp);
                }
                /* NB, the same controller can appear >1 time in mount list
                 * due to bind mounts from one location to another. Pick the
                 * first entry only
                 */
                if (typelen == len && STREQLEN(typestr, tmp, len) &&
                    !group->controllers[i].mountPoint) {
                    struct stat sb;
                    char *tmp2;

                    if (VIR_STRDUP(group->controllers[i].mountPoint,
                                   entry.mnt_dir) < 0)
                        goto error;

                    tmp2 = strrchr(entry.mnt_dir, '/');
                    if (!tmp2) {
                        virReportError(VIR_ERR_INTERNAL_ERROR,
                                       _("Missing '/' separator in cgroup mount '%s'"),
                                       entry.mnt_dir);
                        goto error;
                    }

                    /* If it is a co-mount it has a filename like "cpu,cpuacct"
                     * and we must identify the symlink path */
                    if (strchr(tmp2 + 1, ',')) {
                        /* Cut mnt_dir at its last '/' only while the
                         * sibling path is built. Other controllers
                         * listed in this same entry still need the
                         * full directory for their mountPoint. */
                        *tmp2 = '\0';
                        if (virAsprintf(&linksrc, "%s/%s",
                                        entry.mnt_dir, typestr) < 0)
                            goto error;
                        *tmp2 = '/';

                        if (lstat(linksrc, &sb) < 0) {
                            if (errno == ENOENT) {
                                VIR_WARN("Controller %s co-mounted at %s is missing symlink at %s",
                                         typestr, entry.mnt_dir, linksrc);
                                VIR_FREE(linksrc);
                            } else {
                                virReportSystemError(errno,
                                                     _("Cannot stat %s"), linksrc);
                                goto error;
                            }
                        } else {
                            if (!S_ISLNK(sb.st_mode)) {
                                VIR_WARN("Expecting a symlink at %s for controller %s",
                                         linksrc, typestr);
                                VIR_FREE(linksrc);
                            } else {
                                /* Ownership moves to the group */
                                group->controllers[i].linkPoint = linksrc;
                                linksrc = NULL;
                            }
                        }
                    }
                }
                tmp = next;
            }
        }
    }

    VIR_FORCE_FCLOSE(mounts);

    return 0;

error:
    VIR_FREE(linksrc);
    VIR_FORCE_FCLOSE(mounts);
    return -1;
}

342

343 344 345 346
/*
 * Fill in the placement of every mounted controller of @group, except
 * the systemd one. An absolute @path (leading '/') is used as is;
 * otherwise @path is appended to the placement recorded in @parent.
 *
 * Returns 0 on success, -1 on allocation failure.
 */
static int virCgroupCopyPlacement(virCgroupPtr group,
                                  const char *path,
                                  virCgroupPtr parent)
{
    size_t n;

    for (n = 0; n < VIR_CGROUP_CONTROLLER_LAST; n++) {
        const char *base;
        const char *sep;

        if (n == VIR_CGROUP_CONTROLLER_SYSTEMD ||
            !group->controllers[n].mountPoint)
            continue;

        if (path[0] == '/') {
            if (VIR_STRDUP(group->controllers[n].placement, path) < 0)
                return -1;
            continue;
        }

        /*
         * parent=="/" + path="" => "/"
         * parent=="/libvirt.service" + path=="" => "/libvirt.service"
         * parent=="/libvirt.service" + path=="foo" => "/libvirt.service/foo"
         */
        base = parent->controllers[n].placement;
        sep = (STREQ(base, "/") || STREQ(path, "")) ? "" : "/";

        if (virAsprintf(&group->controllers[n].placement,
                        "%s%s%s", base, sep, path) < 0)
            return -1;
    }

    return 0;
}


378
/*
379 380 381 382
 * virCgroupDetectPlacement:
 * @group: the group to process
 * @path: the relative path to append, not starting with '/'
 *
383 384
 * Process /proc/self/cgroup figuring out what cgroup
 * sub-path the current process is assigned to. ie not
385 386 387 388 389 390 391 392 393 394 395 396 397 398
 * necessarily in the root. The contents of this file
 * looks like
 *
 * 9:perf_event:/
 * 8:blkio:/
 * 7:net_cls:/
 * 6:freezer:/
 * 5:devices:/
 * 4:memory:/
 * 3:cpuacct,cpu:/
 * 2:cpuset:/
 * 1:name=systemd:/user/berrange/2
 *
 * It then appends @path to each detected path.
399
 */
400
static int virCgroupDetectPlacement(virCgroupPtr group,
401
                                    pid_t pid,
402
                                    const char *path)
403
{
404
    size_t i;
405 406
    FILE *mapping  = NULL;
    char line[1024];
407
    int ret = -1;
408
    char *procfile;
409

410 411
    VIR_DEBUG("Detecting placement for pid %lld path %s",
              (unsigned long long)pid, path);
412 413 414 415 416 417 418 419 420 421
    if (pid == -1) {
        if (VIR_STRDUP(procfile, "/proc/self/cgroup") < 0)
            goto cleanup;
    } else {
        if (virAsprintf(&procfile, "/proc/%llu/cgroup",
                        (unsigned long long)pid) < 0)
            goto cleanup;
    }

    mapping = fopen(procfile, "r");
422
    if (mapping == NULL) {
423 424 425 426
        virReportSystemError(errno,
                             _("Unable to open '%s'"),
                             procfile);
        goto cleanup;
427 428
    }

429 430
    while (fgets(line, sizeof(line), mapping) != NULL) {
        char *controllers = strchr(line, ':');
431 432
        char *selfpath = controllers ? strchr(controllers + 1, ':') : NULL;
        char *nl = selfpath ? strchr(selfpath, '\n') : NULL;
433

434
        if (!controllers || !selfpath)
435 436 437 438 439
            continue;

        if (nl)
            *nl = '\0';

440
        *selfpath = '\0';
441
        controllers++;
442
        selfpath++;
443

444
        for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
445 446 447
            const char *typestr = virCgroupControllerTypeToString(i);
            int typelen = strlen(typestr);
            char *tmp = controllers;
448

449 450 451 452
            while (tmp) {
                char *next = strchr(tmp, ',');
                int len;
                if (next) {
453
                    len = next - tmp;
454 455 456 457
                    next++;
                } else {
                    len = strlen(tmp);
                }
458 459 460 461 462 463

                /*
                 * selfpath=="/" + path="" -> "/"
                 * selfpath=="/libvirt.service" + path="" -> "/libvirt.service"
                 * selfpath=="/libvirt.service" + path="foo" -> "/libvirt.service/foo"
                 */
464
                if (typelen == len && STREQLEN(typestr, tmp, len) &&
465 466 467 468 469 470 471 472 473 474 475 476 477 478
                    group->controllers[i].mountPoint != NULL &&
                    group->controllers[i].placement == NULL) {
                    if (i == VIR_CGROUP_CONTROLLER_SYSTEMD) {
                        if (VIR_STRDUP(group->controllers[i].placement,
                                       selfpath) < 0)
                            goto cleanup;
                    } else {
                        if (virAsprintf(&group->controllers[i].placement,
                                        "%s%s%s", selfpath,
                                        (STREQ(selfpath, "/") ||
                                         STREQ(path, "") ? "" : "/"),
                                        path) < 0)
                            goto cleanup;
                    }
479
                }
480 481 482 483 484 485

                tmp = next;
            }
        }
    }

486
    ret = 0;
487

488
cleanup:
489
    VIR_FREE(procfile);
490
    VIR_FORCE_FCLOSE(mapping);
491

492
    return ret;
493 494
}

495
/*
 * virCgroupDetect:
 * @group: group to populate
 * @pid: process whose placement is detected (passed through to
 *       virCgroupDetectPlacement)
 * @controllers: bitmask of wanted controllers, or a negative value to
 *               use every mounted controller
 * @path: path of the group; absolute if it starts with '/'
 * @parent: parent group to inherit mounts/placement from, or NULL
 *
 * Determines the controller mount points (copied from @parent or read
 * from the mount table), filters them against @controllers, and then
 * works out the placement of each remaining controller.
 *
 * Returns 0 on success, -1 on error.
 */
static int virCgroupDetect(virCgroupPtr group,
                           pid_t pid,
                           int controllers,
                           const char *path,
                           virCgroupPtr parent)
{
    size_t i;
    size_t j;
    VIR_DEBUG("group=%p controllers=%d path=%s parent=%p",
              group, controllers, path, parent);

    if (parent) {
        if (virCgroupCopyMounts(group, parent) < 0)
            return -1;
    } else {
        if (virCgroupDetectMounts(group) < 0)
            return -1;
    }

    if (controllers >= 0) {
        VIR_DEBUG("Filtering controllers %d", controllers);
        for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
            VIR_DEBUG("Controller '%s' wanted=%s, mount='%s'",
                      virCgroupControllerTypeToString(i),
                      (1 << i) & controllers ? "yes" : "no",
                      NULLSTR(group->controllers[i].mountPoint));
            if (((1 << i) & controllers)) {
                /* Remove non-existent controllers  */
                if (!group->controllers[i].mountPoint) {
                    VIR_DEBUG("Requested controller '%s' not mounted, ignoring",
                              virCgroupControllerTypeToString(i));
                    controllers &= ~(1 << i);
                }
            } else {
                /* Check whether a request to disable a controller
                 * clashes with co-mounting of controllers */
                for (j = 0; j < VIR_CGROUP_CONTROLLER_LAST; j++) {
                    if (j == i)
                        continue;
                    if (!((1 << j) & controllers))
                        continue;

                    if (STREQ_NULLABLE(group->controllers[i].mountPoint,
                                       group->controllers[j].mountPoint)) {
                        virReportSystemError(EINVAL,
                                             _("Controller '%s' is not wanted, but '%s' is co-mounted"),
                                             virCgroupControllerTypeToString(i),
                                             virCgroupControllerTypeToString(j));
                        return -1;
                    }
                }
                /* Unwanted and not co-mounted: forget its mount */
                VIR_FREE(group->controllers[i].mountPoint);
            }
        }
    } else {
        VIR_DEBUG("Auto-detecting controllers");
        controllers = 0;
        for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
            VIR_DEBUG("Controller '%s' present=%s",
                      virCgroupControllerTypeToString(i),
                      group->controllers[i].mountPoint ? "yes" : "no");
            if (group->controllers[i].mountPoint == NULL)
                continue;
            controllers |= (1 << i);
        }
    }

    /* Check that at least 1 controller is available */
    if (!controllers) {
        virReportSystemError(ENXIO, "%s",
                             _("At least one cgroup controller is required"));
        return -1;
    }

    /* In some cases we can copy part of the placement info
     * based on the parent cgroup...
     */
    if ((parent || path[0] == '/') &&
        virCgroupCopyPlacement(group, path, parent) < 0)
        return -1;

    /* ... but use the process' /proc/<pid>/cgroup file to fill in
     * the rest */
    if (virCgroupDetectPlacement(group, pid, path) < 0)
        return -1;

    /* Check that for every mounted controller, we found our placement */
    for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
        if (!group->controllers[i].mountPoint)
            continue;

        if (!group->controllers[i].placement) {
            /* placement is NULL here, so report where the controller
             * is mounted instead */
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Could not find placement for controller %s at %s"),
                           virCgroupControllerTypeToString(i),
                           group->controllers[i].mountPoint);
            return -1;
        }

        VIR_DEBUG("Detected mount/mapping %zu:%s at %s in %s for pid %llu", i,
                  virCgroupControllerTypeToString(i),
                  group->controllers[i].mountPoint,
                  group->controllers[i].placement,
                  (unsigned long long)pid);
    }

    return 0;
}
D
Daniel P. Berrange 已提交
602
#endif
603

604

605 606 607 608
/*
 * virCgroupPathOfController:
 * @group: group to build the path for
 * @controller: controller id, or -1 to pick the first mounted
 *              controller whose placement is not "/"
 * @key: file name below the group directory, or NULL for none
 * @path: filled with the newly allocated
 *        "<mountPoint><placement>/<key>" string
 *
 * Returns 0 on success, -1 (with an error reported) if no usable
 * controller exists, the controller is not mounted or has no
 * placement, or the path cannot be allocated.
 */
int virCgroupPathOfController(virCgroupPtr group,
                              int controller,
                              const char *key,
                              char **path)
{
    const char *leaf = key ? key : "";

    if (controller == -1) {
        size_t n;

        for (n = 0; n < VIR_CGROUP_CONTROLLER_LAST; n++) {
            if (!group->controllers[n].mountPoint ||
                !group->controllers[n].placement)
                continue;

            /* Reject any controller with a placement
             * of '/' to avoid doing bad stuff to the root
             * cgroup
             */
            if (STREQ(group->controllers[n].placement, "/"))
                continue;

            controller = n;
            break;
        }

        if (controller == -1) {
            virReportSystemError(ENOSYS, "%s",
                                 _("No controllers are mounted"));
            return -1;
        }
    }

    if (!group->controllers[controller].mountPoint) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Controller '%s' is not mounted"),
                       virCgroupControllerTypeToString(controller));
        return -1;
    }

    if (!group->controllers[controller].placement) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Controller '%s' is not enabled for group"),
                       virCgroupControllerTypeToString(controller));
        return -1;
    }

    if (virAsprintf(path, "%s%s/%s",
                    group->controllers[controller].mountPoint,
                    group->controllers[controller].placement,
                    leaf) < 0)
        return -1;

    return 0;
}


655
/*
 * Write @value to the file @key of @controller within @group.
 *
 * Returns 0 on success, -1 if the path cannot be resolved or the
 * write fails (an error is reported).
 */
static int virCgroupSetValueStr(virCgroupPtr group,
                                int controller,
                                const char *key,
                                const char *value)
{
    char *keypath = NULL;
    int ret;

    if (virCgroupPathOfController(group, controller, key, &keypath) < 0)
        return -1;

    VIR_DEBUG("Set value '%s' to '%s'", keypath, value);
    if (virFileWriteStr(keypath, value, 0) < 0) {
        virReportSystemError(errno,
                             _("Unable to write to '%s'"), keypath);
        ret = -1;
    } else {
        ret = 0;
    }

    VIR_FREE(keypath);
    return ret;
}

/*
 * Read the file @key of @controller within @group into a newly
 * allocated string stored in @value (set to NULL first). A single
 * trailing newline is stripped.
 *
 * Returns 0 on success, -1 on failure (an error is reported).
 */
static int virCgroupGetValueStr(virCgroupPtr group,
                                int controller,
                                const char *key,
                                char **value)
{
    char *keypath = NULL;
    int len;

    *value = NULL;

    if (virCgroupPathOfController(group, controller, key, &keypath) < 0)
        return -1;

    VIR_DEBUG("Get value %s", keypath);

    len = virFileReadAll(keypath, 1024*1024, value);
    if (len < 0) {
        virReportSystemError(errno,
                             _("Unable to read from '%s'"), keypath);
        VIR_FREE(keypath);
        return -1;
    }

    /* Terminated with '\n' has sometimes harmful effects to the caller */
    if (len > 0 && (*value)[len - 1] == '\n')
        (*value)[len - 1] = '\0';

    VIR_FREE(keypath);
    return 0;
}

712
/*
 * Format @value as an unsigned decimal and write it to the file @key
 * of @controller via virCgroupSetValueStr().
 *
 * Returns the result of the write, or -1 if formatting fails.
 */
static int virCgroupSetValueU64(virCgroupPtr group,
                                int controller,
                                const char *key,
                                unsigned long long int value)
{
    char *text = NULL;
    int rc;

    if (virAsprintf(&text, "%llu", value) < 0)
        return -1;

    rc = virCgroupSetValueStr(group, controller, key, text);
    VIR_FREE(text);

    return rc;
}


731 732

/*
 * Format @value as a signed decimal and write it to the file @key
 * of @controller via virCgroupSetValueStr().
 *
 * Returns the result of the write, or -1 if formatting fails.
 */
static int virCgroupSetValueI64(virCgroupPtr group,
                                int controller,
                                const char *key,
                                long long int value)
{
    char *text = NULL;
    int rc;

    if (virAsprintf(&text, "%lld", value) < 0)
        return -1;

    rc = virCgroupSetValueStr(group, controller, key, text);
    VIR_FREE(text);

    return rc;
}

/*
 * Read the file @key of @controller and parse its contents as a
 * base-10 signed integer into @value.
 *
 * Returns 0 on success, -1 if the read or the parse fails.
 */
static int virCgroupGetValueI64(virCgroupPtr group,
                                int controller,
                                const char *key,
                                long long int *value)
{
    char *text = NULL;
    int ret = -1;

    if (virCgroupGetValueStr(group, controller, key, &text) >= 0) {
        if (virStrToLong_ll(text, NULL, 10, value) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Unable to parse '%s' as an integer"),
                           text);
        } else {
            ret = 0;
        }
    }

    VIR_FREE(text);
    return ret;
}

775
/*
 * Read the file @key of @controller and parse its contents as a
 * base-10 unsigned integer into @value.
 *
 * Returns 0 on success, -1 if the read or the parse fails.
 */
static int virCgroupGetValueU64(virCgroupPtr group,
                                int controller,
                                const char *key,
                                unsigned long long int *value)
{
    char *text = NULL;
    int ret = -1;

    if (virCgroupGetValueStr(group, controller, key, &text) >= 0) {
        if (virStrToLong_ull(text, NULL, 10, value) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Unable to parse '%s' as an integer"),
                           text);
        } else {
            ret = 0;
        }
    }

    VIR_FREE(text);
    return ret;
}


801
#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
802
/*
 * Copy the cpuset.cpus and cpuset.mems settings of @parent into
 * @group.
 *
 * Returns 0 on success, -1 as soon as one read or write fails.
 */
static int virCgroupCpuSetInherit(virCgroupPtr parent, virCgroupPtr group)
{
    const char *inherit_values[] = {
        "cpuset.cpus",
        "cpuset.mems",
    };
    size_t n;

    VIR_DEBUG("Setting up inheritance %s -> %s", parent->path, group->path);
    for (n = 0; n < ARRAY_CARDINALITY(inherit_values); n++) {
        char *value;
        int rc;

        if (virCgroupGetValueStr(parent,
                                 VIR_CGROUP_CONTROLLER_CPUSET,
                                 inherit_values[n],
                                 &value) < 0)
            return -1;

        VIR_DEBUG("Inherit %s = %s", inherit_values[n], value);

        rc = virCgroupSetValueStr(group,
                                  VIR_CGROUP_CONTROLLER_CPUSET,
                                  inherit_values[n],
                                  value);
        /* The value is no longer needed whether the write worked or not */
        VIR_FREE(value);
        if (rc < 0)
            return -1;
    }

    return 0;
}

835 836 837 838 839
/*
 * Enable memory.use_hierarchy for @group unless it already reads 1.
 *
 * Returns 0 on success, -1 if reading or writing the setting fails.
 */
static int virCgroupSetMemoryUseHierarchy(virCgroupPtr group)
{
    const char *filename = "memory.use_hierarchy";
    unsigned long long current;

    if (virCgroupGetValueU64(group,
                             VIR_CGROUP_CONTROLLER_MEMORY,
                             filename, &current) < 0)
        return -1;

    /* Setting twice causes error, so if already enabled, skip setting */
    if (current == 1)
        return 0;

    VIR_DEBUG("Setting up %s/%s", group->path, filename);

    return virCgroupSetValueU64(group,
                                VIR_CGROUP_CONTROLLER_MEMORY,
                                filename, 1) < 0 ? -1 : 0;
}

858 859 860 861
/*
 * virCgroupMakeGroup:
 * @parent: group whose cpuset settings are inherited for new directories
 * @group: group whose per-controller directories are checked/created
 * @create: if true, missing directories are created with mkdir()
 * @flags: virCgroupFlags; VIR_CGROUP_MEM_HIERACHY enables
 *         memory.use_hierarchy on a newly created directory
 *
 * Walks every mounted controller except the systemd one and makes
 * sure the group directory exists. A missing blkio directory that
 * cannot be created only causes blkio to be dropped from @group; for
 * any other controller it is an error.
 *
 * Returns 0 on success, -1 on error.
 */
static int virCgroupMakeGroup(virCgroupPtr parent,
                              virCgroupPtr group,
                              bool create,
                              unsigned int flags)
{
    size_t n;

    VIR_DEBUG("Make group %s", group->path);
    for (n = 0; n < VIR_CGROUP_CONTROLLER_LAST; n++) {
        char *path = NULL;

        /* We must never mkdir() in systemd's hierarchy */
        if (n == VIR_CGROUP_CONTROLLER_SYSTEMD) {
            VIR_DEBUG("Not creating systemd controller group");
            continue;
        }

        /* Skip over controllers that aren't mounted */
        if (!group->controllers[n].mountPoint) {
            VIR_DEBUG("Skipping unmounted controller %s",
                      virCgroupControllerTypeToString(n));
            continue;
        }

        if (virCgroupPathOfController(group, n, "", &path) < 0)
            return -1;

        /* As of Feb 2011, clang can't see that the above function
         * call did not modify group. */
        sa_assert(group->controllers[n].mountPoint);

        VIR_DEBUG("Make controller %s", path);
        if (access(path, F_OK) != 0) {
            if (!create ||
                mkdir(path, 0755) < 0) {
                if (n != VIR_CGROUP_CONTROLLER_BLKIO) {
                    virReportSystemError(errno,
                                         _("Failed to create controller %s for group"),
                                         virCgroupControllerTypeToString(n));
                    VIR_FREE(path);
                    return -1;
                }

                /* With a kernel that doesn't support multi-level directory
                 * for blkio controller, libvirt will fail and disable all
                 * other controllers even though they are available. So
                 * treat blkio as unmounted if mkdir fails. */
                VIR_DEBUG("Ignoring mkdir failure with blkio controller. Kernel probably too old");
                VIR_FREE(group->controllers[n].mountPoint);
                VIR_FREE(path);
                continue;
            }

            /* The new directory belongs to (or is co-mounted with)
             * cpuset: seed it from the parent's settings */
            if (group->controllers[VIR_CGROUP_CONTROLLER_CPUSET].mountPoint != NULL &&
                (n == VIR_CGROUP_CONTROLLER_CPUSET ||
                 STREQ(group->controllers[n].mountPoint,
                       group->controllers[VIR_CGROUP_CONTROLLER_CPUSET].mountPoint)) &&
                virCgroupCpuSetInherit(parent, group) < 0) {
                VIR_FREE(path);
                return -1;
            }

            /*
             * Note that virCgroupSetMemoryUseHierarchy should always be
             * called prior to creating subcgroups and attaching tasks.
             */
            if ((flags & VIR_CGROUP_MEM_HIERACHY) &&
                (group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint != NULL) &&
                (n == VIR_CGROUP_CONTROLLER_MEMORY ||
                 STREQ(group->controllers[n].mountPoint,
                       group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint)) &&
                virCgroupSetMemoryUseHierarchy(group) < 0) {
                VIR_FREE(path);
                return -1;
            }
        }

        VIR_FREE(path);
    }

    VIR_DEBUG("Done making controllers for group");
    return 0;
}

944

945 946 947 948 949 950 951 952 953 954 955 956 957 958
/**
 * virCgroupNew:
 * @path: path for the new group
 * @parent: parent group, or NULL
 * @controllers: bitmask of controllers to activate
 *
 * Create a new cgroup storing it in @group.
 *
 * If @path starts with a '/' it is treated as an
 * absolute path, and @parent is ignored. Otherwise
 * it is treated as being relative to @parent. If
 * @parent is NULL, then the placement of the current
 * process is used.
 *
959
 * Returns 0 on success, -1 on error
960
 */
961 962
static int virCgroupNew(pid_t pid,
                        const char *path,
963
                        virCgroupPtr parent,
964
                        int controllers,
965
                        virCgroupPtr *group)
966
{
967 968
    VIR_DEBUG("parent=%p path=%s controllers=%d",
              parent, path, controllers);
969
    *group = NULL;
970

971 972
    if (VIR_ALLOC((*group)) < 0)
        goto error;
973

974
    if (path[0] == '/' || !parent) {
975 976
        if (VIR_STRDUP((*group)->path, path) < 0)
            goto error;
977 978 979 980
    } else {
        if (virAsprintf(&(*group)->path, "%s%s%s",
                        parent->path,
                        STREQ(parent->path, "") ? "" : "/",
981 982
                        path) < 0)
            goto error;
983 984
    }

985
    if (virCgroupDetect(*group, pid, controllers, path, parent) < 0)
986
        goto error;
987

988 989 990
    return 0;

error:
991 992
    virCgroupFree(group);
    *group = NULL;
993

994
    return -1;
995
}
D
Daniel P. Berrange 已提交
996
#endif
997

998
#if defined _DIRENT_HAVE_D_TYPE
999
int virCgroupRemoveRecursively(char *grppath)
1000 1001 1002 1003 1004 1005 1006
{
    DIR *grpdir;
    struct dirent *ent;
    int rc = 0;

    grpdir = opendir(grppath);
    if (grpdir == NULL) {
1007 1008
        if (errno == ENOENT)
            return 0;
1009
        rc = -errno;
1010
        VIR_ERROR(_("Unable to open %s (%d)"), grppath, errno);
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038
        return rc;
    }

    for (;;) {
        char *path;

        errno = 0;
        ent = readdir(grpdir);
        if (ent == NULL) {
            if ((rc = -errno))
                VIR_ERROR(_("Failed to readdir for %s (%d)"), grppath, errno);
            break;
        }

        if (ent->d_name[0] == '.') continue;
        if (ent->d_type != DT_DIR) continue;

        if (virAsprintf(&path, "%s/%s", grppath, ent->d_name) == -1) {
            rc = -ENOMEM;
            break;
        }
        rc = virCgroupRemoveRecursively(path);
        VIR_FREE(path);
        if (rc != 0)
            break;
    }
    closedir(grpdir);

1039
    VIR_DEBUG("Removing cgroup %s", grppath);
1040 1041 1042 1043 1044 1045 1046
    if (rmdir(grppath) != 0 && errno != ENOENT) {
        rc = -errno;
        VIR_ERROR(_("Unable to remove %s (%d)"), grppath, errno);
    }

    return rc;
}
1047
#else
1048
int virCgroupRemoveRecursively(char *grppath ATTRIBUTE_UNUSED)
1049
{
1050 1051 1052
    virReportSystemError(ENXIO, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
1053 1054
}
#endif
1055

1056 1057 1058 1059 1060
/**
 * virCgroupRemove:
 *
 * @group: The group to be removed
 *
 * It first removes all child groups recursively
 * in depth first order and then removes @group
 * because the presence of the child groups
 * prevents removing @group.
 *
 * Removal is attempted for every mounted controller even when an
 * earlier one fails, so that as much as possible is cleaned up.
 *
 * Returns: 0 on success, otherwise the non-zero result of the first
 * controller whose removal failed
 */
int virCgroupRemove(virCgroupPtr group)
{
    int rc = 0;
    size_t i;
    char *grppath = NULL;

    VIR_DEBUG("Removing cgroup %s", group->path);
    for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
        int rv;

        /* Skip over controllers not mounted */
        if (!group->controllers[i].mountPoint)
            continue;

        /* We must never rmdir() in systemd's hierarchy */
        if (i == VIR_CGROUP_CONTROLLER_SYSTEMD)
            continue;

        /* Don't delete the root group, if we accidentally
           ended up in it for some reason */
        if (STREQ(group->controllers[i].placement, "/"))
            continue;

        if (virCgroupPathOfController(group,
                                      i,
                                      NULL,
                                      &grppath) != 0)
            continue;

        VIR_DEBUG("Removing cgroup %s and all child cgroups", grppath);
        rv = virCgroupRemoveRecursively(grppath);
        /* Remember the first failure rather than letting a later
         * successful controller overwrite it */
        if (rv != 0 && rc == 0)
            rc = rv;
        VIR_FREE(grppath);
    }
    VIR_DEBUG("Done removing cgroup %s", group->path);

    return rc;
}

1104

1105 1106 1107 1108 1109 1110
/**
 * virCgroupAddTask:
 *
 * @group: The cgroup to add a task to
 * @pid: The pid of the task to add
 *
 * Writes @pid to the "tasks" file of every mounted controller of
 * @group, stopping at the first failure.
 *
 * Returns: 0 on success, -1 on error
 */
int virCgroupAddTask(virCgroupPtr group, pid_t pid)
{
    size_t ctrl;

    for (ctrl = 0; ctrl < VIR_CGROUP_CONTROLLER_LAST; ctrl++) {
        /* Controllers that are not mounted are skipped, and we must
         * never add tasks in systemd's hierarchy */
        if (!group->controllers[ctrl].mountPoint ||
            ctrl == VIR_CGROUP_CONTROLLER_SYSTEMD)
            continue;

        if (virCgroupSetValueU64(group, ctrl, "tasks",
                                 (unsigned long long)pid) < 0)
            return -1;
    }

    return 0;
}

1136 1137 1138 1139 1140 1141 1142
/**
 * virCgroupAddTaskController:
 *
 * @group: The cgroup to add a task to
 * @pid: The pid of the task to add
 * @controller: The cgroup controller to be operated on
 *
 * Writes @pid to the "tasks" file of a single controller, after
 * checking that @controller is a valid index and is mounted.
 *
 * Returns: 0 on success or -1 on error
 */
int virCgroupAddTaskController(virCgroupPtr group, pid_t pid, int controller)
{
    bool inRange = controller >= 0 &&
                   controller < VIR_CGROUP_CONTROLLER_LAST;

    if (!inRange) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Controller %d out of range"), controller);
        return -1;
    }

    if (group->controllers[controller].mountPoint == NULL) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Controller '%s' not mounted"),
                       virCgroupControllerTypeToString(controller));
        return -1;
    }

    return virCgroupSetValueU64(group, controller, "tasks",
                                (unsigned long long)pid);
}


/*
 * virCgroupAddTaskStrController:
 * @group: The cgroup to add the tasks to
 * @pidstr: newline-separated list of decimal task IDs
 * @controller: The cgroup controller to be operated on
 *
 * Adds every task listed in @pidstr to @controller of @group.  A task
 * that has already exited (ESRCH) is silently skipped.
 *
 * Returns: 0 on success, -1 if an entry cannot be parsed or a task
 * cannot be added
 */
static int virCgroupAddTaskStrController(virCgroupPtr group,
                                        const char *pidstr,
                                        int controller)
{
    char *str = NULL, *cur = NULL, *next = NULL;
    unsigned long long p = 0;
    /* Pessimistic default: every "goto cleanup" below is a failure and
     * must not be reported to the caller as success */
    int rc = -1;
    char *endp;

    if (VIR_STRDUP(str, pidstr) < 0)
        return -1;

    cur = str;
    while (*cur != '\0') {
        if (virStrToLong_ull(cur, &endp, 10, &p) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Cannot parse '%s' as an integer"), cur);
            goto cleanup;
        }

        if (virCgroupAddTaskController(group, p, controller) < 0) {
            /* A thread that exits between when we first read the source
             * tasks and now is not fatal.  */
            if (virLastErrorIsSystemErrno(ESRCH))
                virResetLastError();
            else
                goto cleanup;
        }

        /* Advance to the entry after the next newline, if any */
        next = strchr(cur, '\n');
        if (next) {
            cur = next + 1;
            *next = '\0';
        } else {
            break;
        }
    }

    rc = 0;

cleanup:
    VIR_FREE(str);
    return rc;
}

/**
 * virCgroupMoveTask:
 *
 * @src_group: The source cgroup where all tasks are removed from
 * @dest_group: The destination where all tasks are added to
 * @controller: The cgroup controller to be operated on
 *
1215
 * Returns: 0 on success or -1 on failure
1216
 */
1217
int virCgroupMoveTask(virCgroupPtr src_group, virCgroupPtr dest_group)
1218
{
1219
    int ret = -1;
1220
    char *content = NULL;
1221
    size_t i;
1222

1223
    for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
1224 1225 1226
        if (!src_group->controllers[i].mountPoint ||
            !dest_group->controllers[i].mountPoint)
            continue;
1227

1228 1229 1230 1231
        /* We must never move tasks in systemd's hierarchy */
        if (i == VIR_CGROUP_CONTROLLER_SYSTEMD)
            continue;

1232 1233 1234 1235 1236
        /* New threads are created in the same group as their parent;
         * but if a thread is created after we first read we aren't
         * aware that it needs to move.  Therefore, we must iterate
         * until content is empty.  */
        while (1) {
J
Ján Tomko 已提交
1237
            VIR_FREE(content);
1238 1239 1240
            if (virCgroupGetValueStr(src_group, i, "tasks", &content) < 0)
                return -1;

1241 1242
            if (!*content)
                break;
1243

1244
            if (virCgroupAddTaskStrController(dest_group, content, i) < 0)
1245 1246
                goto cleanup;
        }
1247
    }
1248

1249
    ret = 0;
1250 1251
cleanup:
    VIR_FREE(content);
1252
    return ret;
1253
}
1254

1255 1256

#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
1257 1258 1259 1260 1261
/*
 * virCgroupPartitionNeedsEscaping:
 * @path: a single partition name (one path component)
 *
 * Decides whether @path has to be prefixed with '_' before it is used
 * as a cgroup directory name.
 *
 * Returns: 1 if escaping is needed, 0 if not, -1 if /proc/cgroups
 * cannot be opened or read
 */
static int virCgroupPartitionNeedsEscaping(const char *path)
{
    FILE *fp = NULL;
    int ret = 0;
    char *line = NULL;
    /* getline() is handed a NULL buffer, so its size must start at 0 */
    size_t buflen = 0;

    /* If it starts with 'cgroup.' or a '_' of any
     * of the controller names from /proc/cgroups,
     * then we must prefix a '_'
     */
    if (STRPREFIX(path, "cgroup."))
        return 1;

    if (path[0] == '_' ||
        path[0] == '.')
        return 1;

    if (!(fp = fopen("/proc/cgroups", "r"))) {
        /* The API contract is that we return ENXIO
         * if cgroups are not available on a host */
        if (errno == ENOENT)
            errno = ENXIO;
        virReportSystemError(errno, "%s",
                             _("Cannot open /proc/cgroups"));
        return -1;
    }

    /*
     * Data looks like this:
     * #subsys_name hierarchy num_cgroups enabled
     * cpuset  2 4  1
     * cpu     3 48 1
     * cpuacct 3 48 1
     * memory  4 4  1
     * devices 5 4  1
     * freezer 6 4  1
     * net_cls 7 1  1
     *
     * The columns may be separated by tabs as well as by spaces.
     */
    while (getline(&line, &buflen, fp) > 0) {
        size_t len;

        if (STRPREFIX(line, "#subsys_name"))
            continue;

        /* Cut the line after the controller name, whichever whitespace
         * character terminates it */
        len = strcspn(line, " \t\n");
        line[len] = '\0';

        /* STRPREFIX guarantees path has at least len characters, so
         * path[len] is within the string (possibly its terminator) */
        if (STRPREFIX(path, line) &&
            path[len] == '.') {
            ret = 1;
            goto cleanup;
        }
    }

    if (ferror(fp)) {
        virReportSystemError(errno, "%s",
                             _("Error while reading /proc/cgroups"));
        /* An error was reported, so it must not look like "no escaping" */
        ret = -1;
        goto cleanup;
    }

cleanup:
    VIR_FREE(line);
    VIR_FORCE_FCLOSE(fp);
    return ret;
}

/*
 * virCgroupPartitionEscape:
 * @path: partition name, modified in place when escaping is needed
 *
 * Prefixes *@path with '_' when virCgroupPartitionNeedsEscaping()
 * says so.
 *
 * Returns: 0 if nothing had to be done or the prefix was inserted,
 * a negative value on error
 */
static int virCgroupPartitionEscape(char **path)
{
    /* Number of array elements, counting the trailing NUL */
    size_t nchars = strlen(*path) + 1;
    char prefix = '_';
    int needed = virCgroupPartitionNeedsEscaping(*path);

    if (needed <= 0)
        return needed;

    return VIR_INSERT_ELEMENT(*path, 0, nchars, prefix) < 0 ? -1 : 0;
}

1341
/*
 * virCgroupSetPartitionSuffix:
 * @path: '/'-separated partition path
 * @res: filled with the newly allocated, rewritten path on success
 *
 * Rewrites every component of @path: a component without a '.' gets
 * ".partition" appended, and each component is then escaped if needed.
 * The top-level "machine", "system" and "user" components are left
 * untouched.
 *
 * Returns: 0 on success, -1 on error
 */
static int virCgroupSetPartitionSuffix(const char *path, char **res)
{
    static const char suffix[] = ".partition";
    char **parts = virStringSplit(path, "/", 0);
    size_t idx;
    int ret = -1;

    if (!parts)
        return ret;

    for (idx = 0; parts[idx]; idx++) {
        /* Whitelist the 3 top level fixed dirs
         * NB idx == 0 is "", since we have leading '/'
         */
        bool fixedTopLevel = idx == 1 &&
                             (STREQ(parts[idx], "machine") ||
                              STREQ(parts[idx], "system") ||
                              STREQ(parts[idx], "user"));

        if (fixedTopLevel)
            continue;

        /* If there is no suffix set already, then
         * add ".partition"
         */
        if (STRNEQ(parts[idx], "") && strchr(parts[idx], '.') == NULL) {
            size_t need = strlen(parts[idx]) + strlen(suffix) + 1;

            if (VIR_REALLOC_N(parts[idx], need) < 0)
                goto cleanup;
            strcat(parts[idx], suffix);
        }

        if (virCgroupPartitionEscape(&(parts[idx])) < 0)
            goto cleanup;
    }

    *res = virStringJoin((const char **)parts, "/");
    if (!*res)
        goto cleanup;

    ret = 0;

cleanup:
    virStringFreeList(parts);
    return ret;
}

1385 1386 1387 1388 1389 1390 1391 1392 1393
/**
 * virCgroupNewPartition:
 * @path: path for the partition
 * @create: true to create the cgroup tree
 * @controllers: mask of controllers to create
 * @group: filled with the new group on success, NULL on failure
 *         once @path has been validated
 *
 * Creates a new cgroup to represent the resource
 * partition path identified by @path.
 *
 * Returns 0 on success, -1 on failure
 */
int virCgroupNewPartition(const char *path,
                          bool create,
                          int controllers,
                          virCgroupPtr *group)
{
    int ret = -1;
    char *parentPath = NULL;
    virCgroupPtr parent = NULL;
    char *newpath = NULL;
    VIR_DEBUG("path=%s create=%d controllers=%x",
              path, create, controllers);

    if (path[0] != '/') {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Partition path '%s' must start with '/'"),
                       path);
        return -1;
    }

    /* The cleanup path frees *group, so it must hold a defined value
     * even when we fail before virCgroupNew() gets to set it */
    *group = NULL;

    if (virCgroupSetPartitionSuffix(path, &newpath) < 0)
        goto cleanup;

    if (virCgroupNew(-1, newpath, NULL, controllers, group) < 0)
        goto cleanup;

    if (STRNEQ(newpath, "/")) {
        char *tmp;
        if (VIR_STRDUP(parentPath, newpath) < 0)
            goto cleanup;

        /* Truncate just after the last '/' to obtain the parent path */
        tmp = strrchr(parentPath, '/');
        tmp++;
        *tmp = '\0';

        if (virCgroupNew(-1, parentPath, NULL, controllers, &parent) < 0)
            goto cleanup;

        if (virCgroupMakeGroup(parent, *group, create, VIR_CGROUP_NONE) < 0) {
            virCgroupRemove(*group);
            goto cleanup;
        }
    }

    ret = 0;
cleanup:
    if (ret != 0)
        virCgroupFree(group);
    virCgroupFree(&parent);
    VIR_FREE(parentPath);
    VIR_FREE(newpath);
    return ret;
}
#else
/* Fallback for platforms without mntent support: always reports ENXIO */
int virCgroupNewPartition(const char *path ATTRIBUTE_UNUSED,
                          bool create ATTRIBUTE_UNUSED,
                          int controllers ATTRIBUTE_UNUSED,
                          virCgroupPtr *group ATTRIBUTE_UNUSED)
{
    virReportSystemError(ENXIO, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
}
#endif

1460

G
Gao feng 已提交
1461
/**
1462
* virCgroupNewSelf:
G
Gao feng 已提交
1463 1464 1465
*
* @group: Pointer to returned virCgroupPtr
*
1466 1467 1468
* Obtain a cgroup representing the config of the
* current process
*
1469
* Returns 0 on success, or -1 on error
G
Gao feng 已提交
1470
*/
1471
int virCgroupNewSelf(virCgroupPtr *group)
G
Gao feng 已提交
1472
{
1473
    return virCgroupNewDetect(-1, -1, group);
G
Gao feng 已提交
1474
}
1475

1476

1477 1478 1479 1480 1481 1482 1483 1484
/**
 * virCgroupNewDomainPartition:
 *
 * @partition: partition holding the domain
 * @driver: name of the driver
 * @name: name of the domain
 * @create: passed through to virCgroupMakeGroup()
 * @group: Pointer to returned virCgroupPtr
 *
 * The group is named "<name>.libvirt-<driver>" (escaped if needed) and
 * is placed relative to @partition.
 *
 * Returns 0 on success, or -1 on error
 */
#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
int virCgroupNewDomainPartition(virCgroupPtr partition,
                                const char *driver,
                                const char *name,
                                bool create,
                                virCgroupPtr *group)
{
    char *grpname = NULL;
    int ret = -1;

    if (virAsprintf(&grpname, "%s.libvirt-%s", name, driver) < 0 ||
        virCgroupPartitionEscape(&grpname) < 0 ||
        virCgroupNew(-1, grpname, partition, -1, group) < 0)
        goto cleanup;

    /*
     * Create a cgroup with memory.use_hierarchy enabled to
     * surely account memory usage of lxc with ns subsystem
     * enabled. (To be exact, memory and ns subsystems are
     * enabled at the same time.)
     *
     * The reason why doing it here, not a upper group, say
     * a group for driver, is to avoid overhead to track
     * cumulative usage that we don't need.
     */
    if (virCgroupMakeGroup(partition, *group, create,
                           VIR_CGROUP_MEM_HIERACHY) < 0) {
        /* Undo the partially set up group before reporting failure */
        virCgroupRemove(*group);
        virCgroupFree(group);
        goto cleanup;
    }

    ret = 0;

cleanup:
    VIR_FREE(grpname);
    return ret;
}
#else
int virCgroupNewDomainPartition(virCgroupPtr partition ATTRIBUTE_UNUSED,
                                const char *driver ATTRIBUTE_UNUSED,
                                const char *name ATTRIBUTE_UNUSED,
                                bool create ATTRIBUTE_UNUSED,
                                virCgroupPtr *group ATTRIBUTE_UNUSED)
{
    virReportSystemError(ENXIO, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
}
#endif
1541

1542
/**
1543
 * virCgroupNewVcpu:
1544
 *
1545
 * @domain: group for the domain
1546
 * @vcpuid: id of the vcpu
1547
 * @create: true to create if not already existing
1548 1549
 * @group: Pointer to returned virCgroupPtr
 *
1550
 * Returns 0 on success, or -1 on error
1551 1552
 */
#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
1553
int virCgroupNewVcpu(virCgroupPtr domain,
1554
                     int vcpuid,
1555 1556
                     bool create,
                     virCgroupPtr *group)
1557
{
1558 1559
    int ret = -1;
    char *name = NULL;
1560
    int controllers;
1561

1562
    if (virAsprintf(&name, "vcpu%d", vcpuid) < 0)
1563
        goto cleanup;
1564

1565 1566 1567 1568
    controllers = ((1 << VIR_CGROUP_CONTROLLER_CPU) |
                   (1 << VIR_CGROUP_CONTROLLER_CPUACCT) |
                   (1 << VIR_CGROUP_CONTROLLER_CPUSET));

1569
    if (virCgroupNew(-1, name, domain, controllers, group) < 0)
1570
        goto cleanup;
1571

1572 1573 1574 1575
    if (virCgroupMakeGroup(domain, *group, create, VIR_CGROUP_NONE) < 0) {
        virCgroupRemove(*group);
        virCgroupFree(group);
        goto cleanup;
1576 1577
    }

1578 1579 1580 1581
    ret = 0;
cleanup:
    VIR_FREE(name);
    return ret;
1582 1583
}
#else
1584
int virCgroupNewVcpu(virCgroupPtr domain ATTRIBUTE_UNUSED,
1585
                     int vcpuid ATTRIBUTE_UNUSED,
1586 1587
                     bool create ATTRIBUTE_UNUSED,
                     virCgroupPtr *group ATTRIBUTE_UNUSED)
1588
{
1589 1590 1591
    virReportSystemError(ENXIO, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
1592 1593 1594
}
#endif

1595
/**
1596
 * virCgroupNewEmulator:
1597
 *
1598 1599
 * @domain: group for the domain
 * @create: true to create if not already existing
1600 1601
 * @group: Pointer to returned virCgroupPtr
 *
1602
 * Returns: 0 on success or -1 on error
1603 1604
 */
#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
1605 1606 1607
int virCgroupNewEmulator(virCgroupPtr domain,
                         bool create,
                         virCgroupPtr *group)
1608
{
1609
    int ret = -1;
1610
    int controllers;
1611

1612 1613 1614 1615
    controllers = ((1 << VIR_CGROUP_CONTROLLER_CPU) |
                   (1 << VIR_CGROUP_CONTROLLER_CPUACCT) |
                   (1 << VIR_CGROUP_CONTROLLER_CPUSET));

1616
    if (virCgroupNew(-1, "emulator", domain, controllers, group) < 0)
1617
        goto cleanup;
1618

1619 1620 1621 1622
    if (virCgroupMakeGroup(domain, *group, create, VIR_CGROUP_NONE) < 0) {
        virCgroupRemove(*group);
        virCgroupFree(group);
        goto cleanup;
1623 1624
    }

1625 1626 1627
    ret = 0;
cleanup:
    return ret;
1628 1629
}
#else
1630 1631 1632
int virCgroupNewEmulator(virCgroupPtr domain ATTRIBUTE_UNUSED,
                         bool create ATTRIBUTE_UNUSED,
                         virCgroupPtr *group ATTRIBUTE_UNUSED)
1633
{
1634 1635 1636
    virReportSystemError(ENXIO, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
1637 1638 1639
}

#endif
1640

1641 1642 1643

#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
/*
 * virCgroupNewDetect:
 * @pid: process ID passed through to virCgroupNew()
 * @controllers: mask of controllers passed through to virCgroupNew()
 * @group: Pointer to returned virCgroupPtr
 *
 * Builds a group from an empty path and no parent.
 *
 * Returns 0 on success, -1 on error
 */
int virCgroupNewDetect(pid_t pid,
                       int controllers,
                       virCgroupPtr *group)
{
    const char *path = "";
    virCgroupPtr parent = NULL;

    return virCgroupNew(pid, path, parent, controllers, group);
}
#else
int virCgroupNewDetect(pid_t pid ATTRIBUTE_UNUSED,
                       int controllers ATTRIBUTE_UNUSED,
                       virCgroupPtr *group ATTRIBUTE_UNUSED)
{
    virReportSystemError(ENXIO, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
}
#endif

1660 1661 1662 1663 1664 1665
/*
 * Returns 0 on success (but @group may be NULL), -1 on fatal error
 */
int virCgroupNewDetectMachine(const char *name,
                              const char *drivername,
                              pid_t pid,
1666
                              const char *partition,
1667
                              int controllers,
1668 1669
                              virCgroupPtr *group)
{
1670
    if (virCgroupNewDetect(pid, controllers, group) < 0) {
1671 1672 1673 1674 1675
        if (virCgroupNewIgnoreError())
            return 0;
        return -1;
    }

1676
    if (!virCgroupValidateMachineGroup(*group, name, drivername, partition, true)) {
1677 1678
        VIR_DEBUG("Failed to validate machine name for '%s' driver '%s'",
                  name, drivername);
1679 1680 1681 1682 1683 1684 1685
        virCgroupFree(group);
        return 0;
    }

    return 0;
}

1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699
/*
 * virCgroupNewMachineSystemd:
 *
 * Registers the machine through virSystemdCreateMachine(), detects the
 * placement systemd chose for @pidleader, creates every level of that
 * placement in the other controllers and finally adds @pidleader to
 * the resulting group.
 *
 * A negative result of virSystemdCreateMachine() is returned unchanged.
 *
 * Returns 0 on success, -1 on fatal error, -2 on systemd not available
 */
static int
virCgroupNewMachineSystemd(const char *name,
                           const char *drivername,
                           bool privileged,
                           const unsigned char *uuid,
                           const char *rootdir,
                           pid_t pidleader,
                           bool isContainer,
                           const char *partition,
                           int controllers,
                           virCgroupPtr *group)
{
    virCgroupPtr probe;
    virCgroupPtr parent = NULL;
    char *placement = NULL;
    char *cursor;
    char *slash;
    int ret = -1;
    int rv;

    VIR_DEBUG("Trying to setup machine '%s' via systemd", name);
    rv = virSystemdCreateMachine(name,
                                 drivername,
                                 privileged,
                                 uuid,
                                 rootdir,
                                 pidleader,
                                 isContainer,
                                 partition);
    if (rv < 0)
        return rv;

    /* An explicit controller mask must also cover systemd's hierarchy */
    if (controllers != -1)
        controllers |= (1 << VIR_CGROUP_CONTROLLER_SYSTEMD);

    VIR_DEBUG("Detecting systemd placement");
    if (virCgroupNewDetect(pidleader, controllers, &probe) < 0)
        return -1;

    /* Take ownership of the placement string before the probe group
     * is released */
    placement = probe->controllers[VIR_CGROUP_CONTROLLER_SYSTEMD].placement;
    probe->controllers[VIR_CGROUP_CONTROLLER_SYSTEMD].placement = NULL;
    virCgroupFree(&probe);

    if (!placement || STREQ(placement, "/") || placement[0] != '/') {
        VIR_DEBUG("Systemd didn't setup its controller");
        ret = -2;
        goto cleanup;
    }

    if (virCgroupNew(pidleader,
                     "",
                     NULL,
                     controllers,
                     &parent) < 0)
        goto cleanup;

    /* Walk the placement one component at a time: the string is cut at
     * each '/' in turn so that every prefix is made as its own group */
    cursor = placement;
    do {
        virCgroupPtr level;

        slash = strchr(cursor + 1, '/');
        if (slash)
            *slash = '\0';

        if (virCgroupNew(pidleader,
                         placement,
                         parent,
                         controllers,
                         &level) < 0)
            goto cleanup;

        if (virCgroupMakeGroup(parent, level, true, VIR_CGROUP_NONE) < 0) {
            virCgroupFree(&level);
            goto cleanup;
        }

        if (!slash) {
            /* Last component: this is the group handed to the caller */
            *group = level;
        } else {
            *slash = '/';
            cursor = slash;
            virCgroupFree(&parent);
            parent = level;
        }
    } while (slash);

    if (virCgroupAddTask(*group, pidleader) < 0) {
        /* Keep the AddTask error across the clean-up calls */
        virErrorPtr saved = virSaveLastError();
        virCgroupRemove(*group);
        virCgroupFree(group);
        if (saved) {
            virSetError(saved);
            virFreeError(saved);
        }
    }

    ret = 0;
 cleanup:
    virCgroupFree(&parent);
    VIR_FREE(placement);
    return ret;
}
1791

1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803
/*
 * virCgroupNewMachineManual:
 *
 * Non-systemd setup: opens @partition (creating it only when it is
 * "/machine"), creates the domain group inside it and adds @pidleader.
 *
 * Returns 0 on success or when the partition error is one that
 * virCgroupNewIgnoreError() accepts, -1 on error
 */
static int
virCgroupNewMachineManual(const char *name,
                          const char *drivername,
                          pid_t pidleader,
                          const char *partition,
                          int controllers,
                          virCgroupPtr *group)
{
    virCgroupPtr parent = NULL;
    bool createPartition;
    int ret = -1;

    VIR_DEBUG("Fallback to non-systemd setup");
    createPartition = STREQ(partition, "/machine");
    if (virCgroupNewPartition(partition,
                              createPartition,
                              controllers,
                              &parent) < 0) {
        ret = virCgroupNewIgnoreError() ? 0 : -1;
        goto cleanup;
    }

    if (virCgroupNewDomainPartition(parent,
                                    drivername,
                                    name,
                                    true,
                                    group) < 0)
        goto cleanup;

    if (virCgroupAddTask(*group, pidleader) < 0) {
        /* Keep the AddTask error across the clean-up calls */
        virErrorPtr saved = virSaveLastError();
        virCgroupRemove(*group);
        virCgroupFree(group);
        if (saved) {
            virSetError(saved);
            virFreeError(saved);
        }
    }

    ret = 0;

cleanup:
    virCgroupFree(&parent);
    return ret;
}

1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876
/*
 * virCgroupNewMachine:
 *
 * Tries the systemd based setup first and falls back to the manual
 * setup only when the systemd path reports something other than
 * success (0) or a fatal error (-1).
 *
 * Returns 0 on success, -1 on error
 */
int virCgroupNewMachine(const char *name,
                        const char *drivername,
                        bool privileged,
                        const unsigned char *uuid,
                        const char *rootdir,
                        pid_t pidleader,
                        bool isContainer,
                        const char *partition,
                        int controllers,
                        virCgroupPtr *group)
{
    int rv;

    *group = NULL;

    rv = virCgroupNewMachineSystemd(name,
                                    drivername,
                                    privileged,
                                    uuid,
                                    rootdir,
                                    pidleader,
                                    isContainer,
                                    partition,
                                    controllers,
                                    group);

    /* Success and fatal errors are final; anything else falls back */
    if (rv == 0 || rv == -1)
        return rv;

    return virCgroupNewMachineManual(name,
                                     drivername,
                                     pidleader,
                                     partition,
                                     controllers,
                                     group);
}

1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888
/*
 * virCgroupNewIgnoreError:
 *
 * Checks whether the last reported error is a system error of ENXIO,
 * EPERM or EACCES; if so the error is reset.
 *
 * Returns true if the error was reset, false otherwise
 */
bool virCgroupNewIgnoreError(void)
{
    bool ignorable = virLastErrorIsSystemErrno(ENXIO) ||
                     virLastErrorIsSystemErrno(EPERM) ||
                     virLastErrorIsSystemErrno(EACCES);

    if (!ignorable)
        return false;

    virResetLastError();
    VIR_DEBUG("No cgroups present/configured/accessible, ignoring error");
    return true;
}

1889 1890 1891 1892 1893 1894
/**
 * virCgroupSetBlkioWeight:
 *
 * @group: The cgroup to change io weight for
 * @weight: The Weight for this cgroup, 100 to 1000 inclusive
 *
 * Writes @weight to "blkio.weight" of the blkio controller.
 *
 * Returns: 0 on success, -1 on error
 */
int virCgroupSetBlkioWeight(virCgroupPtr group, unsigned int weight)
{
    if (weight < 100 || weight > 1000) {
        virReportError(VIR_ERR_INVALID_ARG,
                       _("weight '%u' must be in range (100, 1000)"),
                       weight);
        return -1;
    }

    return virCgroupSetValueU64(group, VIR_CGROUP_CONTROLLER_BLKIO,
                                "blkio.weight", weight);
}

/**
 * virCgroupGetBlkioWeight:
 *
 * @group: The cgroup to get weight for
 * @weight: Pointer to returned weight; only written on success
 *
 * Reads "blkio.weight" of the blkio controller.
 *
 * Returns: 0 on success, -1 on error
 */
int virCgroupGetBlkioWeight(virCgroupPtr group, unsigned int *weight)
{
    unsigned long long value;
    int rc = virCgroupGetValueU64(group,
                                  VIR_CGROUP_CONTROLLER_BLKIO,
                                  "blkio.weight", &value);

    if (rc != 0)
        return rc;

    *weight = value;
    return 0;
}

1932 1933 1934 1935 1936 1937 1938 1939 1940 1941
/**
 * virCgroupSetBlkioDeviceWeight:
 *
 * @group: The cgroup to change io device weight device for
 * @path: The device with a weight to alter
 * @weight: The new device weight (100-1000), or 0 to clear
 *
 * device_weight is treated as a write-only parameter, so
 * there isn't a getter counterpart.
 *
1942
 * Returns: 0 on success, -1 on error
1943 1944 1945 1946 1947 1948 1949 1950 1951 1952
 */
#if defined(major) && defined(minor)
int virCgroupSetBlkioDeviceWeight(virCgroupPtr group,
                                  const char *path,
                                  unsigned int weight)
{
    char *str;
    struct stat sb;
    int ret;

1953 1954 1955 1956 1957 1958
    if (weight && (weight > 1000 || weight < 100)) {
        virReportError(VIR_ERR_INVALID_ARG,
                       _("weight '%u' must be in range (100, 1000)"),
                       weight);
        return -1;
    }
1959

1960 1961 1962 1963 1964 1965
    if (stat(path, &sb) < 0) {
        virReportSystemError(errno,
                             _("Path '%s' is not accessible"),
                             path);
        return -1;
    }
1966

1967 1968 1969 1970 1971 1972
    if (!S_ISBLK(sb.st_mode)) {
        virReportSystemError(EINVAL,
                             _("Path '%s' must be a block device"),
                             path);
        return -1;
    }
1973 1974 1975

    if (virAsprintf(&str, "%d:%d %d", major(sb.st_rdev), minor(sb.st_rdev),
                    weight) < 0)
1976
        return -1;
1977 1978 1979 1980 1981 1982 1983 1984 1985

    ret = virCgroupSetValueStr(group,
                               VIR_CGROUP_CONTROLLER_BLKIO,
                               "blkio.weight_device",
                               str);
    VIR_FREE(str);
    return ret;
}
#else
O
Osier Yang 已提交
1986 1987 1988 1989
int
virCgroupSetBlkioDeviceWeight(virCgroupPtr group ATTRIBUTE_UNUSED,
                              const char *path ATTRIBUTE_UNUSED,
                              unsigned int weight ATTRIBUTE_UNUSED)
1990
{
1991 1992 1993
    virReportSystemError(ENOSYS, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
1994 1995 1996
}
#endif

1997 1998 1999 2000 2001 2002 2003 2004
/**
 * virCgroupSetMemory:
 *
 * @group: The cgroup to change memory for
 * @kb: The memory amount in kilobytes
 *
 * Returns: 0 on success
 */
2005
int virCgroupSetMemory(virCgroupPtr group, unsigned long long kb)
2006
{
2007 2008
    unsigned long long maxkb = VIR_DOMAIN_MEMORY_PARAM_UNLIMITED;

2009 2010 2011 2012 2013 2014 2015 2016
    if (kb > maxkb) {
        virReportError(VIR_ERR_INVALID_ARG,
                       _("Memory '%llu' must be less than %llu"),
                       kb, maxkb);
        return -1;
    }

    if (kb == maxkb)
2017 2018 2019 2020 2021 2022 2023 2024 2025
        return virCgroupSetValueI64(group,
                                    VIR_CGROUP_CONTROLLER_MEMORY,
                                    "memory.limit_in_bytes",
                                    -1);
    else
        return virCgroupSetValueU64(group,
                                    VIR_CGROUP_CONTROLLER_MEMORY,
                                    "memory.limit_in_bytes",
                                    kb << 10);
2026 2027
}

R
Ryota Ozaki 已提交
2028 2029 2030 2031 2032 2033 2034 2035 2036 2037
/**
 * virCgroupGetMemoryUsage:
 *
 * @group: The cgroup to get the memory usage for
 * @kb: Pointer to returned used memory in kilobytes
 *
 * Reads "memory.usage_in_bytes" from the memory controller and converts
 * the value to kilobytes.  @kb is only written when the read succeeds.
 *
 * Returns: 0 on success, otherwise the failure value returned by
 * virCgroupGetValueU64()
 */
int virCgroupGetMemoryUsage(virCgroupPtr group, unsigned long *kb)
{
    unsigned long long usage_in_bytes;
    int ret;

    ret = virCgroupGetValueU64(group,
                               VIR_CGROUP_CONTROLLER_MEMORY,
                               "memory.usage_in_bytes", &usage_in_bytes);
    if (ret == 0) {
        /* Shift while the value is still 64 bits wide and narrow
         * afterwards.  A cast binds tighter than '>>', so casting first
         * would discard the high bits on platforms where unsigned long
         * is only 32 bits. */
        *kb = (unsigned long)(usage_in_bytes >> 10);
    }
    return ret;
}

2048 2049 2050 2051 2052 2053 2054 2055
/**
 * virCgroupSetMemoryHardLimit:
 *
 * @group: The cgroup to change memory hard limit for
 * @kb: The memory amount in kilobytes
 *
 * Returns: 0 on success
 */
2056
int virCgroupSetMemoryHardLimit(virCgroupPtr group, unsigned long long kb)
2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068
{
    return virCgroupSetMemory(group, kb);
}

/**
 * virCgroupGetMemoryHardLimit:
 *
 * @group: The cgroup to get the memory hard limit for
 * @kb: The memory amount in kilobytes
 *
 * Returns: 0 on success
 */
2069
int virCgroupGetMemoryHardLimit(virCgroupPtr group, unsigned long long *kb)
2070 2071 2072 2073 2074 2075 2076
{
    long long unsigned int limit_in_bytes;
    int ret;
    ret = virCgroupGetValueU64(group,
                               VIR_CGROUP_CONTROLLER_MEMORY,
                               "memory.limit_in_bytes", &limit_in_bytes);
    if (ret == 0)
2077
        *kb = limit_in_bytes >> 10;
2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088
    return ret;
}

/**
 * virCgroupSetMemorySoftLimit:
 *
 * @group: The cgroup to change memory soft limit for
 * @kb: The memory amount in kilobytes
 *
 * Returns: 0 on success
 */
2089
int virCgroupSetMemorySoftLimit(virCgroupPtr group, unsigned long long kb)
2090
{
2091 2092
    unsigned long long maxkb = VIR_DOMAIN_MEMORY_PARAM_UNLIMITED;

2093 2094 2095 2096 2097 2098 2099 2100
    if (kb > maxkb) {
        virReportError(VIR_ERR_INVALID_ARG,
                       _("Memory '%llu' must be less than %llu"),
                       kb, maxkb);
        return -1;
    }

    if (kb == maxkb)
2101 2102 2103 2104 2105 2106 2107 2108 2109
        return virCgroupSetValueI64(group,
                                    VIR_CGROUP_CONTROLLER_MEMORY,
                                    "memory.soft_limit_in_bytes",
                                    -1);
    else
        return virCgroupSetValueU64(group,
                                    VIR_CGROUP_CONTROLLER_MEMORY,
                                    "memory.soft_limit_in_bytes",
                                    kb << 10);
2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120
}


/**
 * virCgroupGetMemorySoftLimit:
 *
 * @group: The cgroup to get the memory soft limit for
 * @kb: The memory amount in kilobytes
 *
 * Returns: 0 on success
 */
2121
int virCgroupGetMemorySoftLimit(virCgroupPtr group, unsigned long long *kb)
2122 2123 2124 2125 2126 2127 2128
{
    long long unsigned int limit_in_bytes;
    int ret;
    ret = virCgroupGetValueU64(group,
                               VIR_CGROUP_CONTROLLER_MEMORY,
                               "memory.soft_limit_in_bytes", &limit_in_bytes);
    if (ret == 0)
2129
        *kb = limit_in_bytes >> 10;
2130 2131 2132 2133
    return ret;
}

/**
2134
 * virCgroupSetMemSwapHardLimit:
2135
 *
2136 2137
 * @group: The cgroup to change mem+swap hard limit for
 * @kb: The mem+swap amount in kilobytes
2138 2139 2140
 *
 * Returns: 0 on success
 */
2141
int virCgroupSetMemSwapHardLimit(virCgroupPtr group, unsigned long long kb)
2142
{
2143 2144
    unsigned long long maxkb = VIR_DOMAIN_MEMORY_PARAM_UNLIMITED;

2145 2146 2147 2148 2149 2150 2151 2152
    if (kb > maxkb) {
        virReportError(VIR_ERR_INVALID_ARG,
                       _("Memory '%llu' must be less than %llu"),
                       kb, maxkb);
        return -1;
    }

    if (kb == maxkb)
2153 2154 2155 2156 2157 2158 2159 2160 2161
        return virCgroupSetValueI64(group,
                                    VIR_CGROUP_CONTROLLER_MEMORY,
                                    "memory.memsw.limit_in_bytes",
                                    -1);
    else
        return virCgroupSetValueU64(group,
                                    VIR_CGROUP_CONTROLLER_MEMORY,
                                    "memory.memsw.limit_in_bytes",
                                    kb << 10);
2162 2163 2164
}

/**
2165
 * virCgroupGetMemSwapHardLimit:
2166
 *
2167 2168
 * @group: The cgroup to get mem+swap hard limit for
 * @kb: The mem+swap amount in kilobytes
2169 2170 2171
 *
 * Returns: 0 on success
 */
2172
int virCgroupGetMemSwapHardLimit(virCgroupPtr group, unsigned long long *kb)
2173 2174 2175 2176 2177 2178 2179
{
    long long unsigned int limit_in_bytes;
    int ret;
    ret = virCgroupGetValueU64(group,
                               VIR_CGROUP_CONTROLLER_MEMORY,
                               "memory.memsw.limit_in_bytes", &limit_in_bytes);
    if (ret == 0)
2180
        *kb = limit_in_bytes >> 10;
2181 2182 2183
    return ret;
}

G
Gao feng 已提交
2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203
/**
 * virCgroupGetMemSwapUsage:
 *
 * @group: The cgroup to get mem+swap usage for
 * @kb: Pointer that receives the mem+swap usage in kilobytes
 *
 * Reads "memory.memsw.usage_in_bytes" and converts it to kilobytes.
 * @kb is only written when the read succeeds.
 *
 * Returns: 0 on success
 */
int virCgroupGetMemSwapUsage(virCgroupPtr group, unsigned long long *kb)
{
    unsigned long long bytes;
    int rc = virCgroupGetValueU64(group,
                                  VIR_CGROUP_CONTROLLER_MEMORY,
                                  "memory.memsw.usage_in_bytes", &bytes);

    if (rc != 0)
        return rc;

    /* bytes -> kilobytes */
    *kb = bytes >> 10;
    return 0;
}

2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235
/**
 * virCgroupSetCpusetMems:
 *
 * @group: The cgroup to set cpuset.mems for
 * @mems: the numa nodes to set
 *
 * Writes @mems to the "cpuset.mems" file of the cpuset controller.
 *
 * Returns: 0 on success
 */
int virCgroupSetCpusetMems(virCgroupPtr group, const char *mems)
{
    int rc;

    rc = virCgroupSetValueStr(group,
                              VIR_CGROUP_CONTROLLER_CPUSET,
                              "cpuset.mems",
                              mems);
    return rc;
}

/**
 * virCgroupGetCpusetMems:
 *
 * @group: The cgroup to get cpuset.mems for
 * @mems: Pointer that receives the numa nodes string
 *
 * Reads the "cpuset.mems" file of the cpuset controller.
 *
 * Returns: 0 on success
 */
int virCgroupGetCpusetMems(virCgroupPtr group, char **mems)
{
    int rc;

    rc = virCgroupGetValueStr(group,
                              VIR_CGROUP_CONTROLLER_CPUSET,
                              "cpuset.mems",
                              mems);
    return rc;
}

2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267
/**
 * virCgroupSetCpusetCpus:
 *
 * @group: The cgroup to set cpuset.cpus for
 * @cpus: the cpus to set
 *
 * Writes @cpus to the "cpuset.cpus" file of the cpuset controller.
 *
 * Returns: 0 on success
 */
int virCgroupSetCpusetCpus(virCgroupPtr group, const char *cpus)
{
    int rc;

    rc = virCgroupSetValueStr(group,
                              VIR_CGROUP_CONTROLLER_CPUSET,
                              "cpuset.cpus",
                              cpus);
    return rc;
}

/**
 * virCgroupGetCpusetCpus:
 *
 * @group: The cgroup to get cpuset.cpus for
 * @cpus: Pointer that receives the cpus string
 *
 * Reads the "cpuset.cpus" file of the cpuset controller.
 *
 * Returns: 0 on success
 */
int virCgroupGetCpusetCpus(virCgroupPtr group, char **cpus)
{
    int rc;

    rc = virCgroupGetValueStr(group,
                              VIR_CGROUP_CONTROLLER_CPUSET,
                              "cpuset.cpus",
                              cpus);
    return rc;
}

2268 2269 2270
/**
 * virCgroupDenyAllDevices:
 *
 * @group: The cgroup to deny all permissions, for all devices
 *
 * Writes the entry "a" to the "devices.deny" file of the devices
 * controller.
 *
 * Returns: 0 on success
 */
int virCgroupDenyAllDevices(virCgroupPtr group)
{
    int rc;

    rc = virCgroupSetValueStr(group,
                              VIR_CGROUP_CONTROLLER_DEVICES,
                              "devices.deny",
                              "a");
    return rc;
}

/**
 * virCgroupAllowDevice:
 *
 * @group: The cgroup to allow a device for
 * @type: The device type (i.e., 'c' or 'b')
 * @major: The major number of the device
 * @minor: The minor number of the device
2290
 * @perms: Bitwise or of VIR_CGROUP_DEVICE permission bits to allow
2291 2292 2293
 *
 * Returns: 0 on success
 */
2294 2295
int virCgroupAllowDevice(virCgroupPtr group, char type, int major, int minor,
                         int perms)
2296
{
2297
    int ret = -1;
2298 2299
    char *devstr = NULL;

2300 2301 2302
    if (virAsprintf(&devstr, "%c %i:%i %s%s%s", type, major, minor,
                    perms & VIR_CGROUP_DEVICE_READ ? "r" : "",
                    perms & VIR_CGROUP_DEVICE_WRITE ? "w" : "",
2303 2304
                    perms & VIR_CGROUP_DEVICE_MKNOD ? "m" : "") < 0)
        goto cleanup;
2305

2306 2307 2308 2309 2310
    if (virCgroupSetValueStr(group,
                             VIR_CGROUP_CONTROLLER_DEVICES,
                             "devices.allow",
                             devstr) < 0)
        goto cleanup;
2311

2312 2313 2314 2315 2316
    ret = 0;

cleanup:
    VIR_FREE(devstr);
    return ret;
2317
}
2318

2319 2320 2321 2322 2323 2324
/**
 * virCgroupAllowDeviceMajor:
 *
 * @group: The cgroup to allow an entire device major type for
 * @type: The device type (i.e., 'c' or 'b')
 * @major: The major number of the device type
2325
 * @perms: Bitwise or of VIR_CGROUP_DEVICE permission bits to allow
2326 2327 2328
 *
 * Returns: 0 on success
 */
2329 2330
int virCgroupAllowDeviceMajor(virCgroupPtr group, char type, int major,
                              int perms)
2331
{
2332
    int ret = -1;
2333 2334
    char *devstr = NULL;

2335 2336 2337
    if (virAsprintf(&devstr, "%c %i:* %s%s%s", type, major,
                    perms & VIR_CGROUP_DEVICE_READ ? "r" : "",
                    perms & VIR_CGROUP_DEVICE_WRITE ? "w" : "",
2338 2339
                    perms & VIR_CGROUP_DEVICE_MKNOD ? "m" : "") < 0)
        goto cleanup;
2340

2341 2342 2343 2344 2345
    if (virCgroupSetValueStr(group,
                             VIR_CGROUP_CONTROLLER_DEVICES,
                             "devices.allow",
                             devstr) < 0)
        goto cleanup;
2346

2347 2348 2349 2350 2351
    ret = 0;

cleanup:
    VIR_FREE(devstr);
    return ret;
2352 2353
}

2354 2355 2356 2357 2358
/**
 * virCgroupAllowDevicePath:
 *
 * @group: The cgroup to allow the device for
 * @path: the device to allow
2359
 * @perms: Bitwise or of VIR_CGROUP_DEVICE permission bits to allow
2360 2361 2362 2363
 *
 * Queries the type of device and its major/minor number, and
 * adds that to the cgroup ACL
 *
2364
 * Returns: 0 on success, 1 if path exists but is not a device, or
2365
 * -1 on error
2366
 */
D
Daniel P. Berrange 已提交
2367
#if defined(major) && defined(minor)
2368
int virCgroupAllowDevicePath(virCgroupPtr group, const char *path, int perms)
2369 2370 2371
{
    struct stat sb;

2372 2373 2374 2375 2376 2377
    if (stat(path, &sb) < 0) {
        virReportSystemError(errno,
                             _("Path '%s' is not accessible"),
                             path);
        return -1;
    }
2378 2379

    if (!S_ISCHR(sb.st_mode) && !S_ISBLK(sb.st_mode))
2380
        return 1;
2381 2382 2383 2384

    return virCgroupAllowDevice(group,
                                S_ISCHR(sb.st_mode) ? 'c' : 'b',
                                major(sb.st_rdev),
2385 2386
                                minor(sb.st_rdev),
                                perms);
2387
}
D
Daniel P. Berrange 已提交
2388 2389
#else
int virCgroupAllowDevicePath(virCgroupPtr group ATTRIBUTE_UNUSED,
2390 2391
                             const char *path ATTRIBUTE_UNUSED,
                             int perms ATTRIBUTE_UNUSED)
D
Daniel P. Berrange 已提交
2392
{
2393 2394 2395
    virReportSystemError(ENOSYS, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
D
Daniel P. Berrange 已提交
2396 2397 2398
}
#endif

2399 2400 2401 2402 2403 2404 2405 2406

/**
 * virCgroupDenyDevice:
 *
 * @group: The cgroup to deny a device for
 * @type: The device type (i.e., 'c' or 'b')
 * @major: The major number of the device
 * @minor: The minor number of the device
2407
 * @perms: Bitwise or of VIR_CGROUP_DEVICE permission bits to deny
2408 2409 2410
 *
 * Returns: 0 on success
 */
2411 2412
int virCgroupDenyDevice(virCgroupPtr group, char type, int major, int minor,
                        int perms)
2413
{
2414
    int ret = -1;
2415 2416
    char *devstr = NULL;

2417 2418 2419
    if (virAsprintf(&devstr, "%c %i:%i %s%s%s", type, major, minor,
                    perms & VIR_CGROUP_DEVICE_READ ? "r" : "",
                    perms & VIR_CGROUP_DEVICE_WRITE ? "w" : "",
2420 2421
                    perms & VIR_CGROUP_DEVICE_MKNOD ? "m" : "") < 0)
        goto cleanup;
2422

2423 2424 2425 2426 2427
    if (virCgroupSetValueStr(group,
                             VIR_CGROUP_CONTROLLER_DEVICES,
                             "devices.deny",
                             devstr) < 0)
        goto cleanup;
2428

2429 2430 2431 2432 2433
    ret = 0;

cleanup:
    VIR_FREE(devstr);
    return ret;
2434 2435 2436 2437 2438 2439 2440 2441
}

/**
 * virCgroupDenyDeviceMajor:
 *
 * @group: The cgroup to deny an entire device major type for
 * @type: The device type (i.e., 'c' or 'b')
 * @major: The major number of the device type
2442
 * @perms: Bitwise or of VIR_CGROUP_DEVICE permission bits to deny
2443 2444 2445
 *
 * Returns: 0 on success
 */
2446 2447
int virCgroupDenyDeviceMajor(virCgroupPtr group, char type, int major,
                             int perms)
2448
{
2449
    int ret = -1;
2450 2451
    char *devstr = NULL;

2452 2453 2454
    if (virAsprintf(&devstr, "%c %i:* %s%s%s", type, major,
                    perms & VIR_CGROUP_DEVICE_READ ? "r" : "",
                    perms & VIR_CGROUP_DEVICE_WRITE ? "w" : "",
2455 2456
                    perms & VIR_CGROUP_DEVICE_MKNOD ? "m" : "") < 0)
        goto cleanup;
2457

2458 2459 2460 2461 2462
    if (virCgroupSetValueStr(group,
                             VIR_CGROUP_CONTROLLER_DEVICES,
                             "devices.deny",
                             devstr) < 0)
        goto cleanup;
2463

2464 2465 2466 2467 2468
    ret = 0;

cleanup:
    VIR_FREE(devstr);
    return ret;
2469 2470
}

D
Daniel P. Berrange 已提交
2471
/**
 * virCgroupDenyDevicePath:
 *
 * @group: The cgroup to deny the device for
 * @path: the device to deny
 * @perms: Bitwise or of VIR_CGROUP_DEVICE permission bits to deny
 *
 * Queries the type of device and its major/minor number, and passes
 * them to virCgroupDenyDevice().
 *
 * When the major()/minor() macros are not available the function only
 * reports ENOSYS.
 *
 * Returns: 1 if path exists but is not a device, -1 if the path cannot
 * be stat()ed, otherwise the result of virCgroupDenyDevice()
 */
#if defined(major) && defined(minor)
int virCgroupDenyDevicePath(virCgroupPtr group, const char *path, int perms)
{
    struct stat info;
    char devtype;

    if (stat(path, &info) < 0) {
        virReportSystemError(errno,
                             _("Path '%s' is not accessible"),
                             path);
        return -1;
    }

    if (S_ISCHR(info.st_mode))
        devtype = 'c';
    else if (S_ISBLK(info.st_mode))
        devtype = 'b';
    else
        return 1; /* neither a character nor a block device */

    return virCgroupDenyDevice(group,
                               devtype,
                               major(info.st_rdev),
                               minor(info.st_rdev),
                               perms);
}
#else
int virCgroupDenyDevicePath(virCgroupPtr group ATTRIBUTE_UNUSED,
                            const char *path ATTRIBUTE_UNUSED,
                            int perms ATTRIBUTE_UNUSED)
{
    virReportSystemError(ENOSYS, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
}
#endif
2502

2503
/* Write @shares to the "cpu.shares" file of the cpu controller and
 * return the result of virCgroupSetValueU64(). */
int virCgroupSetCpuShares(virCgroupPtr group, unsigned long long shares)
{
    int rc;

    rc = virCgroupSetValueU64(group,
                              VIR_CGROUP_CONTROLLER_CPU,
                              "cpu.shares", shares);
    return rc;
}

2510
/* Read the "cpu.shares" file of the cpu controller into @shares and
 * return the result of virCgroupGetValueU64(). */
int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares)
{
    int rc;

    rc = virCgroupGetValueU64(group,
                              VIR_CGROUP_CONTROLLER_CPU,
                              "cpu.shares", shares);
    return rc;
}
2516

2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529
/**
 * virCgroupSetCpuCfsPeriod:
 *
 * @group: The cgroup to change cpu.cfs_period_us for
 * @cfs_period: The bandwidth period in usecs
 *
 * Validates @cfs_period and writes it to "cpu.cfs_period_us".
 *
 * Returns: 0 on success
 */
int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period)
{
    /* The period must be at least 1ms (1000us) and at most
     * 1s (1000000us). */
    bool valid = (cfs_period >= 1000 && cfs_period <= 1000000);

    if (!valid) {
        virReportError(VIR_ERR_INVALID_ARG,
                       _("cfs_period '%llu' must be in range (1000, 1000000)"),
                       cfs_period);
        return -1;
    }

    return virCgroupSetValueU64(group,
                                VIR_CGROUP_CONTROLLER_CPU,
                                "cpu.cfs_period_us", cfs_period);
}

/**
 * virCgroupGetCpuCfsPeriod:
 *
 * @group: The cgroup to get cpu.cfs_period_us for
 * @cfs_period: Pointer to the returned bandwidth period in usecs
 *
 * Reads the "cpu.cfs_period_us" file of the cpu controller.
 *
 * Returns: 0 on success
 */
int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period)
{
    int rc;

    rc = virCgroupGetValueU64(group,
                              VIR_CGROUP_CONTROLLER_CPU,
                              "cpu.cfs_period_us", cfs_period);
    return rc;
}

/**
 * virCgroupSetCpuCfsQuota:
 *
 * @group: The cgroup to change cpu.cfs_quota_us for
 * @cfs_quota: the cpu bandwidth (in usecs) that this tg will be allowed to
 *             consume over period
 *
 * Non-negative quotas must lie between 1000 and ULLONG_MAX / 1000;
 * negative quotas are not range-checked and are written as given.
 *
 * Returns: 0 on success
 */
int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota)
{
    /* Only non-negative values are validated: they must be at least 1ms. */
    if (cfs_quota >= 0) {
        bool tooSmall = cfs_quota < 1000;
        bool tooLarge = cfs_quota > ULLONG_MAX / 1000;

        if (tooSmall || tooLarge) {
            virReportError(VIR_ERR_INVALID_ARG,
                           _("cfs_quota '%lld' must be in range (1000, %llu)"),
                           cfs_quota, ULLONG_MAX / 1000);
            return -1;
        }
    }

    return virCgroupSetValueI64(group,
                                VIR_CGROUP_CONTROLLER_CPU,
                                "cpu.cfs_quota_us", cfs_quota);
}

/**
 * virCgroupGetCpuCfsQuota:
 *
 * @group: The cgroup to get cpu.cfs_quota_us for
 * @cfs_quota: Pointer to the returned cpu bandwidth (in usecs) that this tg
 *             will be allowed to consume over period
 *
 * Reads the "cpu.cfs_quota_us" file of the cpu controller.
 *
 * Returns: 0 on success
 */
int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota)
{
    int rc;

    rc = virCgroupGetValueI64(group,
                              VIR_CGROUP_CONTROLLER_CPU,
                              "cpu.cfs_quota_us", cfs_quota);
    return rc;
}

2599 2600
/* Read the "cpuacct.usage" file of the cpuacct controller into @usage
 * and return the result of virCgroupGetValueU64(). */
int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage)
{
    int rc;

    rc = virCgroupGetValueU64(group,
                              VIR_CGROUP_CONTROLLER_CPUACCT,
                              "cpuacct.usage", usage);
    return rc;
}
R
Ryota Ozaki 已提交
2605

2606 2607 2608 2609 2610 2611
/* Read the "cpuacct.usage_percpu" file of the cpuacct controller as a
 * string into @usage and return the result of virCgroupGetValueStr(). */
int virCgroupGetCpuacctPercpuUsage(virCgroupPtr group, char **usage)
{
    int rc;

    rc = virCgroupGetValueStr(group,
                              VIR_CGROUP_CONTROLLER_CPUACCT,
                              "cpuacct.usage_percpu", usage);
    return rc;
}

2612 2613 2614 2615 2616 2617
/**
 * virCgroupGetCpuacctStat:
 *
 * @group: The cgroup to read cpuacct.stat from
 * @user: Pointer that receives the scaled "user" value
 * @sys: Pointer that receives the scaled "system" value
 *
 * Parses the "user " and "\nsystem " entries of "cpuacct.stat" and
 * multiplies both by 1000000000 / sysconf(_SC_CLK_TCK), turning tick
 * counts into approximate nanoseconds.  The scale factor is computed
 * once and cached in a function-local static.
 *
 * When _SC_CLK_TCK is not defined the function only reports ENOSYS.
 *
 * Returns: 0 on success, -1 on error
 */
#ifdef _SC_CLK_TCK
int virCgroupGetCpuacctStat(virCgroupPtr group, unsigned long long *user,
                            unsigned long long *sys)
{
    char *str;
    char *p;
    char *next;
    int ret = -1;
    static double scale = -1.0;

    if (virCgroupGetValueStr(group, VIR_CGROUP_CONTROLLER_CPUACCT,
                             "cpuacct.stat", &str) < 0)
        return -1;

    if (!(p = STRSKIP(str, "user ")) ||
        virStrToLong_ull(p, &p, 10, user) < 0) {
        /* p is NULL when the "user " prefix is missing; never hand a
         * NULL pointer to a %s conversion, show the raw text instead. */
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Cannot parse user stat '%s'"),
                       p ? p : str);
        goto cleanup;
    }
    /* Keep p intact so that the remaining text can still be reported
     * when the "\nsystem " prefix is missing. */
    if (!(next = STRSKIP(p, "\nsystem ")) ||
        virStrToLong_ull(next, NULL, 10, sys) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Cannot parse sys stat '%s'"),
                       next ? next : p);
        goto cleanup;
    }
    /* times reported are in system ticks (generally 100 Hz), but that
     * rate can theoretically vary between machines.  Scale things
     * into approximate nanoseconds.  */
    if (scale < 0) {
        long ticks_per_sec = sysconf(_SC_CLK_TCK);
        if (ticks_per_sec == -1) {
            virReportSystemError(errno, "%s",
                                 _("Cannot determine system clock HZ"));
            goto cleanup;
        }
        scale = 1000000000.0 / ticks_per_sec;
    }
    *user *= scale;
    *sys *= scale;

    ret = 0;
cleanup:
    VIR_FREE(str);
    return ret;
}
#else
int virCgroupGetCpuacctStat(virCgroupPtr group ATTRIBUTE_UNUSED,
                            unsigned long long *user ATTRIBUTE_UNUSED,
                            unsigned long long *sys ATTRIBUTE_UNUSED)
{
    virReportSystemError(ENOSYS, "%s",
                         _("Control groups not supported on this platform"));
    return -1;
}
#endif

R
Ryota Ozaki 已提交
2670 2671 2672
/* Write @state to the "freezer.state" file of the freezer controller
 * and return the result of virCgroupSetValueStr(). */
int virCgroupSetFreezerState(virCgroupPtr group, const char *state)
{
    int rc;

    rc = virCgroupSetValueStr(group,
                              VIR_CGROUP_CONTROLLER_FREEZER,
                              "freezer.state", state);
    return rc;
}

/* Read the "freezer.state" file of the freezer controller as a string
 * into @state and return the result of virCgroupGetValueStr(). */
int virCgroupGetFreezerState(virCgroupPtr group, char **state)
{
    int rc;

    rc = virCgroupGetValueStr(group,
                              VIR_CGROUP_CONTROLLER_FREEZER,
                              "freezer.state", state);
    return rc;
}
2683

2684

E
Eric Blake 已提交
2685
#if defined HAVE_KILL && defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R
2686 2687 2688
/*
 * Returns 1 if some PIDs are killed, 0 if none are killed, or -1 on error
 */
2689 2690
static int virCgroupKillInternal(virCgroupPtr group, int signum, virHashTablePtr pids)
{
2691
    int ret = -1;
2692
    bool killedAny = false;
2693 2694
    char *keypath = NULL;
    bool done = false;
E
Eric Blake 已提交
2695 2696 2697
    FILE *fp = NULL;
    VIR_DEBUG("group=%p path=%s signum=%d pids=%p",
              group, group->path, signum, pids);
2698

2699
    if (virCgroupPathOfController(group, -1, "tasks", &keypath) < 0)
2700
        return -1;
2701 2702 2703 2704 2705 2706 2707

    /* PIDs may be forking as we kill them, so loop
     * until there are no new PIDs found
     */
    while (!done) {
        done = true;
        if (!(fp = fopen(keypath, "r"))) {
2708 2709 2710 2711 2712 2713
            if (errno == ENOENT) {
                VIR_DEBUG("No file %s, assuming done", keypath);
                killedAny = false;
                goto done;
            }

2714 2715 2716
            virReportSystemError(errno,
                                 _("Failed to read %s"),
                                 keypath);
2717 2718 2719
            goto cleanup;
        } else {
            while (!feof(fp)) {
2720 2721
                unsigned long pid_value;
                if (fscanf(fp, "%lu", &pid_value) != 1) {
2722 2723
                    if (feof(fp))
                        break;
2724 2725 2726
                    virReportSystemError(errno,
                                         _("Failed to read %s"),
                                         keypath);
E
Eric Blake 已提交
2727
                    goto cleanup;
2728
                }
2729
                if (virHashLookup(pids, (void*)pid_value))
2730 2731
                    continue;

2732 2733 2734
                VIR_DEBUG("pid=%lu", pid_value);
                /* Cgroups is a Linux concept, so this cast is safe.  */
                if (kill((pid_t)pid_value, signum) < 0) {
2735
                    if (errno != ESRCH) {
2736 2737 2738
                        virReportSystemError(errno,
                                             _("Failed to kill process %lu"),
                                             pid_value);
2739 2740 2741 2742
                        goto cleanup;
                    }
                    /* Leave RC == 0 since we didn't kill one */
                } else {
2743
                    killedAny = true;
2744 2745 2746
                    done = false;
                }

2747
                ignore_value(virHashAddEntry(pids, (void*)pid_value, (void*)1));
2748 2749 2750 2751 2752
            }
            VIR_FORCE_FCLOSE(fp);
        }
    }

2753
 done:
2754
    ret = killedAny ? 1 : 0;
2755 2756 2757

cleanup:
    VIR_FREE(keypath);
E
Eric Blake 已提交
2758
    VIR_FORCE_FCLOSE(fp);
2759

2760
    return ret;
2761 2762 2763
}


2764
/* Hash callback for the PID table: the key pointer carries the PID value
 * itself, so hash the integer recovered from it. */
static uint32_t virCgroupPidCode(const void *name, uint32_t seed)
{
    unsigned long pid = (unsigned long)(intptr_t)name;

    return virHashCodeGen(&pid, sizeof(pid), seed);
}
/* Key comparison callback for the PID table: two keys match when the
 * pointer values themselves are equal. */
static bool virCgroupPidEqual(const void *namea, const void *nameb)
{
    bool same = (namea == nameb);

    return same;
}
/* Key copy callback for the PID table: nothing is duplicated, the same
 * pointer value is handed back without its const qualifier. */
static void *virCgroupPidCopy(const void *name)
{
    void *key = (void*)name;

    return key;
}

/*
 * Sends @signum to the PIDs listed in the "tasks" file of @group.
 *
 * Returns 1 if some PIDs are killed, 0 if none are killed, or -1 on error
 * (including failure to create the PID tracking table).
 */
int virCgroupKill(virCgroupPtr group, int signum)
{
    virHashTablePtr pids;
    int ret;

    VIR_DEBUG("group=%p path=%s signum=%d", group, group->path, signum);

    /* The 'tasks' file in cgroups can contain duplicated
     * pids, so we use a hash to track which we've already
     * killed.
     */
    pids = virHashCreateFull(100,
                             NULL,
                             virCgroupPidCode,
                             virCgroupPidEqual,
                             virCgroupPidCopy,
                             NULL);
    /* virCgroupKillInternal() relies on the table to skip PIDs it has
     * already signalled, so do not proceed without one.
     * NOTE(review): presumably virHashCreateFull() has already reported
     * the failure - confirm. */
    if (!pids)
        return -1;

    ret = virCgroupKillInternal(group, signum, pids);

    virHashFree(pids);

    return ret;
}


/*
 * Sends @signum to the PIDs of @group via virCgroupKillInternal() and
 * then recurses into every sub-directory of the group's directory.
 * Nested calls are always made with dormdir == true; when @dormdir is
 * true for this call, each sub-group is passed to virCgroupRemove()
 * after it has been processed.
 *
 * Returns 1 if some PIDs are killed, 0 if none are killed, or -1 on error.
 * If the group's directory does not exist, 0 is returned.
 */
static int virCgroupKillRecursiveInternal(virCgroupPtr group, int signum, virHashTablePtr pids, bool dormdir)
{
    int ret = -1;
    int rc;
    bool killedAny = false;
    char *keypath = NULL;
    DIR *dp = NULL;
    virCgroupPtr subgroup = NULL;
    struct dirent *ent;
    VIR_DEBUG("group=%p path=%s signum=%d pids=%p", group, group->path, signum, pids);

    if (virCgroupPathOfController(group, -1, "", &keypath) < 0)
        return -1;

    /* From here on every exit goes through 'cleanup' so that keypath
     * is released. */
    if ((rc = virCgroupKillInternal(group, signum, pids)) < 0)
        goto cleanup;
    if (rc == 1)
        killedAny = true;

    VIR_DEBUG("Iterate over children of %s (killedAny=%d)", keypath, killedAny);
    if (!(dp = opendir(keypath))) {
        if (errno == ENOENT) {
            VIR_DEBUG("Path %s does not exist, assuming done", keypath);
            killedAny = false;
            goto done;
        }
        virReportSystemError(errno,
                             _("Cannot open %s"), keypath);
        goto cleanup;
    }

    while ((ent = readdir(dp))) {
        if (STREQ(ent->d_name, "."))
            continue;
        if (STREQ(ent->d_name, ".."))
            continue;
        if (ent->d_type != DT_DIR)
            continue;

        VIR_DEBUG("Process subdir %s", ent->d_name);

        if (virCgroupNew(-1, ent->d_name, group, -1, &subgroup) < 0)
            goto cleanup;

        if ((rc = virCgroupKillRecursiveInternal(subgroup, signum, pids, true)) < 0)
            goto cleanup;
        if (rc == 1)
            killedAny = true;

        if (dormdir)
            virCgroupRemove(subgroup);

        virCgroupFree(&subgroup);
    }

 done:
    ret = killedAny ? 1 : 0;

cleanup:
    virCgroupFree(&subgroup);
    /* dp is still NULL when opendir() failed or was never reached;
     * closedir() must not be handed a NULL stream. */
    if (dp)
        closedir(dp);
    VIR_FREE(keypath);

    return ret;
}

/*
 * virCgroupKillRecursive:
 * @group: cgroup to start from
 * @signum: signal number forwarded to virCgroupKillRecursiveInternal
 *
 * Creates a temporary PID hash table, runs
 * virCgroupKillRecursiveInternal on @group with it (dormdir=false for
 * the top-level group) and frees the table afterwards.
 *
 * Returns the value of virCgroupKillRecursiveInternal, or -1 if the
 * hash table could not be created.
 */
int virCgroupKillRecursive(virCgroupPtr group, int signum)
{
    int ret;
    virHashTablePtr pids;

    VIR_DEBUG("group=%p path=%s signum=%d", group, group->path, signum);

    pids = virHashCreateFull(100,
                             NULL,
                             virCgroupPidCode,
                             virCgroupPidEqual,
                             virCgroupPidCopy,
                             NULL);
    /* Do not hand a missing table down the recursion */
    if (!pids)
        return -1;

    ret = virCgroupKillRecursiveInternal(group, signum, pids, false);

    virHashFree(pids);

    return ret;
}


/*
 * virCgroupKillPainfully:
 * @group: cgroup to signal
 *
 * Runs up to 15 rounds of virCgroupKillRecursive, pausing 200ms between
 * rounds. Round 0 sends SIGTERM, round 8 sends SIGKILL, and every other
 * round passes signal 0 (existence check only). The loop stops as soon
 * as a round returns 0 or a negative value.
 *
 * Returns the result of the last virCgroupKillRecursive call.
 */
int virCgroupKillPainfully(virCgroupPtr group)
{
    size_t attempt;
    int rc;

    VIR_DEBUG("cgroup=%p path=%s", group, group->path);

    for (attempt = 0; attempt < 15; attempt++) {
        int sig;

        switch (attempt) {
        case 0:
            sig = SIGTERM;
            break;
        case 8:
            sig = SIGKILL;
            break;
        default:
            sig = 0; /* Just check for existence */
            break;
        }

        rc = virCgroupKillRecursive(group, sig);
        VIR_DEBUG("Iteration %zu rc=%d", attempt, rc);

        /* -1 means an error occurred, 0 means no PIDs are left */
        if (rc <= 0)
            break;

        usleep(200 * 1000);
    }

    VIR_DEBUG("Complete %d", rc);
    return rc;
}
2913

E
Eric Blake 已提交
2914
#else /* !(HAVE_KILL, HAVE_MNTENT_H, HAVE_GETMNTENT_R) */
2915 2916 2917
/*
 * virCgroupKill: stub used when the real implementation is compiled out.
 * Both arguments are ignored; reports ENOSYS and returns -1.
 */
int
virCgroupKill(virCgroupPtr group ATTRIBUTE_UNUSED,
              int signum ATTRIBUTE_UNUSED)
{
    virReportSystemError(ENOSYS,
                         "%s", _("Control groups not supported on this platform"));

    return -1;
}
/*
 * virCgroupKillRecursive: stub used when the real implementation is
 * compiled out. Both arguments are ignored; reports ENOSYS and returns -1.
 */
int
virCgroupKillRecursive(virCgroupPtr group ATTRIBUTE_UNUSED,
                       int signum ATTRIBUTE_UNUSED)
{
    virReportSystemError(ENOSYS,
                         "%s", _("Control groups not supported on this platform"));

    return -1;
}

/*
 * virCgroupKillPainfully: stub used when the real implementation is
 * compiled out. The argument is ignored; reports ENOSYS and returns -1.
 */
int
virCgroupKillPainfully(virCgroupPtr group ATTRIBUTE_UNUSED)
{
    virReportSystemError(ENOSYS,
                         "%s", _("Control groups not supported on this platform"));

    return -1;
}
E
Eric Blake 已提交
2936
#endif /* HAVE_KILL, HAVE_MNTENT_H, HAVE_GETMNTENT_R */
2937 2938 2939 2940 2941 2942 2943

#ifdef __linux__
/*
 * virCgroupIdentifyRoot:
 * @group: cgroup whose controller mount points are inspected
 *
 * Takes the first controller that has a mount point and returns a newly
 * allocated copy of that mount point with everything from the last '/'
 * onwards removed. The caller owns the returned string.
 *
 * Returns NULL (with an error reported) if the mount point contains no
 * '/', if the copy fails, or if no controller has a mount point.
 */
static char *virCgroupIdentifyRoot(virCgroupPtr group)
{
    size_t idx;

    for (idx = 0; idx < VIR_CGROUP_CONTROLLER_LAST; idx++) {
        const char *mnt = group->controllers[idx].mountPoint;
        char *root = NULL;
        char *sep;

        if (mnt == NULL)
            continue;

        sep = strrchr(mnt, '/');
        if (sep == NULL) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Could not find directory separator in %s"),
                           mnt);
            return NULL;
        }

        /* Copy only the part before the final separator */
        if (VIR_STRNDUP(root, mnt, sep - mnt) < 0)
            return NULL;

        return root;
    }

    virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                   _("Could not find any mounted controllers"));
    return NULL;
}


/*
 * virCgroupIsolateMount:
 * @group: cgroup whose controller mount points are to be recreated
 * @oldroot: path prefix prepended to each controller mount point to form
 *           the bind-mount source
 * @mountopts: text appended verbatim after "mode=755,size=65536" in the
 *             tmpfs mount options (so any extra option must start with
 *             its own ',' separator)
 *
 * Mounts a tmpfs on the directory returned by virCgroupIdentifyRoot.
 * Then, for every controller that has a mount point: if the mount point
 * does not exist, it is created and "@oldroot + mountPoint + placement"
 * is bind-mounted onto it; if the controller has a link point, a symlink
 * from the link point to the mount point is created.
 *
 * Returns 0 on success, -1 on failure (with an error reported).
 */
int virCgroupIsolateMount(virCgroupPtr group, const char *oldroot,
                          const char *mountopts)
{
    int ret = -1;
    size_t i;
    char *opts = NULL;
    char *root = NULL;

    if (!(root = virCgroupIdentifyRoot(group)))
        return -1;

    VIR_DEBUG("Mounting cgroups at '%s'", root);

    if (virFileMakePath(root) < 0) {
        virReportSystemError(errno,
                             _("Unable to create directory %s"),
                             root);
        goto cleanup;
    }

    if (virAsprintf(&opts,
                    "mode=755,size=65536%s", mountopts) < 0)
        goto cleanup;

    if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_NODEV|MS_NOEXEC, opts) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount %s on %s type %s"),
                             "tmpfs", root, "tmpfs");
        goto cleanup;
    }

    for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
        if (!group->controllers[i].mountPoint)
            continue;

        /* Only bind-mount controllers whose mount point is not there yet */
        if (!virFileExists(group->controllers[i].mountPoint)) {
            char *src;
            if (virAsprintf(&src, "%s%s%s",
                            oldroot,
                            group->controllers[i].mountPoint,
                            group->controllers[i].placement) < 0)
                goto cleanup;

            VIR_DEBUG("Create mount point '%s'", group->controllers[i].mountPoint);
            if (virFileMakePath(group->controllers[i].mountPoint) < 0) {
                virReportSystemError(errno,
                                     _("Unable to create directory %s"),
                                     group->controllers[i].mountPoint);
                VIR_FREE(src);
                goto cleanup;
            }

            if (mount(src, group->controllers[i].mountPoint, NULL, MS_BIND, NULL) < 0) {
                virReportSystemError(errno,
                                     _("Failed to bind cgroup '%s' on '%s'"),
                                     src, group->controllers[i].mountPoint);
                VIR_FREE(src);
                goto cleanup;
            }

            VIR_FREE(src);
        }

        if (group->controllers[i].linkPoint) {
            VIR_DEBUG("Link mount point '%s' to '%s'",
                      group->controllers[i].mountPoint,
                      group->controllers[i].linkPoint);
            if (symlink(group->controllers[i].mountPoint,
                        group->controllers[i].linkPoint) < 0) {
                virReportSystemError(errno,
                                     _("Unable to symlink directory %s to %s"),
                                     group->controllers[i].mountPoint,
                                     group->controllers[i].linkPoint);
                /* Must go through cleanup so root and opts are released */
                goto cleanup;
            }
        }
    }
    ret = 0;

cleanup:
    VIR_FREE(root);
    VIR_FREE(opts);
    return ret;
}
#else /* __linux__ */
/*
 * virCgroupIsolateMount: stub for builds where __linux__ is not defined.
 * All arguments are ignored; reports ENOSYS and returns -1.
 */
int
virCgroupIsolateMount(virCgroupPtr group ATTRIBUTE_UNUSED,
                      const char *oldroot ATTRIBUTE_UNUSED,
                      const char *mountopts ATTRIBUTE_UNUSED)
{
    virReportSystemError(ENOSYS,
                         "%s", _("Control groups not supported on this platform"));

    return -1;
}
#endif /* __linux__ */