lxc_container.c 23.0 KB
Newer Older
1 2
/*
 * Copyright IBM Corp. 2008
 * Copyright Red Hat 2008-2009
 *
 * lxc_container.c: LXC container setup and startup
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *  Daniel P. Berrange <berrange@redhat.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
31
#include <stdio.h>
32 33
#include <sys/ioctl.h>
#include <sys/mount.h>
34
#include <sys/wait.h>
35
#include <unistd.h>
36 37 38 39 40 41 42
#include <mntent.h>

/* Yes, we want linux private one, for _syscall2() macro */
#include <linux/unistd.h>

/* For MS_MOVE */
#include <linux/fs.h>
43

D
Daniel P. Berrange 已提交
44 45 46
#if HAVE_CAPNG
#include <cap-ng.h>
#endif
47

48
#include "virterror_internal.h"
49
#include "logging.h"
50 51
#include "lxc_container.h"
#include "util.h"
52
#include "memory.h"
53
#include "veth.h"
54

55 56
#define VIR_FROM_THIS VIR_FROM_LXC

57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
/*
 * GLibc headers are behind the kernel, so we define these
 * constants if they're not present already.
 */

/*
 * Each flag below is defined only when the headers included above did not
 * already provide it. The flags are OR'ed into the clone() flags built by
 * lxcContainerStart() and lxcContainerAvailable().
 */
#ifndef CLONE_NEWPID
#define CLONE_NEWPID  0x20000000
#endif
#ifndef CLONE_NEWUTS
#define CLONE_NEWUTS  0x04000000
#endif
#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000
#endif
#ifndef CLONE_NEWIPC
#define CLONE_NEWIPC  0x08000000
#endif
#ifndef CLONE_NEWNET
#define CLONE_NEWNET  0x40000000 /* New network namespace */
#endif

/*
 * Messages between parent and container: a single char.
 * LXC_CONTINUE_MSG is what lxcContainerSendContinue() writes and the only
 * value lxcContainerWaitForContinue() accepts.
 */
typedef char lxc_message_t;
#define LXC_CONTINUE_MSG 'c'

typedef struct __lxc_child_argv lxc_child_argv_t;
struct __lxc_child_argv {
84
    virDomainDefPtr config;
85 86
    unsigned int nveths;
    char **veths;
87 88 89 90 91
    int monitor;
    char *ttyPath;
};


92
/**
93
 * lxcContainerExecInit:
94 95
 * @vmDef: Ptr to vm definition structure
 *
96
 * Exec the container init string. The container init will replace then
97 98
 * be running in the current process
 *
99
 * Does not return
100
 */
101
static int lxcContainerExecInit(virDomainDefPtr vmDef)
102
{
103
    const char *const argv[] = {
104
        vmDef->os.init,
105 106
        NULL,
    };
107

108
    return execve(argv[0], (char **)argv, NULL);
109 110 111
}

/**
 * lxcContainerSetStdio:
 * @control: the control FD, kept open across the FD sweep
 * @ttyfd: open FD of the tty to use as the container console
 *
 * Starts a new session, makes the given tty the controlling terminal,
 * closes every other file descriptor and finally duplicates the tty onto
 * stdin, stdout and stderr.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetStdio(int control, int ttyfd)
{
    int nfds, fd;

    if (setsid() < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("setsid failed"));
        return -1;
    }

    if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("ioctl(TIOCSTTY) failed"));
        return -1;
    }

    /* Just in case someone forgot to set FD_CLOEXEC, explicitly
     * close all FDs (stdio included) before executing the container;
     * only the tty and the control channel survive */
    nfds = sysconf (_SC_OPEN_MAX);
    for (fd = 0; fd < nfds; fd++) {
        if (fd == ttyfd || fd == control)
            continue;
        close(fd);
    }

    if (dup2(ttyfd, 0) < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("dup2(stdin) failed"));
        return -1;
    }

    if (dup2(ttyfd, 1) < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("dup2(stdout) failed"));
        return -1;
    }

    if (dup2(ttyfd, 2) < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("dup2(stderr) failed"));
        return -1;
    }

    return 0;
}

/**
170 171
 * lxcContainerSendContinue:
 * @monitor: control FD to child
172
 *
173 174
 * Sends the continue message via the socket pair stored in the vm
 * structure.
175 176 177
 *
 * Returns 0 on success or -1 in case of error
 */
178
int lxcContainerSendContinue(int control)
179 180
{
    int rc = -1;
181 182
    lxc_message_t msg = LXC_CONTINUE_MSG;
    int writeCount = 0;
183

184 185
    writeCount = safewrite(control, &msg, sizeof(msg));
    if (writeCount != sizeof(msg)) {
186 187
        virReportSystemError(NULL, errno, "%s",
                             _("unable to send container continue message"));
188
        goto error_out;
189 190
    }

191
    rc = 0;
192

193 194
error_out:
    return rc;
195 196
}

197
/**
198 199
 * lxcContainerWaitForContinue:
 * @control: control FD from parent
200 201 202 203 204 205 206
 *
 * This function will wait for the container continue message from the
 * parent process.  It will send this message on the socket pair stored in
 * the vm structure once it has completed the post clone container setup.
 *
 * Returns 0 on success or -1 in case of error
 */
207
static int lxcContainerWaitForContinue(int control)
208 209 210 211
{
    lxc_message_t msg;
    int readLen;

212
    readLen = saferead(control, &msg, sizeof(msg));
213 214
    if (readLen != sizeof(msg) ||
        msg != LXC_CONTINUE_MSG) {
215 216
        virReportSystemError(NULL, errno, "%s",
                             _("Failed to read the container continue message"));
217
        return -1;
218
    }
219
    close(control);
220 221 222

    DEBUG0("Received container continue message");

223
    return 0;
224 225
}

226

227 228 229 230 231 232 233 234
/**
 * lxcContainerEnableInterfaces:
 * @nveths: number of entries in @veths
 * @veths: names of the interfaces to bring up
 *
 * Brings up each listed interface, stopping at the first failure, and
 * then brings up "lo" when @veths is non-NULL.
 *
 * Returns 0 on success or nonzero in case of error
 */
static int lxcContainerEnableInterfaces(unsigned int nveths,
                                        char **veths)
{
    unsigned int idx;

    for (idx = 0; idx < nveths; idx++) {
        int status;

        DEBUG("Enabling %s", veths[idx]);
        status = vethInterfaceUpOrDown(veths[idx], 1);
        if (status != 0)
            return status;
    }

    /* enable lo device only if there were other net devices */
    if (!veths)
        return 0;

    return vethInterfaceUpOrDown("lo", 1);
}

257 258 259 260 261 262 263 264 265 266 267 268 269 270

//_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)
extern int pivot_root(const char * new_root,const char * put_old);

/*
 * qsort() comparator over an array of C strings (mount points).
 * The operands are deliberately swapped so the array ends up in
 * descending strcmp() order: deeper paths sort before their parents,
 * which is the order needed to unmount children first.
 */
static int lxcContainerChildMountSort(const void *a, const void *b)
{
    const char *lhs = *(const char *const *)a;
    const char *rhs = *(const char *const *)b;

    return strcmp(rhs, lhs);
}

271 272 273 274 275 276 277 278 279 280 281 282 283
/* Fallback values for mount/umount flags that the included headers
 * may not provide; each is defined only when still missing. */
#if !defined(MS_REC)
#define MS_REC          (1<<14)
#endif

#if !defined(MNT_DETACH)
#define MNT_DETACH      2
#endif

#if !defined(MS_PRIVATE)
#define MS_PRIVATE      (1<<18)
#endif

#if !defined(MS_SLAVE)
#define MS_SLAVE        (1<<19)
#endif

287 288
/**
 * lxcContainerPivotRoot:
 * @root: filesystem definition whose src directory becomes the new root
 *
 * Performs, in order:
 *  - recursively marks / as a private mount
 *  - creates <root->src>/.oldroot and mounts a tmpfs on it
 *  - creates <oldroot>/new and recursively bind-mounts root->src onto it
 *  - changes directory into it, calls pivot_root(".", ".oldroot") and
 *    finally changes directory to /
 *
 * Mounts made before a failure are not undone.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerPivotRoot(virDomainFSDefPtr root)
{
    int rc, ret;
    char *oldroot = NULL, *newroot = NULL;

    ret = -1;

    /* root->parent must be private, so make / private. */
    if (mount("", "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("failed to make root private"));
        goto err;
    }

    if (virAsprintf(&oldroot, "%s/.oldroot", root->src) < 0) {
        virReportOOMError(NULL);
        goto err;
    }

    /* NOTE(review): rc is handed to virReportSystemError as the errno
     * value although the failure test is "< 0" - confirm the return
     * convention of virFileMakePath */
    if ((rc = virFileMakePath(oldroot)) < 0) {
        virReportSystemError(NULL, rc,
                             _("failed to create %s"),
                             oldroot);
        goto err;
    }

    /* Create a tmpfs root since old and new roots must be
     * on separate filesystems */
    if (mount("tmprootfs", oldroot, "tmpfs", 0, NULL) < 0) {
        virReportSystemError(NULL, errno,
                             _("failed to mount empty tmpfs at %s"),
                             oldroot);
        goto err;
    }

    /* Create a directory called 'new' in tmpfs */
    if (virAsprintf(&newroot, "%s/new", oldroot) < 0) {
        virReportOOMError(NULL);
        goto err;
    }

    if ((rc = virFileMakePath(newroot)) < 0) {
        virReportSystemError(NULL, rc,
                             _("failed to create %s"),
                             newroot);
        goto err;
    }

    /* ... and mount our root onto it */
    if (mount(root->src, newroot, NULL, MS_BIND|MS_REC, NULL) < 0) {
        virReportSystemError(NULL, errno,
                             _("failed to bind new root %s into tmpfs"),
                             root->src);
        goto err;
    }

    /* Now we change directory into the root->src bind-mounted
     * onto '/new' inside the tmpfs, then pivot into it */
    if (chdir(newroot) < 0) {
        virReportSystemError(NULL, errno,
                             _("failed to chroot into %s"), newroot);
        goto err;
    }

    /* The old root directory will live at /.oldroot after
     * this and will soon be unmounted completely */
    if (pivot_root(".", ".oldroot") < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("failed to pivot root"));
        goto err;
    }

    /* CWD is undefined after pivot_root, so go to / */
    if (chdir("/") < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("failed to chdir to / after pivot root"));
        goto err;
    }

    ret = 0;

err:
    VIR_FREE(oldroot);
    VIR_FREE(newroot);

    return ret;
}

372 373

/**
 * lxcContainerMountBasicFS:
 * @root: root filesystem definition; its src is used to locate the
 *        devpts mount below /.oldroot
 *
 * Creates the mount points for and mounts /dev (tmpfs), /proc and /sys
 * (plus /selinux when built WITH_SELINUX), then creates /dev/pts and
 * moves the mount found at /.oldroot<root->src>/dev/pts onto it.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerMountBasicFS(virDomainFSDefPtr root)
{
    const struct {
        const char *src;
        const char *dst;
        const char *type;
    } mnts[] = {
        { "/dev", "/dev", "tmpfs" },
        { "/proc", "/proc", "proc" },
        { "/sys", "/sys", "sysfs" },
#if WITH_SELINUX
        { "none", "/selinux", "selinuxfs" },
#endif
    };
    int i, rc = -1;
    char *devpts;

    if (virAsprintf(&devpts, "/.oldroot%s/dev/pts", root->src) < 0) {
        virReportOOMError(NULL);
        return rc;
    }

    for (i = 0 ; i < ARRAY_CARDINALITY(mnts) ; i++) {
        /* NOTE(review): failure is detected as "< 0" and reported via
         * errno - confirm this matches virFileMakePath's convention */
        if (virFileMakePath(mnts[i].dst) < 0) {
            /* report the directory that could not be created */
            virReportSystemError(NULL, errno,
                                 _("failed to mkdir %s"),
                                 mnts[i].dst);
            goto cleanup;
        }
        if (mount(mnts[i].src, mnts[i].dst, mnts[i].type, 0, NULL) < 0) {
            virReportSystemError(NULL, errno,
                                 _("failed to mount %s on %s"),
                                 mnts[i].type, mnts[i].dst);
            goto cleanup;
        }
    }

    /* rc stays -1 until everything succeeded, so every failure path
     * returns a negative value to the caller */
    if (virFileMakePath("/dev/pts") < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("cannot create /dev/pts"));
        goto cleanup;
    }

    VIR_DEBUG("Trying to move %s to %s", devpts, "/dev/pts");
    if (mount(devpts, "/dev/pts", NULL, MS_MOVE, NULL) < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("failed to mount /dev/pts in container"));
        goto cleanup;
    }

    rc = 0;

 cleanup:
    VIR_FREE(devpts);

    return rc;
}

static int lxcContainerPopulateDevices(void)
{
    int i;
    const struct {
        int maj;
        int min;
        mode_t mode;
        const char *path;
    } devs[] = {
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" },
        { LXC_DEV_MAJ_TTY, LXC_DEV_MIN_CONSOLE, 0600, "/dev/console" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" },
    };
447 448 449 450

    /* Populate /dev/ with a few important bits */
    for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) {
        dev_t dev = makedev(devs[i].maj, devs[i].min);
451
        if (mknod(devs[i].path, S_IFCHR, dev) < 0 ||
452
            chmod(devs[i].path, devs[i].mode)) {
453 454 455
            virReportSystemError(NULL, errno,
                                 _("failed to make device %s"),
                                 devs[i].path);
456 457 458 459
            return -1;
        }
    }

460 461 462 463 464 465 466 467
    if (access("/dev/pts/ptmx", W_OK) == 0) {
        if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
            virReportSystemError(NULL, errno, "%s",
                                 _("failed to create symlink /dev/ptmx to /dev/pts/ptmx"));
            return -1;
        }
    } else {
        dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX);
468
        if (mknod("/dev/ptmx", S_IFCHR, dev) < 0 ||
469 470 471 472 473 474 475 476
            chmod("/dev/ptmx", 0666)) {
            virReportSystemError(NULL, errno, "%s",
                                 _("failed to make device /dev/ptmx"));
            return -1;
        }
    }


477 478 479 480 481 482
    return 0;
}


/**
 * lxcContainerMountNewFS:
 * @vmDef: container definition listing the filesystems to map in
 *
 * For every filesystem entry of type VIR_DOMAIN_FS_TYPE_MOUNT whose
 * destination is not "/", creates the destination directory and
 * bind-mounts "/.oldroot/<src>" onto it.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerMountNewFS(virDomainDefPtr vmDef)
{
    int i;

    /* Pull in rest of container's mounts */
    for (i = 0 ; i < vmDef->nfss ; i++) {
        char *src;

        if (STREQ(vmDef->fss[i]->dst, "/"))
            continue;
        /* XXX fix: other filesystem types are skipped */
        if (vmDef->fss[i]->type != VIR_DOMAIN_FS_TYPE_MOUNT)
            continue;

        if (virAsprintf(&src, "/.oldroot/%s", vmDef->fss[i]->src) < 0) {
            virReportOOMError(NULL);
            return -1;
        }

        if (virFileMakePath(vmDef->fss[i]->dst) < 0) {
            virReportSystemError(NULL, errno,
                                 _("failed to create %s"),
                                 vmDef->fss[i]->dst);
            VIR_FREE(src);
            return -1;
        }
        if (mount(src, vmDef->fss[i]->dst, NULL, MS_BIND, NULL) < 0) {
            /* report first: freeing src beforehand could overwrite
             * the errno left by mount() */
            virReportSystemError(NULL, errno,
                                 _("failed to mount %s at %s"),
                                 vmDef->fss[i]->src,
                                 vmDef->fss[i]->dst);
            VIR_FREE(src);
            return -1;
        }
        VIR_FREE(src);
    }

    return 0;
}


static int lxcContainerUnmountOldFS(void)
{
523
    struct mntent mntent;
524 525 526 527
    char **mounts = NULL;
    int nmounts = 0;
    FILE *procmnt;
    int i;
528
    char mntbuf[1024];
529 530

    if (!(procmnt = setmntent("/proc/mounts", "r"))) {
531 532
        virReportSystemError(NULL, errno, "%s",
                             _("failed to read /proc/mounts"));
533 534
        return -1;
    }
535
    while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) {
536
        VIR_DEBUG("Got %s", mntent.mnt_dir);
537
        if (!STRPREFIX(mntent.mnt_dir, "/.oldroot"))
538 539 540 541
            continue;

        if (VIR_REALLOC_N(mounts, nmounts+1) < 0) {
            endmntent(procmnt);
542
            virReportOOMError(NULL);
543 544
            return -1;
        }
545
        if (!(mounts[nmounts++] = strdup(mntent.mnt_dir))) {
546
            endmntent(procmnt);
547
            virReportOOMError(NULL);
548 549 550 551 552
            return -1;
        }
    }
    endmntent(procmnt);

553 554 555
    if (mounts)
        qsort(mounts, nmounts, sizeof(mounts[0]),
              lxcContainerChildMountSort);
556 557

    for (i = 0 ; i < nmounts ; i++) {
558
        VIR_DEBUG("Umount %s", mounts[i]);
559
        if (umount(mounts[i]) < 0) {
560 561 562
            virReportSystemError(NULL, errno,
                                 _("failed to unmount '%s'"),
                                 mounts[i]);
563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579
            return -1;
        }
        VIR_FREE(mounts[i]);
    }
    VIR_FREE(mounts);

    return 0;
}


/* Got a FS mapped to /, we're going the pivot_root
 * approach to do a better-chroot-than-chroot
 * this is based on this thread http://lkml.org/lkml/2008/3/5/29
 *
 * The steps run strictly in the order listed and the first failing
 * step ends the sequence with -1; 0 is returned when all succeed.
 */
static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef,
                                      virDomainFSDefPtr root)
{
    if (/* Gives us a private root, leaving all parent OS mounts on /.oldroot */
        lxcContainerPivotRoot(root) < 0 ||
        /* Mounts the core /proc, /sys, /dev, /dev/pts filesystems */
        lxcContainerMountBasicFS(root) < 0 ||
        /* Populates device nodes in /dev/ */
        lxcContainerPopulateDevices() < 0 ||
        /* Sets up any non-root mounts from guest config */
        lxcContainerMountNewFS(vmDef) < 0 ||
        /* Gets rid of all remaining mounts from host OS, including /.oldroot itself */
        lxcContainerUnmountOldFS() < 0)
        return -1;

    return 0;
}

/* Nothing mapped to /, we're using the main root,
 * but with extra stuff mapped in.
 *
 * Recursively marks / as a slave mount, bind-mounts every
 * VIR_DOMAIN_FS_TYPE_MOUNT entry from src to dst and then mounts a
 * fresh proc on /proc.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef)
{
    int idx;

    if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("failed to make / slave"));
        return -1;
    }

    for (idx = 0; idx < vmDef->nfss; idx++) {
        /* XXX fix to support other mount types */
        if (vmDef->fss[idx]->type == VIR_DOMAIN_FS_TYPE_MOUNT &&
            mount(vmDef->fss[idx]->src, vmDef->fss[idx]->dst,
                  NULL, MS_BIND, NULL) < 0) {
            virReportSystemError(NULL, errno,
                                 _("failed to mount %s at %s"),
                                 vmDef->fss[idx]->src,
                                 vmDef->fss[idx]->dst);
            return -1;
        }
    }

    /* mount /proc */
    if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) {
        virReportSystemError(NULL, errno, "%s",
                             _("failed to mount /proc"));
        return -1;
    }

    return 0;
}

642 643
/*
 * Chooses the mount setup strategy: the pivot_root based setup when a
 * root filesystem is given, otherwise extra mounts on top of the
 * existing root. Returns the result of the chosen setup function.
 */
static int lxcContainerSetupMounts(virDomainDefPtr vmDef,
                                   virDomainFSDefPtr root)
{
    return root ? lxcContainerSetupPivotRoot(vmDef, root)
                : lxcContainerSetupExtraMounts(vmDef);
}

D
Daniel P. Berrange 已提交
651 652 653 654 655 656 657

/*
 * This is running as the 'init' process inside the container.
 * It removes some capabilities that could be dangerous to
 * host system, since they are not currently "containerized".
 *
 * With libcap-ng the listed capabilities are dropped from every set,
 * the result is applied and then locked. Without libcap-ng only a
 * warning is logged and 0 is still returned.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerDropCapabilities(void)
{
#if HAVE_CAPNG
    int status;

    capng_get_caps_process();

    status = capng_updatev(CAPNG_DROP,
                           CAPNG_EFFECTIVE | CAPNG_PERMITTED |
                           CAPNG_INHERITABLE | CAPNG_BOUNDING_SET,
                           CAP_SYS_BOOT, /* No use of reboot */
                           CAP_SYS_MODULE, /* No kernel module loading */
                           CAP_SYS_TIME, /* No changing the clock */
                           CAP_AUDIT_CONTROL, /* No messing with auditing status */
                           CAP_MAC_ADMIN, /* No messing with LSM config */
                           -1 /* sentinel */);
    if (status < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to remove capabilities %d"), status);
        return -1;
    }

    status = capng_apply(CAPNG_SELECT_BOTH);
    if (status < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to apply capabilities: %d"), status);
        return -1;
    }

    /* Need to prevent them regaining any caps on exec */
    status = capng_lock();
    if (status < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to lock capabilities: %d"), status);
        return -1;
    }

#else
    VIR_WARN0(_("libcap-ng support not compiled in, unable to clear capabilities"));
#endif
    return 0;
}


698 699 700 701 702 703 704 705 706 707 708 709
/**
 * lxcChild:
 * @argv: Pointer to container arguments
 *
 * This function is run in the process clone()'d in lxcStartContainer.
 * Perform a number of container setup tasks:
 *     Setup container file system
 *     mount container /proca
 * Then exec's the container init
 *
 * Returns 0 on success or -1 in case of error
 */
710
static int lxcContainerChild( void *data )
711
{
712
    lxc_child_argv_t *argv = data;
713
    virDomainDefPtr vmDef = argv->config;
714
    int ttyfd;
715 716
    char *ttyPath;
    virDomainFSDefPtr root;
717 718 719

    if (NULL == vmDef) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
J
Jim Meyering 已提交
720
                 "%s", _("lxcChild() passed invalid vm definition"));
721
        return -1;
722 723
    }

724
    root = virDomainGetRootFilesystem(vmDef);
725

726 727 728 729 730 731 732 733 734 735 736 737 738
    if (root) {
        if (virAsprintf(&ttyPath, "%s%s", root->src, argv->ttyPath) < 0) {
            virReportOOMError(NULL);
            return -1;
        }
    } else {
        if (!(ttyPath = strdup(argv->ttyPath))) {
            virReportOOMError(NULL);
            return -1;
        }
    }

    ttyfd = open(ttyPath, O_RDWR|O_NOCTTY);
739
    if (ttyfd < 0) {
740
        virReportSystemError(NULL, errno,
741 742
                             _("failed to open tty %s"),
                             ttyPath);
743
        return -1;
744
    }
745
    VIR_FREE(ttyPath);
746

747 748
    if (lxcContainerSetStdio(argv->monitor, ttyfd) < 0) {
        close(ttyfd);
749
        return -1;
750 751
    }
    close(ttyfd);
752

753 754 755
    if (lxcContainerSetupMounts(vmDef, root) < 0)
        return -1;

756
    /* Wait for interface devices to show up */
757 758
    if (lxcContainerWaitForContinue(argv->monitor) < 0)
        return -1;
759 760

    /* enable interfaces */
761
    if (lxcContainerEnableInterfaces(argv->nveths, argv->veths) < 0)
762
        return -1;
763

764
    /* drop a set of root capabilities */
D
Daniel P. Berrange 已提交
765
    if (lxcContainerDropCapabilities() < 0)
766 767
        return -1;

768
    /* this function will only return if an error occured */
769 770
    return lxcContainerExecInit(vmDef);
}
771

772 773 774 775 776
/* Returns 1 when lxcContainerAvailable() reports success (0) for the
 * user namespace feature, 0 otherwise. */
static int userns_supported(void)
{
    int probe = lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER);

    return probe == 0;
}

777 778 779 780 781 782 783 784 785
/**
 * lxcContainerStart:
 * @driver: pointer to driver structure
 * @vm: pointer to virtual machine structure
 *
 * Starts a container process by calling clone() with the namespace flags
 *
 * Returns PID of container on success or -1 in case of error
 */
786
int lxcContainerStart(virDomainDefPtr def,
787 788
                      unsigned int nveths,
                      char **veths,
789 790 791 792 793 794 795
                      int control,
                      char *ttyPath)
{
    pid_t pid;
    int flags;
    int stacksize = getpagesize() * 4;
    char *stack, *stacktop;
796
    lxc_child_argv_t args = { def, nveths, veths, control, ttyPath };
797 798 799

    /* allocate a stack for the container */
    if (VIR_ALLOC_N(stack, stacksize) < 0) {
800
        virReportOOMError(NULL);
801 802 803 804
        return -1;
    }
    stacktop = stack + stacksize;

805 806 807 808
    flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC|SIGCHLD;

    if (userns_supported())
        flags |= CLONE_NEWUSER;
809 810 811 812 813 814 815 816 817

    if (def->nets != NULL)
        flags |= CLONE_NEWNET;

    pid = clone(lxcContainerChild, stacktop, flags, &args);
    VIR_FREE(stack);
    DEBUG("clone() returned, %d", pid);

    if (pid < 0) {
818 819
        virReportSystemError(NULL, errno, "%s",
                             _("failed to run clone container"));
820 821 822 823 824 825 826 827 828 829 830 831 832
        return -1;
    }

    return pid;
}

/* Child entry point used by lxcContainerAvailable() to probe clone():
 * terminates immediately with exit status 0 and never returns. */
static int lxcContainerDummyChild(void *argv ATTRIBUTE_UNUSED)
{
    _exit(EXIT_SUCCESS);
}

int lxcContainerAvailable(int features)
{
833
    int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|
834 835 836 837 838 839
        CLONE_NEWIPC|SIGCHLD;
    int cpid;
    char *childStack;
    char *stack;
    int childStatus;

840 841 842
    if (features & LXC_CONTAINER_FEATURE_USER)
        flags |= CLONE_NEWUSER;

843 844 845 846 847 848 849 850 851 852 853 854 855
    if (features & LXC_CONTAINER_FEATURE_NET)
        flags |= CLONE_NEWNET;

    if (VIR_ALLOC_N(stack, getpagesize() * 4) < 0) {
        DEBUG0("Unable to allocate stack");
        return -1;
    }

    childStack = stack + (getpagesize() * 4);

    cpid = clone(lxcContainerDummyChild, childStack, flags, NULL);
    VIR_FREE(stack);
    if (cpid < 0) {
856
        char ebuf[1024];
857
        DEBUG("clone call returned %s, container support is not enabled",
858
              virStrerror(errno, ebuf, sizeof ebuf));
859 860 861 862 863 864
        return -1;
    } else {
        waitpid(cpid, &childStatus, 0);
    }

    return 0;
865
}