/*
 * Copyright (C) 2008-2010 Red Hat, Inc.
 * Copyright (C) 2008 IBM Corp.
 *
 * lxc_container.c: setup and launch of LXC container processes
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *  Daniel P. Berrange <berrange@redhat.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
31
#include <stdio.h>
32 33
#include <sys/ioctl.h>
#include <sys/mount.h>
34
#include <sys/wait.h>
35
#include <sys/stat.h>
36
#include <unistd.h>
37 38 39 40 41 42 43
#include <mntent.h>

/* Yes, we want linux private one, for _syscall2() macro */
#include <linux/unistd.h>

/* For MS_MOVE */
#include <linux/fs.h>
44

D
Daniel P. Berrange 已提交
45
#if HAVE_CAPNG
46
# include <cap-ng.h>
D
Daniel P. Berrange 已提交
47
#endif
48

49
#include "virterror_internal.h"
50
#include "logging.h"
51 52
#include "lxc_container.h"
#include "util.h"
53
#include "memory.h"
54
#include "veth.h"
55
#include "uuid.h"
56
#include "files.h"
57
#include "command.h"
58

59 60
#define VIR_FROM_THIS VIR_FROM_LXC

61 62 63 64 65 66
/*
 * The glibc headers can lag behind the kernel, so provide fallback
 * definitions of the namespace clone() flags when the system
 * headers do not supply them.
 */

#ifndef CLONE_NEWPID
# define CLONE_NEWPID  0x20000000 /* New pid namespace */
#endif
#ifndef CLONE_NEWUTS
# define CLONE_NEWUTS  0x04000000 /* New utsname namespace */
#endif
#ifndef CLONE_NEWUSER
# define CLONE_NEWUSER 0x10000000 /* New user namespace */
#endif
#ifndef CLONE_NEWIPC
# define CLONE_NEWIPC  0x08000000 /* New ipc namespace */
#endif
#ifndef CLONE_NEWNET
# define CLONE_NEWNET  0x40000000 /* New network namespace */
#endif

/* messages between parent and container */
typedef char lxc_message_t;
#define LXC_CONTINUE_MSG 'c'

typedef struct __lxc_child_argv lxc_child_argv_t;
struct __lxc_child_argv {
88
    virDomainDefPtr config;
89 90
    unsigned int nveths;
    char **veths;
91 92
    int monitor;
    char *ttyPath;
93
    int handshakefd;
94 95 96
};


97
/**
98
 * lxcContainerBuildInitCmd:
99
 * @vmDef: pointer to vm definition structure
100
 *
101
 * Build a virCommandPtr for launching the container 'init' process
102
 *
103
 * Returns a virCommandPtr
104
 */
105
static virCommandPtr lxcContainerBuildInitCmd(virDomainDefPtr vmDef)
106
{
107
    char uuidstr[VIR_UUID_STRING_BUFLEN];
108
    virCommandPtr cmd;
109 110 111

    virUUIDFormat(vmDef->uuid, uuidstr);

112 113 114 115 116 117 118
    cmd = virCommandNew(vmDef->os.init);

    virCommandAddEnvString(cmd, "PATH=/bin:/sbin");
    virCommandAddEnvString(cmd, "TERM=linux");
    virCommandAddEnvPair(cmd, "LIBVIRT_LXC_UUID", uuidstr);
    virCommandAddEnvPair(cmd, "LIBVIRT_LXC_NAME", vmDef->name);

119
    return cmd;
120 121 122
}

/**
 * lxcContainerSetStdio:
 * @control: control FD from parent
 * @ttyfd: FD of tty to set as the container console
 * @handshakefd: handshake FD, kept open across the FD sweep
 *
 * Starts a new session, makes the given tty the controlling terminal,
 * closes every FD other than @control, @ttyfd and @handshakefd, and
 * finally duplicates @ttyfd onto stdin, stdout and stderr.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetStdio(int control, int ttyfd, int handshakefd)
{
    int open_max, i;

    if (setsid() < 0) {
        virReportSystemError(errno, "%s",
                             _("setsid failed"));
        return -1;
    }

    if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("ioctl(TIOCSCTTY) failed"));
        return -1;
    }

    /* Just in case someone forgot to set FD_CLOEXEC, explicitly
     * close all FDs before executing the container */
    open_max = sysconf (_SC_OPEN_MAX);
    for (i = 0; i < open_max; i++)
        if (i != ttyfd && i != control && i != handshakefd) {
            /* VIR_FORCE_CLOSE needs an lvalue it may overwrite,
             * so operate on a copy of the loop counter */
            int tmpfd = i;
            VIR_FORCE_CLOSE(tmpfd);
        }

    if (dup2(ttyfd, 0) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stdin) failed"));
        return -1;
    }

    if (dup2(ttyfd, 1) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stdout) failed"));
        return -1;
    }

    if (dup2(ttyfd, 2) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stderr) failed"));
        return -1;
    }

    return 0;
}

/**
183
 * lxcContainerSendContinue:
184
 * @control: control FD to child
185
 *
186 187
 * Sends the continue message via the socket pair stored in the vm
 * structure.
188 189 190
 *
 * Returns 0 on success or -1 in case of error
 */
191
int lxcContainerSendContinue(int control)
192 193
{
    int rc = -1;
194 195
    lxc_message_t msg = LXC_CONTINUE_MSG;
    int writeCount = 0;
196

197 198 199
    writeCount = safewrite(control, &msg, sizeof(msg));
    if (writeCount != sizeof(msg)) {
        goto error_out;
200 201
    }

202 203 204
    rc = 0;
error_out:
    return rc;
205 206
}

207
/**
208
 * lxcContainerWaitForContinue:
209
 * @control: Control FD from parent
210 211 212 213 214 215 216
 *
 * This function will wait for the container continue message from the
 * parent process.  It will send this message on the socket pair stored in
 * the vm structure once it has completed the post clone container setup.
 *
 * Returns 0 on success or -1 in case of error
 */
217
int lxcContainerWaitForContinue(int control)
218 219 220 221
{
    lxc_message_t msg;
    int readLen;

222
    readLen = saferead(control, &msg, sizeof(msg));
223 224 225
    if (readLen != sizeof(msg) ||
        msg != LXC_CONTINUE_MSG) {
        return -1;
226 227
    }

228
    return 0;
229 230
}

231

232
/**
233
 * lxcContainerRenameAndEnableInterfaces:
234 235
 * @nveths: number of interfaces
 * @veths: interface names
236
 *
237 238 239
 * This function will rename the interfaces to ethN
 * with id ascending order from zero and enable the
 * renamed interfaces for this container.
240 241 242
 *
 * Returns 0 on success or nonzero in case of error
 */
243 244
static int lxcContainerRenameAndEnableInterfaces(unsigned int nveths,
                                                 char **veths)
245 246
{
    int rc = 0;
247
    unsigned int i;
248
    char *newname = NULL;
249

250
    for (i = 0 ; i < nveths ; i++) {
251 252 253
        if (virAsprintf(&newname, "eth%d", i) < 0) {
            virReportOOMError();
            rc = -1;
254
            goto error_out;
255
        }
256

257
        VIR_DEBUG("Renaming %s to %s", veths[i], newname);
258
        rc = setInterfaceName(veths[i], newname);
259
        if (rc < 0)
260 261
            goto error_out;

262
        VIR_DEBUG("Enabling %s", newname);
263
        rc = vethInterfaceUpOrDown(newname, 1);
264
        if (rc < 0)
265
            goto error_out;
266

267
        VIR_FREE(newname);
268 269 270
    }

    /* enable lo device only if there were other net devices */
271
    if (veths)
272 273 274
        rc = vethInterfaceUpOrDown("lo", 1);

error_out:
275
    VIR_FREE(newname);
276 277 278
    return rc;
}

279

280
/*_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)*/
281 282 283 284 285 286 287 288 289 290 291 292
extern int pivot_root(const char * new_root,const char * put_old);

/*
 * qsort() comparator for an array of mount point strings.
 * Orders entries in descending strcmp() order so that nested
 * mount points come before the directories containing them.
 */
static int lxcContainerChildMountSort(const void *a, const void *b)
{
    const char *lhs = *(const char * const *)a;
    const char *rhs = *(const char * const *)b;

    /* Operands swapped on purpose: the deepest children must be
     * unmounted first */
    return strcmp(rhs, lhs);
}

293
/* Fallback definitions of mount/umount flags for system headers
 * that do not provide them */
#ifndef MS_REC
# define MS_REC          16384
#endif

#ifndef MNT_DETACH
# define MNT_DETACH      0x00000002
#endif

#ifndef MS_PRIVATE
# define MS_PRIVATE              (1<<18)
#endif

#ifndef MS_SLAVE
# define MS_SLAVE                (1<<19)
#endif

309 310
/*
 * lxcContainerPivotRoot:
 * @root: filesystem definition; root->src is the directory that
 *        becomes the new '/'
 *
 * Makes '/' recursively private, mounts a tmpfs at
 * <root->src>/.oldroot, bind-mounts root->src onto a 'new'
 * directory inside that tmpfs and pivots into it, leaving the
 * previous root visible at /.oldroot.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerPivotRoot(virDomainFSDefPtr root)
{
    int rc, ret;
    char *oldroot = NULL, *newroot = NULL;

    ret = -1;

    /* root->parent must be private, so make / private. */
    if (mount("", "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to make root private"));
        goto err;
    }

    if (virAsprintf(&oldroot, "%s/.oldroot", root->src) < 0) {
        virReportOOMError();
        goto err;
    }

    /* virFileMakePath returns an errno-style code, not -1 */
    if ((rc = virFileMakePath(oldroot)) != 0) {
        virReportSystemError(rc,
                             _("Failed to create %s"),
                             oldroot);
        goto err;
    }

    /* Create a tmpfs root since old and new roots must be
     * on separate filesystems */
    if (mount("tmprootfs", oldroot, "tmpfs", 0, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount empty tmpfs at %s"),
                             oldroot);
        goto err;
    }

    /* Create a directory called 'new' in tmpfs */
    if (virAsprintf(&newroot, "%s/new", oldroot) < 0) {
        virReportOOMError();
        goto err;
    }

    if ((rc = virFileMakePath(newroot)) != 0) {
        virReportSystemError(rc,
                             _("Failed to create %s"),
                             newroot);
        goto err;
    }

    /* ... and mount our root onto it */
    if (mount(root->src, newroot, NULL, MS_BIND|MS_REC, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to bind new root %s into tmpfs"),
                             root->src);
        goto err;
    }

    /* Now we change directory into the root->src bind-mounted
     * onto '/new' in the tmpfs, then pivot into it */
    if (chdir(newroot) < 0) {
        virReportSystemError(errno,
                             _("Failed to chdir into %s"), newroot);
        goto err;
    }

    /* The old root directory will live at /.oldroot after
     * this and will soon be unmounted completely */
    if (pivot_root(".", ".oldroot") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to pivot root"));
        goto err;
    }

    /* CWD is undefined after pivot_root, so go to / */
    if (chdir("/") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to chdir into /"));
        goto err;
    }

    ret = 0;

err:
    VIR_FREE(oldroot);
    VIR_FREE(newroot);

    return ret;
}

394 395

/*
 * lxcContainerMountBasicFS:
 * @root: filesystem definition of the container root; root->src is
 *        used to locate the devpts instance under /.oldroot
 *
 * Mounts the basic filesystems listed in the table below (/dev,
 * /proc, /sys and, when built with SELinux support, /selinux),
 * then moves /.oldroot<root->src>/dev/pts onto /dev/pts.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerMountBasicFS(virDomainFSDefPtr root)
{
    const struct {
        const char *src;
        const char *dst;
        const char *type;
    } mnts[] = {
        { "/dev", "/dev", "tmpfs" },
        { "/proc", "/proc", "proc" },
        { "/sys", "/sys", "sysfs" },
#if WITH_SELINUX
        { "none", "/selinux", "selinuxfs" },
#endif
    };
    int i, rc, ret = -1;
    char *devpts;

    if (virAsprintf(&devpts, "/.oldroot%s/dev/pts", root->src) < 0) {
        virReportOOMError();
        return -1;
    }

    for (i = 0 ; i < ARRAY_CARDINALITY(mnts) ; i++) {
        /* virFileMakePath returns an errno-style code; report that
         * code and the directory that was actually being created */
        if ((rc = virFileMakePath(mnts[i].dst)) != 0) {
            virReportSystemError(rc,
                                 _("Failed to mkdir %s"),
                                 mnts[i].dst);
            goto cleanup;
        }
        if (mount(mnts[i].src, mnts[i].dst, mnts[i].type, 0, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to mount %s on %s"),
                                 mnts[i].src, mnts[i].dst);
            goto cleanup;
        }
    }

    /* Assign first, compare second: the result of the comparison
     * must not end up in rc, or a failure would be returned as a
     * positive value that callers checking "< 0" take for success */
    if ((rc = virFileMakePath("/dev/pts")) != 0) {
        virReportSystemError(rc, "%s",
                             _("Cannot create /dev/pts"));
        goto cleanup;
    }

    VIR_DEBUG("Trying to move %s to %s", devpts, "/dev/pts");
    if (mount(devpts, "/dev/pts", NULL, MS_MOVE, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to mount /dev/pts in container"));
        goto cleanup;
    }

    ret = 0;

 cleanup:
    VIR_FREE(devpts);

    return ret;
}

static int lxcContainerPopulateDevices(void)
{
    int i;
    const struct {
        int maj;
        int min;
        mode_t mode;
        const char *path;
    } devs[] = {
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" },
    };
468 469 470 471

    /* Populate /dev/ with a few important bits */
    for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) {
        dev_t dev = makedev(devs[i].maj, devs[i].min);
472
        if (mknod(devs[i].path, S_IFCHR, dev) < 0 ||
473
            chmod(devs[i].path, devs[i].mode)) {
474
            virReportSystemError(errno,
475
                                 _("Failed to make device %s"),
476
                                 devs[i].path);
477 478 479 480
            return -1;
        }
    }

481 482
    if (access("/dev/pts/ptmx", W_OK) == 0) {
        if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
483
            virReportSystemError(errno, "%s",
484
                                 _("Failed to create symlink /dev/ptmx to /dev/pts/ptmx"));
485 486 487 488
            return -1;
        }
    } else {
        dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX);
489
        if (mknod("/dev/ptmx", S_IFCHR, dev) < 0 ||
490
            chmod("/dev/ptmx", 0666)) {
491
            virReportSystemError(errno, "%s",
492
                                 _("Failed to make device /dev/ptmx"));
493 494 495 496
            return -1;
        }
    }

497 498 499 500 501 502 503 504 505
    /* XXX we should allow multiple consoles per container
     * for tty2, tty3, etc, but the domain XML does not
     * handle this yet
     */
    if (symlink("/dev/pts/0", "/dev/tty1") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to symlink /dev/pts/0 to /dev/tty1"));
        return -1;
    }
506 507 508 509 510
    if (symlink("/dev/pts/0", "/dev/console") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to symlink /dev/pts/0 to /dev/console"));
        return -1;
    }
511

512 513 514 515 516 517
    return 0;
}


/*
 * lxcContainerMountNewFS:
 * @vmDef: pointer to vm definition structure
 *
 * Bind-mounts every filesystem of type VIR_DOMAIN_FS_TYPE_MOUNT in
 * @vmDef, other than the one targeting "/", from its source under
 * /.oldroot onto its destination, creating the destination
 * directory first.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerMountNewFS(virDomainDefPtr vmDef)
{
    int i;

    /* Pull in rest of container's mounts */
    for (i = 0 ; i < vmDef->nfss ; i++) {
        char *src;
        int rc;

        if (STREQ(vmDef->fss[i]->dst, "/"))
            continue;
        /* XXX fix */
        if (vmDef->fss[i]->type != VIR_DOMAIN_FS_TYPE_MOUNT)
            continue;

        if (virAsprintf(&src, "/.oldroot/%s", vmDef->fss[i]->src) < 0) {
            virReportOOMError();
            return -1;
        }

        /* virFileMakePath returns an errno-style code; report that
         * rather than whatever errno happens to hold */
        if ((rc = virFileMakePath(vmDef->fss[i]->dst)) != 0) {
            virReportSystemError(rc,
                                 _("Failed to create %s"),
                                 vmDef->fss[i]->dst);
            VIR_FREE(src);
            return -1;
        }
        if (mount(src, vmDef->fss[i]->dst, NULL, MS_BIND, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to mount %s at %s"),
                                 src, vmDef->fss[i]->dst);
            VIR_FREE(src);
            return -1;
        }
        VIR_FREE(src);
    }

    return 0;
}


static int lxcContainerUnmountOldFS(void)
{
557
    struct mntent mntent;
558 559 560 561
    char **mounts = NULL;
    int nmounts = 0;
    FILE *procmnt;
    int i;
562
    char mntbuf[1024];
563 564

    if (!(procmnt = setmntent("/proc/mounts", "r"))) {
565
        virReportSystemError(errno, "%s",
566
                             _("Failed to read /proc/mounts"));
567 568
        return -1;
    }
569
    while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) {
570
        VIR_DEBUG("Got %s", mntent.mnt_dir);
571
        if (!STRPREFIX(mntent.mnt_dir, "/.oldroot"))
572 573 574 575
            continue;

        if (VIR_REALLOC_N(mounts, nmounts+1) < 0) {
            endmntent(procmnt);
576
            virReportOOMError();
577 578
            return -1;
        }
579
        if (!(mounts[nmounts++] = strdup(mntent.mnt_dir))) {
580
            endmntent(procmnt);
581
            virReportOOMError();
582 583 584 585 586
            return -1;
        }
    }
    endmntent(procmnt);

587 588 589
    if (mounts)
        qsort(mounts, nmounts, sizeof(mounts[0]),
              lxcContainerChildMountSort);
590 591

    for (i = 0 ; i < nmounts ; i++) {
592
        VIR_DEBUG("Umount %s", mounts[i]);
593
        if (umount(mounts[i]) < 0) {
594
            virReportSystemError(errno,
595
                                 _("Failed to unmount '%s'"),
596
                                 mounts[i]);
597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613
            return -1;
        }
        VIR_FREE(mounts[i]);
    }
    VIR_FREE(mounts);

    return 0;
}


/* Got a FS mapped to /, we're going the pivot_root
 * approach to do a better-chroot-than-chroot
 * this is based on this thread http://lkml.org/lkml/2008/3/5/29
 *
 * The steps run strictly in this order and stop at the first failure:
 *   1. pivot into a private root, leaving all parent OS mounts
 *      on /.oldroot
 *   2. mount the core /proc, /sys, /dev, /dev/pts filesystems
 *   3. populate device nodes in /dev/
 *   4. set up any non-root mounts from the guest config
 *   5. get rid of all remaining mounts from the host OS,
 *      including /.oldroot itself
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef,
                                      virDomainFSDefPtr root)
{
    /* || short-circuits, so a failing step prevents the later ones */
    if (lxcContainerPivotRoot(root) < 0 ||
        lxcContainerMountBasicFS(root) < 0 ||
        lxcContainerPopulateDevices() < 0 ||
        lxcContainerMountNewFS(vmDef) < 0 ||
        lxcContainerUnmountOldFS() < 0)
        return -1;

    return 0;
}

/* Nothing mapped to /, we're using the main root,
 * but with extra stuff mapped in: '/' is made a recursive slave
 * mount, each VIR_DOMAIN_FS_TYPE_MOUNT filesystem is bind-mounted
 * at its destination, and a fresh /proc is mounted.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef)
{
    int idx;

    if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to make / slave"));
        return -1;
    }

    for (idx = 0 ; idx < vmDef->nfss ; idx++) {
        const char *from = vmDef->fss[idx]->src;
        const char *to = vmDef->fss[idx]->dst;

        /* XXX fix to support other mount types */
        if (vmDef->fss[idx]->type != VIR_DOMAIN_FS_TYPE_MOUNT)
            continue;

        if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to mount %s at %s"),
                                 from, to);
            return -1;
        }
    }

    /* mount /proc */
    if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to mount /proc"));
        return -1;
    }

    return 0;
}

676 677
/*
 * Chooses the mount setup strategy: a pivot_root based private
 * root when the domain has a root filesystem, otherwise the
 * existing root with the extra mounts mapped in.
 *
 * Returns the result of the chosen setup function
 */
static int lxcContainerSetupMounts(virDomainDefPtr vmDef,
                                   virDomainFSDefPtr root)
{
    return root ? lxcContainerSetupPivotRoot(vmDef, root)
                : lxcContainerSetupExtraMounts(vmDef);
}

D
Daniel P. Berrange 已提交
685 686 687 688 689 690 691

/*
 * This is running as the 'init' process inside the container.
 * It removes some capabilities that could be dangerous to
 * host system, since they are not currently "containerized"
 *
 * When built without libcap-ng nothing is dropped and only a
 * warning is logged.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerDropCapabilities(void)
{
#if HAVE_CAPNG
    int status;

    capng_get_caps_process();

    status = capng_updatev(CAPNG_DROP,
                           CAPNG_EFFECTIVE | CAPNG_PERMITTED |
                           CAPNG_INHERITABLE | CAPNG_BOUNDING_SET,
                           CAP_SYS_BOOT, /* No use of reboot */
                           CAP_SYS_MODULE, /* No kernel module loading */
                           CAP_SYS_TIME, /* No changing the clock */
                           CAP_AUDIT_CONTROL, /* No messing with auditing status */
                           CAP_MAC_ADMIN, /* No messing with LSM config */
                           -1 /* sentinel */);
    if (status < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Failed to remove capabilities: %d"), status);
        return -1;
    }

    status = capng_apply(CAPNG_SELECT_BOTH);
    if (status < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Failed to apply capabilities: %d"), status);
        return -1;
    }

    /* We do not need to call capng_lock() in this case. The bounding
     * set restriction will prevent them reacquiring sys_boot/module/time,
     * etc which is all that matters for the container. Once inside the
     * container it is fine for SECURE_NOROOT / SECURE_NO_SETUID_FIXUP to
     * be unmasked  - they can never escape the bounding set. */

#else
    VIR_WARN("libcap-ng support not compiled in, unable to clear capabilities");
#endif
    return 0;
}


731
/**
732 733
 * lxcContainerChild:
 * @data: pointer to container arguments
734 735 736 737 738 739 740 741 742
 *
 * This function is run in the process clone()'d in lxcStartContainer.
 * Perform a number of container setup tasks:
 *     Setup container file system
 *     mount container /proca
 * Then exec's the container init
 *
 * Returns 0 on success or -1 in case of error
 */
743
static int lxcContainerChild( void *data )
744
{
745
    lxc_child_argv_t *argv = data;
746
    virDomainDefPtr vmDef = argv->config;
747
    int ttyfd = -1;
748
    int ret = -1;
749
    char *ttyPath = NULL;
750
    virDomainFSDefPtr root;
751
    virCommandPtr cmd = NULL;
752 753

    if (NULL == vmDef) {
754
        lxcError(VIR_ERR_INTERNAL_ERROR,
J
Jim Meyering 已提交
755
                 "%s", _("lxcChild() passed invalid vm definition"));
756
        goto cleanup;
757 758
    }

759 760 761
    cmd = lxcContainerBuildInitCmd(vmDef);
    virCommandWriteArgLog(cmd, 1);

762
    root = virDomainGetRootFilesystem(vmDef);
763

764 765
    if (root) {
        if (virAsprintf(&ttyPath, "%s%s", root->src, argv->ttyPath) < 0) {
766
            virReportOOMError();
767
            goto cleanup;
768 769 770
        }
    } else {
        if (!(ttyPath = strdup(argv->ttyPath))) {
771
            virReportOOMError();
772
            goto cleanup;
773 774
        }
    }
775
    VIR_DEBUG("Container TTY path: %s", ttyPath);
776 777

    ttyfd = open(ttyPath, O_RDWR|O_NOCTTY);
778
    if (ttyfd < 0) {
779
        virReportSystemError(errno,
780
                             _("Failed to open tty %s"),
781
                             ttyPath);
782
        goto cleanup;
783
    }
784

785
    if (lxcContainerSetupMounts(vmDef, root) < 0)
786
        goto cleanup;
787

788 789 790 791 792 793 794
    if (!virFileExists(vmDef->os.init)) {
        virReportSystemError(errno,
                    _("cannot find init path '%s' relative to container root"),
                    vmDef->os.init);
        goto cleanup;
    }

795
    /* Wait for interface devices to show up */
796 797 798
    if (lxcContainerWaitForContinue(argv->monitor) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to read the container continue message"));
799
        goto cleanup;
800 801
    }
    VIR_DEBUG("Received container continue message");
802

803 804
    /* rename and enable interfaces */
    if (lxcContainerRenameAndEnableInterfaces(argv->nveths,
805
                                              argv->veths) < 0) {
806
        goto cleanup;
807
    }
808

809
    /* drop a set of root capabilities */
D
Daniel P. Berrange 已提交
810
    if (lxcContainerDropCapabilities() < 0)
811
        goto cleanup;
812

813 814 815 816 817 818 819
    if (lxcContainerSendContinue(argv->handshakefd) < 0) {
        virReportSystemError(errno, "%s",
                            _("failed to send continue signal to controller"));
        goto cleanup;
    }

    if (lxcContainerSetStdio(argv->monitor, ttyfd, argv->handshakefd) < 0) {
820 821
        goto cleanup;
    }
822

823
    ret = 0;
824
cleanup:
825 826
    VIR_FREE(ttyPath);
    VIR_FORCE_CLOSE(ttyfd);
827
    VIR_FORCE_CLOSE(argv->monitor);
828
    VIR_FORCE_CLOSE(argv->handshakefd);
829 830 831 832 833 834

    if (ret == 0) {
        /* this function will only return if an error occured */
        ret = virCommandExec(cmd);
    }

835 836
    virCommandFree(cmd);
    return ret;
837
}
838

839 840
/*
 * Reports whether user namespaces should be enabled for new
 * containers.  Currently hard-wired to 0; the probing code is
 * kept but compiled out.
 */
static int userns_supported(void)
{
#if 0
    return lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0;
#else
    /*
     * put off using userns until uid mapping is implemented
     */
    return 0;
#endif
}

851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871
/*
 * lxcContainerGetAlt32bitArch:
 * @arch: name of a 64bit architecture
 *
 * Returns the name of the 32bit architecture paired with @arch in
 * the table below, or NULL when @arch is not listed
 */
const char *lxcContainerGetAlt32bitArch(const char *arch)
{
    /* Any Linux 64bit arch which has a 32bit
     * personality available should be listed here */
    static const struct {
        const char *arch64;
        const char *arch32;
    } alts[] = {
        { "x86_64", "i686" },
        { "s390x", "s390" },
        { "ppc64", "ppc" },
        { "parisc64", "parisc" },
        { "sparc64", "sparc" },
        { "mips64", "mips" },
    };
    size_t n;

    for (n = 0 ; n < sizeof(alts) / sizeof(alts[0]) ; n++) {
        if (STREQ(arch, alts[n].arch64))
            return alts[n].arch32;
    }

    return NULL;
}


872 873
/**
 * lxcContainerStart:
874 875 876 877 878
 * @def: pointer to virtual machine structure
 * @nveths: number of interfaces
 * @veths: interface names
 * @control: control FD to the container
 * @ttyPath: path of tty to set as the container console
879 880 881 882 883
 *
 * Starts a container process by calling clone() with the namespace flags
 *
 * Returns PID of container on success or -1 in case of error
 */
884
int lxcContainerStart(virDomainDefPtr def,
885 886
                      unsigned int nveths,
                      char **veths,
887
                      int control,
888
                      int handshakefd,
889 890 891 892 893 894
                      char *ttyPath)
{
    pid_t pid;
    int flags;
    int stacksize = getpagesize() * 4;
    char *stack, *stacktop;
895 896
    lxc_child_argv_t args = { def, nveths, veths, control, ttyPath,
                              handshakefd};
897 898 899

    /* allocate a stack for the container */
    if (VIR_ALLOC_N(stack, stacksize) < 0) {
900
        virReportOOMError();
901 902 903 904
        return -1;
    }
    stacktop = stack + stacksize;

905 906
    flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC|SIGCHLD;

907
    if (userns_supported()) {
908
        VIR_DEBUG("Enable user namespaces");
909
        flags |= CLONE_NEWUSER;
910
    }
911

912
    if (def->nets != NULL) {
913
        VIR_DEBUG("Enable network namespaces");
914
        flags |= CLONE_NEWNET;
915
    }
916 917 918

    pid = clone(lxcContainerChild, stacktop, flags, &args);
    VIR_FREE(stack);
919
    VIR_DEBUG("clone() completed, new container PID is %d", pid);
920 921

    if (pid < 0) {
922
        virReportSystemError(errno, "%s",
923
                             _("Failed to run clone container"));
924 925 926 927 928 929
        return -1;
    }

    return pid;
}

930 931
/* clone() entry point used only for probing: terminates
 * immediately with a zero exit status */
ATTRIBUTE_NORETURN static int
lxcContainerDummyChild(void *argv ATTRIBUTE_UNUSED)
{
    _exit(EXIT_SUCCESS);
}

int lxcContainerAvailable(int features)
{
938
    int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|
939 940 941 942 943 944
        CLONE_NEWIPC|SIGCHLD;
    int cpid;
    char *childStack;
    char *stack;
    int childStatus;

945 946 947
    if (features & LXC_CONTAINER_FEATURE_USER)
        flags |= CLONE_NEWUSER;

948 949 950 951
    if (features & LXC_CONTAINER_FEATURE_NET)
        flags |= CLONE_NEWNET;

    if (VIR_ALLOC_N(stack, getpagesize() * 4) < 0) {
952
        VIR_DEBUG("Unable to allocate stack");
953 954 955 956 957 958 959 960
        return -1;
    }

    childStack = stack + (getpagesize() * 4);

    cpid = clone(lxcContainerDummyChild, childStack, flags, NULL);
    VIR_FREE(stack);
    if (cpid < 0) {
961
        char ebuf[1024];
962
        VIR_DEBUG("clone call returned %s, container support is not enabled",
963
              virStrerror(errno, ebuf, sizeof ebuf));
964 965 966 967 968 969
        return -1;
    } else {
        waitpid(cpid, &childStatus, 0);
    }

    return 0;
970
}