/*
 * Copyright IBM Corp. 2008
 * Copyright Red Hat 2008
 *
 * lxc_container.c: setup and startup of the LXC container process
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *  Daniel P. Berrange <berrange@redhat.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <unistd.h>
#include <mntent.h>

/* Yes, we want linux private one, for _syscall2() macro */
#include <linux/unistd.h>

/* For MS_MOVE */
#include <linux/fs.h>

#include "lxc_container.h"
#include "util.h"
#include "memory.h"
#include "veth.h"
48

49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
/*
 * The glibc headers lag behind the kernel, so the namespace-related
 * clone() flags used below are defined here whenever the system
 * headers do not already provide them.
 */

#ifndef CLONE_NEWPID
#define CLONE_NEWPID  0x20000000
#endif
#ifndef CLONE_NEWUTS
#define CLONE_NEWUTS  0x04000000
#endif
#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000
#endif
#ifndef CLONE_NEWIPC
#define CLONE_NEWIPC  0x08000000
#endif
#ifndef CLONE_NEWNET
#define CLONE_NEWNET  0x40000000 /* New network namespace */
#endif

/* Single-byte messages exchanged between the parent and the container
 * over the control FD */
typedef char lxc_message_t;
/* Written by lxcContainerSendContinue() and expected by
 * lxcContainerWaitForContinue() */
#define LXC_CONTINUE_MSG 'c'

typedef struct __lxc_child_argv lxc_child_argv_t;
struct __lxc_child_argv {
76
    virDomainDefPtr config;
77 78
    unsigned int nveths;
    char **veths;
79 80 81 82 83
    int monitor;
    char *ttyPath;
};


84
/**
85
 * lxcContainerExecInit:
86 87
 * @vmDef: Ptr to vm definition structure
 *
88
 * Exec the container init string. The container init will replace then
89 90
 * be running in the current process
 *
91
 * Does not return
92
 */
93
static int lxcContainerExecInit(virDomainDefPtr vmDef)
94
{
95
    const char *const argv[] = {
96
        vmDef->os.init,
97 98
        NULL,
    };
99

100
    return execve(argv[0], (char **)argv, NULL);
101 102 103
}

/**
104 105 106
 * lxcContainerSetStdio:
 * @control: the conrol FD
 * @ttyPath: Name of tty to set as the container console
107 108 109 110 111 112
 *
 * Sets the given tty as the primary conosole for the container as well as
 * stdout, stdin and stderr.
 *
 * Returns 0 on success or -1 in case of error
 */
113
static int lxcContainerSetStdio(int control, int ttyfd)
114 115
{
    int rc = -1;
116
    int open_max, i;
117 118 119 120

    if (setsid() < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("setsid failed: %s"), strerror(errno));
121
        goto cleanup;
122 123 124 125 126 127 128 129
    }

    if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("ioctl(TIOCSTTY) failed: %s"), strerror(errno));
        goto cleanup;
    }

130 131 132 133
    /* Just in case someone forget to set FD_CLOEXEC, explicitly
     * close all FDs before executing the container */
    open_max = sysconf (_SC_OPEN_MAX);
    for (i = 0; i < open_max; i++)
134
        if (i != ttyfd && i != control)
135
            close(i);
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161

    if (dup2(ttyfd, 0) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("dup2(stdin) failed: %s"), strerror(errno));
        goto cleanup;
    }

    if (dup2(ttyfd, 1) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("dup2(stdout) failed: %s"), strerror(errno));
        goto cleanup;
    }

    if (dup2(ttyfd, 2) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("dup2(stderr) failed: %s"), strerror(errno));
        goto cleanup;
    }

    rc = 0;

cleanup:
    return rc;
}

/**
162 163
 * lxcContainerSendContinue:
 * @monitor: control FD to child
164
 *
165 166
 * Sends the continue message via the socket pair stored in the vm
 * structure.
167 168 169
 *
 * Returns 0 on success or -1 in case of error
 */
170
int lxcContainerSendContinue(int control)
171 172
{
    int rc = -1;
173 174
    lxc_message_t msg = LXC_CONTINUE_MSG;
    int writeCount = 0;
175

176 177
    writeCount = safewrite(control, &msg, sizeof(msg));
    if (writeCount != sizeof(msg)) {
178
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
179 180 181
                 _("unable to send container continue message: %s"),
                 strerror(errno));
        goto error_out;
182 183
    }

184
    rc = 0;
185

186 187
error_out:
    return rc;
188 189
}

190
/**
191 192
 * lxcContainerWaitForContinue:
 * @control: control FD from parent
193 194 195 196 197 198 199
 *
 * This function will wait for the container continue message from the
 * parent process.  It will send this message on the socket pair stored in
 * the vm structure once it has completed the post clone container setup.
 *
 * Returns 0 on success or -1 in case of error
 */
200
static int lxcContainerWaitForContinue(int control)
201 202 203 204
{
    lxc_message_t msg;
    int readLen;

205
    readLen = saferead(control, &msg, sizeof(msg));
206 207
    if (readLen != sizeof(msg) ||
        msg != LXC_CONTINUE_MSG) {
208 209 210
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("Failed to read the container continue message: %s"),
                 strerror(errno));
211
        return -1;
212
    }
213
    close(control);
214 215 216

    DEBUG0("Received container continue message");

217
    return 0;
218 219
}

220

221 222 223 224 225 226 227 228
/**
 * lxcEnableInterfaces:
 * @vm: Pointer to vm structure
 *
 * This function will enable the interfaces for this container.
 *
 * Returns 0 on success or nonzero in case of error
 */
229 230
static int lxcContainerEnableInterfaces(unsigned int nveths,
                                        char **veths)
231 232
{
    int rc = 0;
233
    unsigned int i;
234

235 236 237
    for (i = 0 ; i < nveths ; i++) {
        DEBUG("Enabling %s", veths[i]);
        rc =  vethInterfaceUpOrDown(veths[i], 1);
238 239 240 241 242 243
        if (0 != rc) {
            goto error_out;
        }
    }

    /* enable lo device only if there were other net devices */
244
    if (veths)
245 246 247 248 249 250
        rc = vethInterfaceUpOrDown("lo", 1);

error_out:
    return rc;
}

251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319

/* Earlier approach, kept for reference: generate the wrapper with the
 * kernel's _syscall2() macro. */
//_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)
/* pivot_root() is declared by hand here.
 * NOTE(review): presumably no header included by this file provides a
 * prototype for it - confirm. */
extern int pivot_root(const char * new_root,const char * put_old);

/*
 * qsort() comparator for an array of C strings (mount point paths).
 * Orders the strings in descending strcmp() order.
 */
static int lxcContainerChildMountSort(const void *a, const void *b)
{
    const char *const *lhs = a;
    const char *const *rhs = b;

    /* The operands are swapped on purpose: the deepest children
       have to be unmounted first */
    return strcmp(*rhs, *lhs);
}

/*
 * lxcContainerPivotRoot:
 * @root: filesystem definition whose src becomes the new root
 *
 * Bind mounts root->src onto itself so that it is a mount point,
 * creates <root->src>/.oldroot, pivots the root there and finally
 * changes the working directory to the new "/".
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerPivotRoot(virDomainFSDefPtr root)
{
    char *oldroot = NULL;
    int rc = -1;

    /* First step is to ensure the new root itself is
       a mount point */
    if (mount(root->src, root->src, NULL, MS_BIND, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to bind new root %s: %s"),
                 root->src, strerror(errno));
        return -1;
    }

    if (asprintf(&oldroot, "%s/.oldroot", root->src) < 0) {
        lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
        return -1;
    }

    /* From here on oldroot is only released at the cleanup label, so
     * that it is still valid (and errno still untouched) when the
     * error messages below are formatted */
    if (virFileMakePath(oldroot) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to create %s: %s"),
                 oldroot, strerror(errno));
        goto cleanup;
    }

    /* The old root directory will live at /.oldroot after
     * this and will soon be unmounted completely */
    if (pivot_root(root->src, oldroot) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to pivot root %s to %s: %s"),
                 oldroot, root->src, strerror(errno));
        goto cleanup;
    }

    /* CWD is undefined after pivot_root, so go to / */
    if (chdir("/") < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to chdir to new root: %s"),
                 strerror(errno));
        goto cleanup;
    }

    rc = 0;

cleanup:
    VIR_FREE(oldroot);
    return rc;
}

static int lxcContainerPopulateDevices(void)
{
    int i;
    const struct {
        int maj;
        int min;
        mode_t mode;
        const char *path;
    } devs[] = {
D
Dan Smith 已提交
320 321 322 323 324 325
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" },
        { LXC_DEV_MAJ_TTY, LXC_DEV_MIN_CONSOLE, 0600, "/dev/console" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" },
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529
    };

    if (virFileMakePath("/dev") < 0 ||
        mount("none", "/dev", "tmpfs", 0, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to mount /dev tmpfs for container: %s"),
                 strerror(errno));
        return -1;
    }
    /* Move old devpts into container, since we have to
       connect to the master ptmx which was opened in
       the parent.
       XXX This sucks, we need to figure out how to get our
       own private devpts for isolation
    */
    if (virFileMakePath("/dev/pts") < 0 ||
        mount("/.oldroot/dev/pts", "/dev/pts", NULL,
              MS_MOVE, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to move /dev/pts into container: %s"),
                 strerror(errno));
        return -1;
    }

    /* Populate /dev/ with a few important bits */
    for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) {
        dev_t dev = makedev(devs[i].maj, devs[i].min);
        if (mknod(devs[i].path, 0, dev) < 0 ||
            chmod(devs[i].path, devs[i].mode)) {
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to make device %s: %s"),
                     devs[i].path, strerror(errno));
            return -1;
        }
    }

    return 0;
}


/*
 * lxcContainerMountNewFS:
 * @vmDef: Ptr to vm definition structure
 *
 * Bind mounts every filesystem of type VIR_DOMAIN_FS_TYPE_MOUNT from
 * the definition, except the one targeting "/", into the container.
 * The source is looked up below /.oldroot and the target directory is
 * created first.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerMountNewFS(virDomainDefPtr vmDef)
{
    virDomainFSDefPtr fs;

    /* Pull in rest of container's mounts */
    for (fs = vmDef->fss; fs != NULL; fs = fs->next) {
        char *oldpath;
        int failed;

        /* The root is skipped here; other FS types are not handled
         * yet (XXX fix) */
        if (STREQ(fs->dst, "/") ||
            fs->type != VIR_DOMAIN_FS_TYPE_MOUNT)
            continue;

        if (asprintf(&oldpath, "/.oldroot/%s", fs->src) < 0) {
            lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
            return -1;
        }

        failed = (virFileMakePath(fs->dst) < 0 ||
                  mount(oldpath, fs->dst, NULL, MS_BIND, NULL) < 0);
        VIR_FREE(oldpath);

        if (failed) {
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to mount %s at %s for container: %s"),
                     fs->src, fs->dst, strerror(errno));
            return -1;
        }
    }

    return 0;
}


/*
 * lxcContainerUnmountOldFS:
 *
 * Collects every mount point listed in /proc/mounts whose path starts
 * with "/.oldroot", sorts the paths so that the deepest children come
 * first and unmounts them in that order.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerUnmountOldFS(void)
{
    struct mntent *mntent;
    char **mounts = NULL;
    int nmounts = 0;
    FILE *procmnt;
    int rc = -1;
    int i;

    if (!(procmnt = setmntent("/proc/mounts", "r"))) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to read /proc/mounts: %s"),
                 strerror(errno));
        return -1;
    }
    while ((mntent = getmntent(procmnt)) != NULL) {
        if (!STRPREFIX(mntent->mnt_dir, "/.oldroot"))
            continue;

        /* NOTE(review): assumes VIR_REALLOC_N leaves the existing array
         * untouched on failure, so it can still be released below */
        if (VIR_REALLOC_N(mounts, nmounts+1) < 0) {
            endmntent(procmnt);
            lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
            goto cleanup;
        }
        /* Only count the slot once it really holds a string */
        if (!(mounts[nmounts] = strdup(mntent->mnt_dir))) {
            endmntent(procmnt);
            lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
            goto cleanup;
        }
        nmounts++;
    }
    endmntent(procmnt);

    /* qsort() must not be handed a NULL array, even for zero elements */
    if (nmounts > 0)
        qsort(mounts, nmounts, sizeof(mounts[0]),
              lxcContainerChildMountSort);

    for (i = 0 ; i < nmounts ; i++) {
        if (umount(mounts[i]) < 0) {
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to unmount %s: %s"),
                     mounts[i], strerror(errno));
            goto cleanup;
        }
    }

    rc = 0;

cleanup:
    /* Release the collected paths on success and on every error path */
    for (i = 0 ; i < nmounts ; i++)
        VIR_FREE(mounts[i]);
    VIR_FREE(mounts);

    return rc;
}


/* A FS is mapped to /, so the pivot_root approach is used
 * to get a better-chroot-than-chroot.
 * This is based on this thread http://lkml.org/lkml/2008/3/5/29
 *
 * Steps, in order: pivot into the new root, mount /proc, populate
 * /dev, bind mount the remaining container filesystems and unmount
 * everything left below the old root.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef,
                                      virDomainFSDefPtr root)
{
    if (lxcContainerPivotRoot(root) < 0)
        return -1;

    if (virFileMakePath("/proc") < 0 ||
        mount("none", "/proc", "proc", 0, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to mount /proc for container: %s"),
                 strerror(errno));
        return -1;
    }

    /* The remaining steps report their own errors */
    if (lxcContainerPopulateDevices() < 0 ||
        lxcContainerMountNewFS(vmDef) < 0 ||
        lxcContainerUnmountOldFS() < 0)
        return -1;

    return 0;
}

/* Nothing is mapped to /, so the main root is used,
 * with the extra filesystems bind mounted on top of it
 * and a fresh /proc mounted afterwards.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef)
{
    virDomainFSDefPtr fs = vmDef->fss;

    while (fs != NULL) {
        /* XXX fix to support other mount types */
        if (fs->type == VIR_DOMAIN_FS_TYPE_MOUNT &&
            mount(fs->src, fs->dst, NULL, MS_BIND, NULL) < 0) {
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to mount %s at %s for container: %s"),
                     fs->src, fs->dst, strerror(errno));
            return -1;
        }
        fs = fs->next;
    }

    /* mount /proc */
    if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to mount /proc for container: %s"),
                 strerror(errno));
        return -1;
    }

    return 0;
}

/*
 * lxcContainerSetupMounts:
 * @vmDef: Ptr to vm definition structure
 *
 * Picks the filesystem setup strategy: when the definition has a
 * filesystem of type VIR_DOMAIN_FS_TYPE_MOUNT targeting "/", the first
 * such entry becomes the new root via pivot_root; otherwise the extra
 * filesystems are mounted on top of the existing root.
 *
 * Returns the result of the chosen setup function
 */
static int lxcContainerSetupMounts(virDomainDefPtr vmDef)
{
    virDomainFSDefPtr fs;

    for (fs = vmDef->fss; fs != NULL; fs = fs->next) {
        if (fs->type == VIR_DOMAIN_FS_TYPE_MOUNT &&
            STREQ(fs->dst, "/"))
            return lxcContainerSetupPivotRoot(vmDef, fs);
    }

    return lxcContainerSetupExtraMounts(vmDef);
}

530 531 532 533 534 535 536 537 538 539 540 541
/**
 * lxcChild:
 * @argv: Pointer to container arguments
 *
 * This function is run in the process clone()'d in lxcStartContainer.
 * Perform a number of container setup tasks:
 *     Setup container file system
 *     mount container /proca
 * Then exec's the container init
 *
 * Returns 0 on success or -1 in case of error
 */
542
static int lxcContainerChild( void *data )
543
{
544
    lxc_child_argv_t *argv = data;
545
    virDomainDefPtr vmDef = argv->config;
546
    int ttyfd;
547 548 549 550

    if (NULL == vmDef) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("lxcChild() passed invalid vm definition"));
551
        return -1;
552 553
    }

554 555
    if (lxcContainerSetupMounts(vmDef) < 0)
        return -1;
556

557 558
    ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY);
    if (ttyfd < 0) {
559
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
560
                 _("open(%s) failed: %s"), argv->ttyPath, strerror(errno));
561
        return -1;
562 563
    }

564 565
    if (lxcContainerSetStdio(argv->monitor, ttyfd) < 0) {
        close(ttyfd);
566
        return -1;
567 568
    }
    close(ttyfd);
569

570
    /* Wait for interface devices to show up */
571 572
    if (lxcContainerWaitForContinue(argv->monitor) < 0)
        return -1;
573 574

    /* enable interfaces */
575
    if (lxcContainerEnableInterfaces(argv->nveths, argv->veths) < 0)
576
        return -1;
577

578
    /* this function will only return if an error occured */
579 580
    return lxcContainerExecInit(vmDef);
}
581

582 583 584 585 586 587 588 589 590
/**
 * lxcContainerStart:
 * @driver: pointer to driver structure
 * @vm: pointer to virtual machine structure
 *
 * Starts a container process by calling clone() with the namespace flags
 *
 * Returns PID of container on success or -1 in case of error
 */
591
int lxcContainerStart(virDomainDefPtr def,
592 593
                      unsigned int nveths,
                      char **veths,
594 595 596 597 598 599 600
                      int control,
                      char *ttyPath)
{
    pid_t pid;
    int flags;
    int stacksize = getpagesize() * 4;
    char *stack, *stacktop;
601
    lxc_child_argv_t args = { def, nveths, veths, control, ttyPath };
602 603 604

    /* allocate a stack for the container */
    if (VIR_ALLOC_N(stack, stacksize) < 0) {
605
        lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
606 607 608 609 610 611 612 613 614 615 616 617 618 619
        return -1;
    }
    stacktop = stack + stacksize;

    flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER|CLONE_NEWIPC|SIGCHLD;

    if (def->nets != NULL)
        flags |= CLONE_NEWNET;

    pid = clone(lxcContainerChild, stacktop, flags, &args);
    VIR_FREE(stack);
    DEBUG("clone() returned, %d", pid);

    if (pid < 0) {
620
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
                 _("clone() failed, %s"), strerror(errno));
        return -1;
    }

    return pid;
}

/*
 * Child function used by lxcContainerAvailable() to probe clone():
 * it terminates the cloned process right away with a success status.
 */
static int lxcContainerDummyChild(void *argv ATTRIBUTE_UNUSED)
{
    _exit(EXIT_SUCCESS);
}

/*
 * lxcContainerAvailable:
 * @features: bitmask of LXC_CONTAINER_FEATURE_* flags to probe for
 *
 * Checks whether containers can be created by clone()'ing a dummy
 * child with the namespace flags used for real containers. The network
 * namespace is only probed when LXC_CONTAINER_FEATURE_NET is set in
 * @features.
 *
 * Returns 0 if the clone() call worked or -1 otherwise
 */
int lxcContainerAvailable(int features)
{
    int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER|
        CLONE_NEWIPC|SIGCHLD;
    int cpid;
    int cloneErrno;
    int stacksize = getpagesize() * 4;
    char *childStack;
    char *stack;
    int childStatus;

    if (features & LXC_CONTAINER_FEATURE_NET)
        flags |= CLONE_NEWNET;

    if (VIR_ALLOC_N(stack, stacksize) < 0) {
        DEBUG0("Unable to allocate stack");
        return -1;
    }

    /* clone() is given the highest address of the stack area */
    childStack = stack + stacksize;

    cpid = clone(lxcContainerDummyChild, childStack, flags, NULL);
    /* Remember why clone() failed before anything else can touch errno */
    cloneErrno = errno;
    VIR_FREE(stack);
    if (cpid < 0) {
        DEBUG("clone call returned %s, container support is not enabled",
              strerror(cloneErrno));
        return -1;
    }

    /* Reap the dummy child; retry when interrupted by a signal so that
     * it is not left behind unreaped */
    while (waitpid(cpid, &childStatus, 0) < 0 && errno == EINTR)
        ;

    return 0;
}