lxc_controller.c 18.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * Copyright IBM Corp. 2008
 *
 * lxc_controller.c: linux container process controller
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <sys/epoll.h>
27 28
#include <sys/wait.h>
#include <sys/socket.h>
29 30
#include <sys/types.h>
#include <sys/un.h>
31
#include <unistd.h>
32 33 34
#include <paths.h>
#include <fcntl.h>
#include <signal.h>
35
#include <getopt.h>
36

37
#include "virterror_internal.h"
38
#include "logging.h"
39 40 41
#include "util.h"

#include "lxc_conf.h"
42 43 44 45
#include "lxc_container.h"
#include "veth.h"
#include "memory.h"
#include "util.h"
D
Dan Smith 已提交
46
#include "cgroup.h"
47

48 49
#define VIR_FROM_THIS VIR_FROM_LXC

D
Dan Smith 已提交
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
/* One device that a container is allowed to access.  Each field is passed
 * straight through to virCgroupAllowDevice().  Tables of these entries are
 * terminated by an entry whose type is 0. */
struct cgroup_device_policy {
    char type;      /* device type character (e.g. 'c') */
    int major;      /* device major number */
    int minor;      /* device minor number */
};

/**
 * lxcSetContainerResources
 * @def: pointer to virtual machine structure
 *
 * Creates the "lxc" cgroup for the domain, applies the memory limit from
 * @def, denies all devices and then re-allows a fixed set of devices plus
 * the whole PTY major, and finally moves the calling process into the
 * cgroup.
 *
 * Returns 0 on success (or when cgroups are not supported at all),
 * otherwise the non-zero code returned by the failing virCgroup* call
 */
static int lxcSetContainerResources(virDomainDefPtr def)
{
    /* Devices every container may use; terminated by a zero type */
    const struct cgroup_device_policy allowed[] = {
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM},
        {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_CONSOLE},
        {0,   0, 0}};
    const struct cgroup_device_policy *entry;
    virCgroupPtr group;
    int ret;

    /* Not supported, so claim success */
    if (virCgroupHaveSupport() != 0)
        return 0;

    if ((ret = virCgroupForDomain(def, "lxc", &group)) != 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("Unable to create cgroup for %s\n"), def->name);
        return ret;
    }

    if ((ret = virCgroupSetMemory(group, def->maxmem)) != 0)
        goto done;

    /* Start from "nothing allowed", then open up the whitelist */
    if ((ret = virCgroupDenyAllDevices(group)) != 0)
        goto done;

    for (entry = allowed; entry->type != 0; entry++) {
        ret = virCgroupAllowDevice(group,
                                   entry->type,
                                   entry->major,
                                   entry->minor);
        if (ret != 0)
            goto done;
    }

    if ((ret = virCgroupAllowDeviceMajor(group, 'c', LXC_DEV_MAJ_PTY)) != 0)
        goto done;

    ret = virCgroupAddTask(group, getpid());

done:
    if (ret != 0) {
        /* The helpers' code is negated before being reported as an errno */
        virReportSystemError(NULL, -ret, "%s",
                             _("Failed to set lxc resources"));
        virCgroupRemove(group);
    }

    virCgroupFree(&group);

    return ret;
}

124 125 126
/**
 * lxcMonitorPath:
 * @def: domain definition; only its name is used
 *
 * Builds the path of the monitor socket for the domain,
 * "LXC_STATE_DIR/<name>.sock".
 *
 * Returns a newly allocated string the caller must free, or NULL if the
 * string could not be allocated (an OOM error is reported)
 */
static char *lxcMonitorPath(virDomainDefPtr def)
{
    /* Initialised so that a failed allocation can never hand an
     * indeterminate pointer back to the caller's NULL check. */
    char *sockpath = NULL;

    if (virAsprintf(&sockpath, "%s/%s.sock",
                    LXC_STATE_DIR, def->name) < 0) {
        virReportOOMError(NULL);
        return NULL;
    }

    return sockpath;
}

/**
 * lxcMonitorServer:
 * @sockpath: filesystem path of the UNIX socket to listen on
 *
 * Creates a UNIX stream socket, removes any existing file at @sockpath,
 * binds the socket to that path and starts listening on it.
 *
 * Returns the listening fd on success or -1 in case of error
 */
static int lxcMonitorServer(const char *sockpath)
{
    int fd;
    struct sockaddr_un addr;

    if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) {
        virReportSystemError(NULL, errno,
                             _("failed to create server socket '%s'"),
                             sockpath);
        goto error;
    }

    /* sun_path is a fixed-size array: refuse paths that would be truncated
     * or left without a terminating NUL, and do so before unlinking
     * anything. */
    if (strlen(sockpath) >= sizeof(addr.sun_path)) {
        virReportSystemError(NULL, ENAMETOOLONG,
                             _("server socket path '%s' is too long"),
                             sockpath);
        goto error;
    }

    unlink(sockpath);
    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
    /* The buffer was zeroed above, so copying at most size-1 bytes
     * always leaves it NUL terminated. */
    strncpy(addr.sun_path, sockpath, sizeof(addr.sun_path) - 1);

    if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
        virReportSystemError(NULL, errno,
                             _("failed to bind server socket '%s'"),
                             sockpath);
        goto error;
    }
    if (listen(fd, 30 /* backlog */ ) < 0) {
        virReportSystemError(NULL, errno,
                             _("failed to listen server socket %s"),
                             sockpath);
        goto error;
    }

    return fd;

error:
    if (fd != -1)
        close(fd);
    return -1;
}
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191

/**
 * lxcFdForward:
 * @readFd: file descriptor to read
 * @writeFd: file desriptor to write
 *
 * Reads 1 byte of data from readFd and writes to writeFd.
 *
 * Returns 0 on success, EAGAIN if returned on read, or -1 in case of error
 */
static int lxcFdForward(int readFd, int writeFd)
{
    int rc = -1;
    char buf[2];

    if (1 != (saferead(readFd, buf, 1))) {
        if (EAGAIN == errno) {
            rc = EAGAIN;
            goto cleanup;
        }

192 193 194
        virReportSystemError(NULL, errno,
                             _("read of fd %d failed"),
                             readFd);
195 196 197 198
        goto cleanup;
    }

    if (1 != (safewrite(writeFd, buf, 1))) {
199 200 201
        virReportSystemError(NULL, errno,
                             _("write to fd %d failed"),
                             writeFd);
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
        goto cleanup;
    }

    rc = 0;

cleanup:
    return rc;
}

/* Per-descriptor bookkeeping for the console forwarding loop */
typedef struct _lxcTtyForwardFd_t {
    int fd;         /* descriptor that is read from when active */
    int active;     /* set when EPOLLIN was seen, cleared once a read hits EAGAIN */
} lxcTtyForwardFd_t;

/**
 * lxcTtyForward:
 * @appPty: Open fd for application facing Pty
 * @contPty: Open fd for container facing Pty
 *
 * Forwards traffic between fds.  Data read from appPty will be written to contPty
 * This process loops forever.
 * This uses epoll in edge triggered mode to avoid a hard loop on POLLHUP
 * events when the user disconnects the virsh console via ctrl-]
 *
 * Returns 0 on success or -1 in case of error
 */
228 229 230 231
static int lxcControllerMain(int monitor,
                             int client,
                             int appPty,
                             int contPty)
232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
{
    int rc = -1;
    int epollFd;
    struct epoll_event epollEvent;
    int numEvents;
    int numActive = 0;
    lxcTtyForwardFd_t fdArray[2];
    int timeout = -1;
    int curFdOff = 0;
    int writeFdOff = 0;

    fdArray[0].fd = appPty;
    fdArray[0].active = 0;
    fdArray[1].fd = contPty;
    fdArray[1].active = 0;

    /* create the epoll fild descriptor */
    epollFd = epoll_create(2);
    if (0 > epollFd) {
251 252
        virReportSystemError(NULL, errno, "%s",
                             _("epoll_create(2) failed"));
253 254 255 256 257 258 259 260
        goto cleanup;
    }

    /* add the file descriptors the epoll fd */
    memset(&epollEvent, 0x00, sizeof(epollEvent));
    epollEvent.events = EPOLLIN|EPOLLET;    /* edge triggered */
    epollEvent.data.fd = appPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, appPty, &epollEvent)) {
261 262
        virReportSystemError(NULL, errno, "%s",
                             _("epoll_ctl(appPty) failed"));
263 264 265 266
        goto cleanup;
    }
    epollEvent.data.fd = contPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, contPty, &epollEvent)) {
267 268
        virReportSystemError(NULL, errno, "%s",
                             _("epoll_ctl(contPty) failed"));
269 270 271
        goto cleanup;
    }

272 273 274
    epollEvent.events = EPOLLIN;
    epollEvent.data.fd = monitor;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, monitor, &epollEvent)) {
275 276
        virReportSystemError(NULL, errno, "%s",
                             _("epoll_ctl(contPty) failed"));
277 278 279 280 281 282
        goto cleanup;
    }

    epollEvent.events = EPOLLHUP;
    epollEvent.data.fd = client;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
283 284
        virReportSystemError(NULL, errno, "%s",
                             _("epoll_ctl(contPty) failed"));
285 286 287
        goto cleanup;
    }

288 289 290 291
    while (1) {
        /* if active fd's, return if no events, else wait forever */
        timeout = (numActive > 0) ? 0 : -1;
        numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout);
292 293 294 295 296 297
        if (numEvents > 0) {
            if (epollEvent.data.fd == monitor) {
                int fd = accept(monitor, NULL, 0);
                if (client != -1) { /* Already connected, so kick new one out */
                    close(fd);
                    continue;
298
                }
299 300 301 302
                client = fd;
                epollEvent.events = EPOLLHUP;
                epollEvent.data.fd = client;
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
303 304
                    virReportSystemError(NULL, errno, "%s",
                                         _("epoll_ctl(contPty) failed"));
305 306 307 308
                    goto cleanup;
                }
            } else if (client != -1 && epollEvent.data.fd == client) {
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_DEL, client, &epollEvent)) {
309 310
                    virReportSystemError(NULL, errno, "%s",
                                         _("epoll_ctl(contPty) failed"));
311 312 313 314
                    goto cleanup;
                }
                close(client);
                client = -1;
315
            } else {
316 317 318 319 320 321 322 323 324 325 326 327 328 329
                if (epollEvent.events & EPOLLIN) {
                    curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
                    if (!fdArray[curFdOff].active) {
                        fdArray[curFdOff].active = 1;
                        ++numActive;
                    }
                } else if (epollEvent.events & EPOLLHUP) {
                    DEBUG("EPOLLHUP from fd %d", epollEvent.data.fd);
                    continue;
                } else {
                    lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                             _("error event %d"), epollEvent.events);
                    goto cleanup;
                }
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
            }
        } else if (0 == numEvents) {
            if (2 == numActive) {
                /* both fds active, toggle between the two */
                curFdOff ^= 1;
            } else {
                /* only one active, if current is active, use it, else it */
                /* must be the other one (ie. curFd just went inactive) */
                curFdOff = fdArray[curFdOff].active ? curFdOff : curFdOff ^ 1;
            }

        } else  {
            if (EINTR == errno) {
                continue;
            }

            /* error */
347 348
            virReportSystemError(NULL, errno, "%s",
                                 _("epoll_wait() failed"));
349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
            goto cleanup;

        }

        if (0 < numActive) {
            writeFdOff = curFdOff ^ 1;
            rc = lxcFdForward(fdArray[curFdOff].fd, fdArray[writeFdOff].fd);

            if (EAGAIN == rc) {
                /* this fd no longer has data, set it as inactive */
                --numActive;
                fdArray[curFdOff].active = 0;
            } else if (-1 == rc) {
                goto cleanup;
            }

        }

    }

    rc = 0;

cleanup:
    close(appPty);
    close(contPty);
    close(epollFd);
    return rc;
}

378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430


/**
 * lxcControllerMoveInterfaces
 * @nveths: number of entries in @veths
 * @veths: names of the interfaces to move
 * @container: pid of container
 *
 * Hands each named interface to moveInterfaceToNetNs() together with the
 * container pid, stopping at the first one that fails.
 *
 * Returns 0 if every interface was moved, -1 (with the error reported)
 * as soon as one move fails
 */
static int lxcControllerMoveInterfaces(unsigned int nveths,
                                       char **veths,
                                       pid_t container)
{
    unsigned int idx = 0;

    while (idx < nveths) {
        char *ifname = veths[idx];

        if (moveInterfaceToNetNs(ifname, container) < 0) {
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to move interface %s to ns %d"),
                     ifname, container);
            return -1;
        }
        idx++;
    }

    return 0;
}


/**
 * lxcControllerCleanupInterfaces:
 * @nveths: number of entries in @veths
 * @veths: names of the veth devices to delete
 *
 * Cleans up the container interfaces by calling vethDelete() on each name.
 * A failed deletion is reported but does not stop the remaining
 * interfaces from being processed.
 *
 * Always returns 0
 */
static int lxcControllerCleanupInterfaces(unsigned int nveths,
                                          char **veths)
{
    unsigned int idx = 0;

    while (idx < nveths) {
        char *ifname = veths[idx++];

        if (vethDelete(ifname) >= 0)
            continue;

        /* report, then carry on with any other interfaces */
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to delete veth: %s"), ifname);
    }

    return 0;
}


static int
431
lxcControllerRun(virDomainDefPtr def,
432 433 434 435 436 437 438 439 440 441 442 443 444
                 unsigned int nveths,
                 char **veths,
                 int monitor,
                 int client,
                 int appPty)
{
    int rc = -1;
    int control[2] = { -1, -1};
    int containerPty;
    char *containerPtyPath;
    pid_t container = -1;

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
445 446
        virReportSystemError(NULL, errno, "%s",
                             _("sockpair failed"));
447 448 449 450 451 452
        goto cleanup;
    }

    if (virFileOpenTty(&containerPty,
                       &containerPtyPath,
                       0) < 0) {
453 454
        virReportSystemError(NULL, errno, "%s",
                             _("failed to allocate tty"));
455 456 457
        goto cleanup;
    }

458 459 460
    if (lxcSetContainerResources(def) < 0)
        goto cleanup;

461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
    if ((container = lxcContainerStart(def,
                                       nveths,
                                       veths,
                                       control[1],
                                       containerPtyPath)) < 0)
        goto cleanup;
    close(control[1]);
    control[1] = -1;

    if (lxcControllerMoveInterfaces(nveths, veths, container) < 0)
        goto cleanup;

    if (lxcContainerSendContinue(control[0]) < 0)
        goto cleanup;

    rc = lxcControllerMain(monitor, client, appPty, containerPty);

cleanup:
    if (control[0] != -1)
        close(control[0]);
    if (control[1] != -1)
        close(control[1]);
    VIR_FREE(containerPtyPath);
    if (containerPty != -1)
        close(containerPty);

487 488 489 490
    if (container > 1) {
        kill(container, SIGTERM);
        waitpid(container, NULL, 0);
    }
491 492 493 494
    return rc;
}


495
int main(int argc, char *argv[])
496 497
{
    pid_t pid;
498
    int rc = 1;
499
    int client;
500 501 502 503 504 505 506 507 508 509
    char *name = NULL;
    int nveths = 0;
    char **veths = NULL;
    int monitor = -1;
    int appPty = -1;
    int bg = 0;
    virCapsPtr caps = NULL;
    virDomainDefPtr def = NULL;
    char *configFile = NULL;
    char *sockpath = NULL;
510
    const struct option options[] = {
511 512 513 514 515 516 517
        { "background", 0, NULL, 'b' },
        { "name",   1, NULL, 'n' },
        { "veth",   1, NULL, 'v' },
        { "console", 1, NULL, 'c' },
        { "help", 0, NULL, 'h' },
        { 0, 0, 0, 0 },
    };
518

519 520
    while (1) {
        int c;
521

522 523 524 525 526 527 528 529 530 531 532 533 534
        c = getopt_long(argc, argv, "dn:v:m:c:h",
                       options, NULL);

        if (c == -1)
            break;

        switch (c) {
        case 'b':
            bg = 1;
            break;

        case 'n':
            if ((name = strdup(optarg)) == NULL) {
535
                virReportOOMError(NULL);
536
                goto cleanup;
537
            }
538 539 540 541
            break;

        case 'v':
            if (VIR_REALLOC_N(veths, nveths+1) < 0) {
542
                virReportOOMError(NULL);
543
                goto cleanup;
544
            }
545
            if ((veths[nveths++] = strdup(optarg)) == NULL) {
546
                virReportOOMError(NULL);
547
                goto cleanup;
548
            }
549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571
            break;

        case 'c':
            if (virStrToLong_i(optarg, NULL, 10, &appPty) < 0) {
                fprintf(stderr, "malformed --console argument '%s'", optarg);
                goto cleanup;
            }
            break;

        case 'h':
        case '?':
            fprintf(stderr, "\n");
            fprintf(stderr, "syntax: %s [OPTIONS]\n", argv[0]);
            fprintf(stderr, "\n");
            fprintf(stderr, "Options\n");
            fprintf(stderr, "\n");
            fprintf(stderr, "  -b, --background\n");
            fprintf(stderr, "  -n NAME, --name NAME\n");
            fprintf(stderr, "  -c FD, --console FD\n");
            fprintf(stderr, "  -v VETH, --veth VETH\n");
            fprintf(stderr, "  -h, --help\n");
            fprintf(stderr, "\n");
            goto cleanup;
572 573 574 575
        }
    }


576 577 578 579 580 581 582 583 584 585 586 587 588 589
    if (name == NULL) {
        fprintf(stderr, "%s: missing --name argument for configuration\n", argv[0]);
        goto cleanup;
    }

    if (appPty < 0) {
        fprintf(stderr, "%s: missing --console argument for container PTY\n", argv[0]);
        goto cleanup;
    }

    if (getuid() && 0) {
        fprintf(stderr, "%s: must be run as the 'root' user\n", argv[0]);
        goto cleanup;
    }
590

591 592
    if ((caps = lxcCapsInit()) == NULL)
        goto cleanup;
593

594 595 596 597
    if ((configFile = virDomainConfigFile(NULL,
                                          LXC_STATE_DIR,
                                          name)) == NULL)
        goto cleanup;
598

599 600
    if ((def = virDomainDefParseFile(NULL, caps, configFile,
                                     VIR_DOMAIN_XML_INACTIVE)) == NULL)
601
        goto cleanup;
602

603
    if (def->nnets != nveths) {
604
        fprintf(stderr, "%s: expecting %d veths, but got %d\n",
605
                argv[0], def->nnets, nveths);
606
        goto cleanup;
607 608
    }

609 610
    if ((sockpath = lxcMonitorPath(def)) == NULL)
        goto cleanup;
611

612 613
    if ((monitor = lxcMonitorServer(sockpath)) < 0)
        goto cleanup;
614

615 616 617
    if (bg) {
        if ((pid = fork()) < 0)
            goto cleanup;
618

619 620
        if (pid > 0) {
            if ((rc = virFileWritePid(LXC_STATE_DIR, name, pid)) != 0) {
621 622 623
                virReportSystemError(NULL, rc,
                                     _("Unable to write pid file '%s/%s.pid'"),
                                     LXC_STATE_DIR, name);
624 625
                _exit(1);
            }
626

627 628 629 630
            /* First child now exits, allowing original caller
             * (ie libvirtd's LXC driver to complete their
             * waitpid & continue */
            _exit(0);
631 632
        }

633 634
        /* Don't hold onto any cwd we inherit from libvirtd either */
        if (chdir("/") < 0) {
635 636
            virReportSystemError(NULL, errno, "%s",
                                 _("Unable to change to root dir"));
637 638 639 640
            goto cleanup;
        }

        if (setsid() < 0) {
641 642
            virReportSystemError(NULL, errno, "%s",
                                 _("Unable to become session leader"));
643 644 645
            goto cleanup;
        }
    }
646 647

    /* Accept initial client which is the libvirtd daemon */
648
    if ((client = accept(monitor, NULL, 0)) < 0) {
649 650
        virReportSystemError(NULL, errno, "%s",
                             _("Failed connection from LXC driver"));
651
        goto cleanup;
652 653
    }

654
    rc = lxcControllerRun(def, nveths, veths, monitor, client, appPty);
655 656


657 658 659 660 661 662 663 664
cleanup:
    virFileDeletePid(LXC_STATE_DIR, def->name);
    lxcControllerCleanupInterfaces(nveths, veths);
    unlink(sockpath);
    VIR_FREE(sockpath);

    return rc;
}