lxc_controller.c 25.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * Copyright IBM Corp. 2008
 *
 * lxc_controller.c: linux container process controller
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <sys/epoll.h>
27 28
#include <sys/wait.h>
#include <sys/socket.h>
29 30
#include <sys/types.h>
#include <sys/un.h>
31
#include <unistd.h>
32 33 34
#include <paths.h>
#include <fcntl.h>
#include <signal.h>
35
#include <getopt.h>
36
#include <sys/mount.h>
37

D
Daniel P. Berrange 已提交
38
#if HAVE_CAPNG
39
# include <cap-ng.h>
D
Daniel P. Berrange 已提交
40 41
#endif

42
#include "virterror_internal.h"
43
#include "logging.h"
44 45 46
#include "util.h"

#include "lxc_conf.h"
47 48 49 50
#include "lxc_container.h"
#include "veth.h"
#include "memory.h"
#include "util.h"
51
#include "files.h"
52

53 54
#define VIR_FROM_THIS VIR_FROM_LXC

D
Dan Smith 已提交
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
/* One entry of the device whitelist that lxcSetContainerResources()
 * applies to a container's cgroup.  The table it builds is terminated
 * by an entry whose type is 0. */
struct cgroup_device_policy {
    char type;  /* device class handed to virCgroupAllowDevice(), e.g. 'c' */
    int major;  /* device major number */
    int minor;  /* device minor number */
};

/**
 * lxcSetContainerResources
 * @def: pointer to virtual machine structure
 *
 * Creates a cgroup for the container, applies the memory limits found
 * in @def, denies all devices except a fixed whitelist, and finally
 * moves the calling process into the cgroup.
 *
 * When virCgroupForDriver() reports -ENXIO or -ENOENT the whole setup
 * is skipped and the function succeeds.
 *
 * Returns 0 on success, otherwise the non-zero code returned by the
 * cgroup call that failed (an error has been reported in that case)
 */
static int lxcSetContainerResources(virDomainDefPtr def)
{
    /* Both handles start out NULL so that the cleanup path never hands
     * an indeterminate pointer to virCgroupFree(), e.g. when
     * virCgroupForDomain() fails before 'cgroup' was assigned. */
    virCgroupPtr driver = NULL;
    virCgroupPtr cgroup = NULL;
    int rc = -1;
    int i;
    struct cgroup_device_policy devices[] = {
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM},
        {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_TTY},
        {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX},
        {0,   0, 0}};

    rc = virCgroupForDriver("lxc", &driver, 1, 0);
    if (rc != 0) {
        /* Skip all if no driver cgroup is configured */
        if (rc == -ENXIO || rc == -ENOENT)
            return 0;

        virReportSystemError(-rc, "%s",
                             _("Unable to get cgroup for driver"));
        return rc;
    }

    rc = virCgroupForDomain(driver, def->name, &cgroup, 1);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to create cgroup for domain %s"),
                             def->name);
        goto cleanup;
    }

    rc = virCgroupSetMemory(cgroup, def->mem.max_balloon);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to set memory limit for domain %s"),
                             def->name);
        /* Don't fail if we can't set memory due to lack of kernel support */
        if (rc != -ENOENT)
            goto cleanup;
    }

    if (def->mem.hard_limit) {
        rc = virCgroupSetMemoryHardLimit(cgroup, def->mem.hard_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set memory hard limit for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    if (def->mem.soft_limit) {
        rc = virCgroupSetMemorySoftLimit(cgroup, def->mem.soft_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set memory soft limit for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    if (def->mem.swap_hard_limit) {
        rc = virCgroupSetSwapHardLimit(cgroup, def->mem.swap_hard_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set swap hard limit for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    /* Start from "nothing allowed", then open up the whitelist */
    rc = virCgroupDenyAllDevices(cgroup);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to deny devices for domain %s"),
                             def->name);
        goto cleanup;
    }

    for (i = 0; devices[i].type != 0; i++) {
        struct cgroup_device_policy *dev = &devices[i];
        rc = virCgroupAllowDevice(cgroup,
                                  dev->type,
                                  dev->major,
                                  dev->minor);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to allow device %c:%d:%d for domain %s"),
                                 dev->type, dev->major, dev->minor, def->name);
            goto cleanup;
        }
    }

    /* Every minor of the pty major is allowed */
    rc = virCgroupAllowDeviceMajor(cgroup, 'c', LXC_DEV_MAJ_PTY);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to allow PTY devices for domain %s"),
                             def->name);
        goto cleanup;
    }

    rc = virCgroupAddTask(cgroup, getpid());
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to add task %d to cgroup for domain %s"),
                             getpid(), def->name);
    }

cleanup:
    virCgroupFree(&driver);
    virCgroupFree(&cgroup);

    return rc;
}

189 190 191
/**
 * lxcMonitorPath:
 * @def: domain definition whose name selects the socket
 *
 * Builds the monitor socket path "<LXC_STATE_DIR>/<name>.sock".
 *
 * Returns a newly allocated string owned by the caller, or NULL after
 * an out-of-memory error has been reported
 */
static char *lxcMonitorPath(virDomainDefPtr def)
{
    char *sockpath = NULL;

    if (virAsprintf(&sockpath, "%s/%s.sock",
                    LXC_STATE_DIR, def->name) < 0) {
        virReportOOMError();
        /* Return NULL explicitly rather than depending on what
         * virAsprintf() leaves in the output pointer on failure */
        return NULL;
    }

    return sockpath;
}

static int lxcMonitorServer(const char *sockpath)
{
    int fd;
    struct sockaddr_un addr;

    if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) {
205
        virReportSystemError(errno,
206 207
                             _("failed to create server socket '%s'"),
                             sockpath);
208 209 210 211 212 213
        goto error;
    }

    unlink(sockpath);
    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
C
Chris Lalancette 已提交
214
    if (virStrcpyStatic(addr.sun_path, sockpath) == NULL) {
215
        lxcError(VIR_ERR_INTERNAL_ERROR,
C
Chris Lalancette 已提交
216 217 218
                 _("Socket path %s too long for destination"), sockpath);
        goto error;
    }
219 220

    if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
221
        virReportSystemError(errno,
222 223
                             _("failed to bind server socket '%s'"),
                             sockpath);
224 225 226
        goto error;
    }
    if (listen(fd, 30 /* backlog */ ) < 0) {
227
        virReportSystemError(errno,
228 229
                             _("failed to listen server socket %s"),
                             sockpath);
230 231 232 233 234 235
        goto error;
    }

    return fd;

error:
236
    VIR_FORCE_CLOSE(fd);
237 238
    return -1;
}
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259

/**
 * lxcFdForward:
 * @readFd: file descriptor to read
 * @writeFd: file descriptor to write
 *
 * Moves exactly one byte from @readFd to @writeFd.
 *
 * Returns 0 once the byte has been written, EAGAIN when no byte was
 * read and errno is EAGAIN, or -1 (after reporting) for any other
 * read or write failure
 */
static int lxcFdForward(int readFd, int writeFd)
{
    char ch;

    if (saferead(readFd, &ch, 1) != 1) {
        /* NOTE(review): a short read that is not an error is classified
         * by whatever errno holds at this point - confirm against the
         * saferead() contract. */
        if (errno == EAGAIN)
            return EAGAIN;

        virReportSystemError(errno,
                             _("read of fd %d failed"),
                             readFd);
        return -1;
    }

    if (safewrite(writeFd, &ch, 1) != 1) {
        virReportSystemError(errno,
                             _("write to fd %d failed"),
                             writeFd);
        return -1;
    }

    return 0;
}

D
Daniel P. Berrange 已提交
279 280 281 282 283 284 285 286 287

static int lxcControllerClearCapabilities(void)
{
#if HAVE_CAPNG
    int ret;

    capng_clear(CAPNG_SELECT_BOTH);

    if ((ret = capng_apply(CAPNG_SELECT_BOTH)) < 0) {
288
        lxcError(VIR_ERR_INTERNAL_ERROR,
D
Daniel P. Berrange 已提交
289 290 291 292
                 _("failed to apply capabilities: %d"), ret);
        return -1;
    }
#else
293
    VIR_WARN0("libcap-ng support not compiled in, unable to clear capabilities");
D
Daniel P. Berrange 已提交
294 295 296 297
#endif
    return 0;
}

298 299 300 301 302
/* Bookkeeping for one end of the console forwarding pair handled by
 * lxcControllerMain() */
typedef struct _lxcTtyForwardFd_t {
    int fd;      /* pty file descriptor */
    int active;  /* set to 1 on EPOLLIN, reset to 0 once a read hits EAGAIN */
} lxcTtyForwardFd_t;

303 304 305 306 307 308 309 310 311 312 313
/* Decide whether an accept() failure that follows an epoll wakeup may
   be ignored.  True for EINVAL, ECONNABORTED, EAGAIN and EWOULDBLOCK,
   false for every other errno value.  */
static bool
ignorable_epoll_accept_errno(int errnum)
{
    if (errnum == EINVAL || errnum == ECONNABORTED)
        return true;

    if (errnum == EAGAIN || errnum == EWOULDBLOCK)
        return true;

    return false;
}

314
/**
315 316 317 318 319
 * lxcControllerMain
 * @monitor: server socket fd to accept client requests
 * @client: initial client which is the libvirtd daemon
 * @appPty: open fd for application facing Pty
 * @contPty: open fd for container facing Pty
320 321 322 323 324 325 326 327
 *
 * Forwards traffic between fds.  Data read from appPty will be written to contPty
 * This process loops forever.
 * This uses epoll in edge triggered mode to avoid a hard loop on POLLHUP
 * events when the user disconnects the virsh console via ctrl-]
 *
 * Returns 0 on success or -1 in case of error
 */
328 329 330 331
static int lxcControllerMain(int monitor,
                             int client,
                             int appPty,
                             int contPty)
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
{
    int rc = -1;
    int epollFd;
    struct epoll_event epollEvent;
    int numEvents;
    int numActive = 0;
    lxcTtyForwardFd_t fdArray[2];
    int timeout = -1;
    int curFdOff = 0;
    int writeFdOff = 0;

    fdArray[0].fd = appPty;
    fdArray[0].active = 0;
    fdArray[1].fd = contPty;
    fdArray[1].active = 0;
347 348 349 350

    VIR_DEBUG("monitor=%d client=%d appPty=%d contPty=%d",
              monitor, client, appPty, contPty);

351 352 353
    /* create the epoll fild descriptor */
    epollFd = epoll_create(2);
    if (0 > epollFd) {
354
        virReportSystemError(errno, "%s",
355
                             _("epoll_create(2) failed"));
356 357 358 359 360 361 362 363
        goto cleanup;
    }

    /* add the file descriptors the epoll fd */
    memset(&epollEvent, 0x00, sizeof(epollEvent));
    epollEvent.events = EPOLLIN|EPOLLET;    /* edge triggered */
    epollEvent.data.fd = appPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, appPty, &epollEvent)) {
364
        virReportSystemError(errno, "%s",
365
                             _("epoll_ctl(appPty) failed"));
366 367 368 369
        goto cleanup;
    }
    epollEvent.data.fd = contPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, contPty, &epollEvent)) {
370
        virReportSystemError(errno, "%s",
371
                             _("epoll_ctl(contPty) failed"));
372 373 374
        goto cleanup;
    }

375 376 377
    epollEvent.events = EPOLLIN;
    epollEvent.data.fd = monitor;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, monitor, &epollEvent)) {
378
        virReportSystemError(errno, "%s",
379
                             _("epoll_ctl(monitor) failed"));
380 381 382 383 384 385
        goto cleanup;
    }

    epollEvent.events = EPOLLHUP;
    epollEvent.data.fd = client;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
386
        virReportSystemError(errno, "%s",
387
                             _("epoll_ctl(client) failed"));
388 389 390
        goto cleanup;
    }

391 392 393 394
    while (1) {
        /* if active fd's, return if no events, else wait forever */
        timeout = (numActive > 0) ? 0 : -1;
        numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout);
395 396 397
        if (numEvents > 0) {
            if (epollEvent.data.fd == monitor) {
                int fd = accept(monitor, NULL, 0);
398 399 400 401 402 403 404 405 406 407 408 409
                if (fd < 0) {
                    /* First reflex may be simply to declare accept failure
                       to be a fatal error.  However, accept may fail when
                       a client quits between the above epoll_wait and here.
                       That case is not fatal, but rather to be expected,
                       if not common, so ignore it.  */
                    if (ignorable_epoll_accept_errno(errno))
                        continue;
                    virReportSystemError(errno, "%s",
                                         _("accept(monitor,...) failed"));
                    goto cleanup;
                }
410
                if (client != -1) { /* Already connected, so kick new one out */
411
                    VIR_FORCE_CLOSE(fd);
412
                    continue;
413
                }
414 415 416 417
                client = fd;
                epollEvent.events = EPOLLHUP;
                epollEvent.data.fd = client;
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
418
                    virReportSystemError(errno, "%s",
419
                                         _("epoll_ctl(client) failed"));
420 421 422 423
                    goto cleanup;
                }
            } else if (client != -1 && epollEvent.data.fd == client) {
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_DEL, client, &epollEvent)) {
424
                    virReportSystemError(errno, "%s",
425
                                         _("epoll_ctl(client) failed"));
426 427
                    goto cleanup;
                }
428
                VIR_FORCE_CLOSE(client);
429
            } else {
430 431 432 433 434 435 436 437 438 439
                if (epollEvent.events & EPOLLIN) {
                    curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
                    if (!fdArray[curFdOff].active) {
                        fdArray[curFdOff].active = 1;
                        ++numActive;
                    }
                } else if (epollEvent.events & EPOLLHUP) {
                    DEBUG("EPOLLHUP from fd %d", epollEvent.data.fd);
                    continue;
                } else {
440
                    lxcError(VIR_ERR_INTERNAL_ERROR,
441 442 443
                             _("error event %d"), epollEvent.events);
                    goto cleanup;
                }
444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
            }
        } else if (0 == numEvents) {
            if (2 == numActive) {
                /* both fds active, toggle between the two */
                curFdOff ^= 1;
            } else {
                /* only one active, if current is active, use it, else it */
                /* must be the other one (ie. curFd just went inactive) */
                curFdOff = fdArray[curFdOff].active ? curFdOff : curFdOff ^ 1;
            }

        } else  {
            if (EINTR == errno) {
                continue;
            }

            /* error */
461
            virReportSystemError(errno, "%s",
462
                                 _("epoll_wait() failed"));
463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485
            goto cleanup;

        }

        if (0 < numActive) {
            writeFdOff = curFdOff ^ 1;
            rc = lxcFdForward(fdArray[curFdOff].fd, fdArray[writeFdOff].fd);

            if (EAGAIN == rc) {
                /* this fd no longer has data, set it as inactive */
                --numActive;
                fdArray[curFdOff].active = 0;
            } else if (-1 == rc) {
                goto cleanup;
            }

        }

    }

    rc = 0;

cleanup:
486 487 488
    VIR_FORCE_CLOSE(appPty);
    VIR_FORCE_CLOSE(contPty);
    VIR_FORCE_CLOSE(epollFd);
489 490 491
    return rc;
}

492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509


/**
 * lxcControllerMoveInterfaces
 * @nveths: number of interfaces
 * @veths: interface names
 * @container: pid of container
 *
 * Hands every interface named in @veths to moveInterfaceToNetNs()
 * together with @container, stopping at the first failure.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcControllerMoveInterfaces(unsigned int nveths,
                                       char **veths,
                                       pid_t container)
{
    unsigned int idx = 0;

    while (idx < nveths) {
        if (moveInterfaceToNetNs(veths[idx], container) < 0)
            return -1;
        idx++;
    }

    return 0;
}


/**
 * lxcControllerCleanupInterfaces:
 * @nveths: number of interfaces
 * @veths: interface names
 *
 * Cleans up the container interfaces by calling vethDelete() on each
 * name.  The result of the individual deletions is not checked.
 *
 * Returns 0 always
 */
static int lxcControllerCleanupInterfaces(unsigned int nveths,
                                          char **veths)
{
    unsigned int idx = 0;

    while (idx < nveths) {
        vethDelete(veths[idx]);
        idx++;
    }

    return 0;
}

536
/* Fallback values for mount(2) flags in case the system headers in use
 * do not provide them */
#ifndef MS_REC
# define MS_REC                (1<<14)
#endif

#ifndef MS_SLAVE
# define MS_SLAVE              (1<<19)
#endif
543 544

static int
545
lxcControllerRun(virDomainDefPtr def,
546 547 548 549 550 551 552 553
                 unsigned int nveths,
                 char **veths,
                 int monitor,
                 int client,
                 int appPty)
{
    int rc = -1;
    int control[2] = { -1, -1};
554
    int containerPty = -1;
555 556
    char *containerPtyPath;
    pid_t container = -1;
557 558 559
    virDomainFSDefPtr root;
    char *devpts = NULL;
    char *devptmx = NULL;
560 561

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
562
        virReportSystemError(errno, "%s",
563
                             _("sockpair failed"));
564 565 566
        goto cleanup;
    }

567 568
    root = virDomainGetRootFilesystem(def);

569 570 571
    if (lxcSetContainerResources(def) < 0)
        goto cleanup;

572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594
    /*
     * If doing a chroot style setup, we need to prepare
     * a private /dev/pts for the child now, which they
     * will later move into position.
     *
     * This is complex because 'virsh console' needs to
     * use /dev/pts from the host OS, and the guest OS
     * needs to use /dev/pts from the guest.
     *
     * This means that we (libvirt_lxc) need to see and
     * use both /dev/pts instances. We're running in the
     * host OS context though and don't want to expose
     * the guest OS /dev/pts there.
     *
     * Thus we call unshare(CLONE_NS) so that we can see
     * the guest's new /dev/pts, without it becoming
     * visible to the host OS. We also put the root FS
     * into slave mode, just in case it was currently
     * marked as shared
     */
    if (root) {
        VIR_DEBUG0("Setting up private /dev/pts");
        if (unshare(CLONE_NEWNS) < 0) {
595
            virReportSystemError(errno, "%s",
596
                                 _("Cannot unshare mount namespace"));
597 598 599 600
            goto cleanup;
        }

        if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
601
            virReportSystemError(errno, "%s",
602
                                 _("Failed to switch root mount into slave mode"));
603 604 605 606 607
            goto cleanup;
        }

        if (virAsprintf(&devpts, "%s/dev/pts", root->src) < 0 ||
            virAsprintf(&devptmx, "%s/dev/pts/ptmx", root->src) < 0) {
608
            virReportOOMError();
609 610 611
            goto cleanup;
        }

L
Laine Stump 已提交
612
        if (virFileMakePath(devpts) != 0) {
613
            virReportSystemError(errno,
614
                                 _("Failed to make path %s"),
615 616 617 618 619 620
                                 devpts);
            goto cleanup;
        }

        VIR_DEBUG("Mouting 'devpts' on %s", devpts);
        if (mount("devpts", devpts, "devpts", 0, "newinstance,ptmxmode=0666") < 0) {
621
            virReportSystemError(errno,
622
                                 _("Failed to mount devpts on %s"),
623 624 625 626 627
                                 devpts);
            goto cleanup;
        }

        if (access(devptmx, R_OK) < 0) {
628
            VIR_WARN0("Kernel does not support private devpts, using shared devpts");
629 630
            VIR_FREE(devptmx);
        }
631 632
    }

633 634 635 636 637 638
    if (devptmx) {
        VIR_DEBUG("Opening tty on private %s", devptmx);
        if (virFileOpenTtyAt(devptmx,
                             &containerPty,
                             &containerPtyPath,
                             0) < 0) {
639
            virReportSystemError(errno, "%s",
640
                                 _("Failed to allocate tty"));
641 642 643 644 645 646 647
            goto cleanup;
        }
    } else {
        VIR_DEBUG0("Opening tty on shared /dev/ptmx");
        if (virFileOpenTty(&containerPty,
                           &containerPtyPath,
                           0) < 0) {
648
            virReportSystemError(errno, "%s",
649
                                 _("Failed to allocate tty"));
650 651 652 653 654
            goto cleanup;
        }
    }


655 656 657 658 659 660
    if ((container = lxcContainerStart(def,
                                       nveths,
                                       veths,
                                       control[1],
                                       containerPtyPath)) < 0)
        goto cleanup;
661
    VIR_FORCE_CLOSE(control[1]);
662 663 664 665 666 667 668

    if (lxcControllerMoveInterfaces(nveths, veths, container) < 0)
        goto cleanup;

    if (lxcContainerSendContinue(control[0]) < 0)
        goto cleanup;

D
Daniel P. Berrange 已提交
669 670 671 672 673
    /* Now the container is running, there's no need for us to keep
       any elevated capabilities */
    if (lxcControllerClearCapabilities() < 0)
        goto cleanup;

674 675 676
    rc = lxcControllerMain(monitor, client, appPty, containerPty);

cleanup:
677 678
    VIR_FREE(devptmx);
    VIR_FREE(devpts);
679 680
    VIR_FORCE_CLOSE(control[0]);
    VIR_FORCE_CLOSE(control[1]);
681
    VIR_FREE(containerPtyPath);
682
    VIR_FORCE_CLOSE(containerPty);
683

684
    if (container > 1) {
685
        int status;
686
        kill(container, SIGTERM);
687 688 689
        if (!(waitpid(container, &status, WNOHANG) == 0 &&
            WIFEXITED(status)))
            kill(container, SIGKILL);
690 691
        waitpid(container, NULL, 0);
    }
692 693 694 695
    return rc;
}


696
int main(int argc, char *argv[])
697 698
{
    pid_t pid;
699
    int rc = 1;
700
    int client;
701 702 703 704 705 706 707 708 709 710
    char *name = NULL;
    int nveths = 0;
    char **veths = NULL;
    int monitor = -1;
    int appPty = -1;
    int bg = 0;
    virCapsPtr caps = NULL;
    virDomainDefPtr def = NULL;
    char *configFile = NULL;
    char *sockpath = NULL;
711
    const struct option options[] = {
712 713 714 715 716 717 718
        { "background", 0, NULL, 'b' },
        { "name",   1, NULL, 'n' },
        { "veth",   1, NULL, 'v' },
        { "console", 1, NULL, 'c' },
        { "help", 0, NULL, 'h' },
        { 0, 0, 0, 0 },
    };
719

720 721
    while (1) {
        int c;
722

723 724 725 726 727 728 729 730 731 732 733 734 735
        c = getopt_long(argc, argv, "dn:v:m:c:h",
                       options, NULL);

        if (c == -1)
            break;

        switch (c) {
        case 'b':
            bg = 1;
            break;

        case 'n':
            if ((name = strdup(optarg)) == NULL) {
736
                virReportOOMError();
737
                goto cleanup;
738
            }
739 740 741 742
            break;

        case 'v':
            if (VIR_REALLOC_N(veths, nveths+1) < 0) {
743
                virReportOOMError();
744
                goto cleanup;
745
            }
746
            if ((veths[nveths++] = strdup(optarg)) == NULL) {
747
                virReportOOMError();
748
                goto cleanup;
749
            }
750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772
            break;

        case 'c':
            if (virStrToLong_i(optarg, NULL, 10, &appPty) < 0) {
                fprintf(stderr, "malformed --console argument '%s'", optarg);
                goto cleanup;
            }
            break;

        case 'h':
        case '?':
            fprintf(stderr, "\n");
            fprintf(stderr, "syntax: %s [OPTIONS]\n", argv[0]);
            fprintf(stderr, "\n");
            fprintf(stderr, "Options\n");
            fprintf(stderr, "\n");
            fprintf(stderr, "  -b, --background\n");
            fprintf(stderr, "  -n NAME, --name NAME\n");
            fprintf(stderr, "  -c FD, --console FD\n");
            fprintf(stderr, "  -v VETH, --veth VETH\n");
            fprintf(stderr, "  -h, --help\n");
            fprintf(stderr, "\n");
            goto cleanup;
773 774 775 776
        }
    }


777 778 779 780 781 782 783 784 785 786
    if (name == NULL) {
        fprintf(stderr, "%s: missing --name argument for configuration\n", argv[0]);
        goto cleanup;
    }

    if (appPty < 0) {
        fprintf(stderr, "%s: missing --console argument for container PTY\n", argv[0]);
        goto cleanup;
    }

787
    if (getuid() != 0) {
788 789 790
        fprintf(stderr, "%s: must be run as the 'root' user\n", argv[0]);
        goto cleanup;
    }
791

792 793
    if ((caps = lxcCapsInit()) == NULL)
        goto cleanup;
794

795
    if ((configFile = virDomainConfigFile(LXC_STATE_DIR,
796 797
                                          name)) == NULL)
        goto cleanup;
798

799
    if ((def = virDomainDefParseFile(caps, configFile,
800
                                     VIR_DOMAIN_XML_INACTIVE)) == NULL)
801
        goto cleanup;
802

803
    if (def->nnets != nveths) {
804
        fprintf(stderr, "%s: expecting %d veths, but got %d\n",
805
                argv[0], def->nnets, nveths);
806
        goto cleanup;
807 808
    }

809 810
    if ((sockpath = lxcMonitorPath(def)) == NULL)
        goto cleanup;
811

812 813
    if ((monitor = lxcMonitorServer(sockpath)) < 0)
        goto cleanup;
814

815 816 817
    if (bg) {
        if ((pid = fork()) < 0)
            goto cleanup;
818

819 820
        if (pid > 0) {
            if ((rc = virFileWritePid(LXC_STATE_DIR, name, pid)) != 0) {
821
                virReportSystemError(rc,
822 823
                                     _("Unable to write pid file '%s/%s.pid'"),
                                     LXC_STATE_DIR, name);
824 825
                _exit(1);
            }
826

827 828 829 830
            /* First child now exits, allowing original caller
             * (ie libvirtd's LXC driver to complete their
             * waitpid & continue */
            _exit(0);
831 832
        }

833 834
        /* Don't hold onto any cwd we inherit from libvirtd either */
        if (chdir("/") < 0) {
835
            virReportSystemError(errno, "%s",
836
                                 _("Unable to change to root dir"));
837 838 839 840
            goto cleanup;
        }

        if (setsid() < 0) {
841
            virReportSystemError(errno, "%s",
842
                                 _("Unable to become session leader"));
843 844 845
            goto cleanup;
        }
    }
846

A
Amy Griffis 已提交
847 848 849
    /* Initialize logging */
    virLogSetFromEnv();

850
    /* Accept initial client which is the libvirtd daemon */
851
    if ((client = accept(monitor, NULL, 0)) < 0) {
852
        virReportSystemError(errno, "%s",
853
                             _("Failed to accept a connection from driver"));
854
        goto cleanup;
855 856
    }

857
    rc = lxcControllerRun(def, nveths, veths, monitor, client, appPty);
858 859


860
cleanup:
861 862
    if (def)
        virFileDeletePid(LXC_STATE_DIR, def->name);
863
    lxcControllerCleanupInterfaces(nveths, veths);
J
Jim Meyering 已提交
864 865
    if (sockpath)
        unlink(sockpath);
866 867 868 869
    VIR_FREE(sockpath);

    return rc;
}