lxc_controller.c 27.4 KB
Newer Older
1
/*
2 3
 * Copyright (C) 2010-2011 Red Hat, Inc.
 * Copyright IBM Corp. 2008
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 *
 * lxc_controller.c: linux container process controller
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <sys/epoll.h>
28 29
#include <sys/wait.h>
#include <sys/socket.h>
30 31
#include <sys/types.h>
#include <sys/un.h>
32 33
#include <sys/utsname.h>
#include <sys/personality.h>
34
#include <unistd.h>
35
#include <paths.h>
36
#include <errno.h>
37 38
#include <fcntl.h>
#include <signal.h>
39
#include <getopt.h>
40
#include <sys/mount.h>
E
Eric Blake 已提交
41
#include <locale.h>
42

D
Daniel P. Berrange 已提交
43
#if HAVE_CAPNG
44
# include <cap-ng.h>
D
Daniel P. Berrange 已提交
45 46
#endif

47
#include "virterror_internal.h"
48
#include "logging.h"
49 50 51
#include "util.h"

#include "lxc_conf.h"
52 53 54 55
#include "lxc_container.h"
#include "veth.h"
#include "memory.h"
#include "util.h"
56
#include "files.h"
57

58 59
#define VIR_FROM_THIS VIR_FROM_LXC

D
Dan Smith 已提交
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
/* One entry of the device whitelist that lxcSetContainerResources()
 * hands to virCgroupAllowDevice() for the container's cgroup. */
struct cgroup_device_policy {
    char type;   /* device type character (the table uses 'c'); 0 ends the table */
    int major;   /* device major number */
    int minor;   /* device minor number */
};

/**
 * lxcSetContainerResources
 * @def: pointer to virtual machine structure
 *
 * Creates a cgroup for the container, applies the configured blkio, cpu
 * and memory tunables, restricts device access to a fixed whitelist and
 * finally moves the calling process into the cgroup.
 *
 * Returns 0 on success (also when no driver cgroup is configured at all),
 * or a negative value in case of error
 */
static int lxcSetContainerResources(virDomainDefPtr def)
{
    /* Both handles start out NULL so that the cleanup path never hands
     * an indeterminate pointer to virCgroupFree() */
    virCgroupPtr driver = NULL;
    virCgroupPtr cgroup = NULL;
    int rc = -1;
    int i;
    /* Devices every container may use; terminated by a zero type */
    static const struct cgroup_device_policy devices[] = {
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM},
        {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_TTY},
        {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX},
        {0,   0, 0}};

    rc = virCgroupForDriver("lxc", &driver, 1, 0);
    if (rc != 0) {
        /* Skip all if no driver cgroup is configured */
        if (rc == -ENXIO || rc == -ENOENT)
            return 0;

        virReportSystemError(-rc, "%s",
                             _("Unable to get cgroup for driver"));
        return rc;
    }

    rc = virCgroupForDomain(driver, def->name, &cgroup, 1);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to create cgroup for domain %s"),
                             def->name);
        goto cleanup;
    }

    /* Optional tunables: a zero value in the definition means "not set" */
    if (def->blkio.weight) {
        rc = virCgroupSetBlkioWeight(cgroup, def->blkio.weight);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set Blkio weight for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    if (def->cputune.shares) {
        rc = virCgroupSetCpuShares(cgroup, def->cputune.shares);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set cpu shares for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    /* The base memory limit is always applied */
    rc = virCgroupSetMemory(cgroup, def->mem.max_balloon);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to set memory limit for domain %s"),
                             def->name);
        goto cleanup;
    }

    if (def->mem.hard_limit) {
        rc = virCgroupSetMemoryHardLimit(cgroup, def->mem.hard_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set memory hard limit for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    if (def->mem.soft_limit) {
        rc = virCgroupSetMemorySoftLimit(cgroup, def->mem.soft_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set memory soft limit for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    if (def->mem.swap_hard_limit) {
        rc = virCgroupSetMemSwapHardLimit(cgroup, def->mem.swap_hard_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set swap hard limit for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    /* Deny everything first, then open up the whitelist */
    rc = virCgroupDenyAllDevices(cgroup);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to deny devices for domain %s"),
                             def->name);
        goto cleanup;
    }

    for (i = 0; devices[i].type != 0; i++) {
        const struct cgroup_device_policy *dev = &devices[i];
        rc = virCgroupAllowDevice(cgroup,
                                  dev->type,
                                  dev->major,
                                  dev->minor,
                                  VIR_CGROUP_DEVICE_RWM);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to allow device %c:%d:%d for domain %s"),
                                 dev->type, dev->major, dev->minor, def->name);
            goto cleanup;
        }
    }

    rc = virCgroupAllowDeviceMajor(cgroup, 'c', LXC_DEV_MAJ_PTY,
                                   VIR_CGROUP_DEVICE_RWM);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to allow PTY devices for domain %s"),
                             def->name);
        goto cleanup;
    }

    /* Last step: place ourselves in the cgroup; rc falls through to
     * cleanup either way */
    rc = virCgroupAddTask(cgroup, getpid());
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to add task %d to cgroup for domain %s"),
                             getpid(), def->name);
    }

cleanup:
    virCgroupFree(&driver);
    virCgroupFree(&cgroup);

    return rc;
}

214 215 216
/**
 * lxcMonitorPath
 * @def: domain definition supplying the container name
 *
 * Builds the path of the controller's monitor socket, which is
 * LXC_STATE_DIR followed by "/<name>.sock".
 *
 * Returns a newly allocated string, or NULL (with an OOM error
 * reported) if the path could not be formatted
 */
static char *lxcMonitorPath(virDomainDefPtr def)
{
    char *sockpath = NULL;

    if (virAsprintf(&sockpath, "%s/%s.sock",
                    LXC_STATE_DIR, def->name) < 0) {
        virReportOOMError();
        /* Do not depend on virAsprintf() having reset the pointer */
        return NULL;
    }

    return sockpath;
}

static int lxcMonitorServer(const char *sockpath)
{
    int fd;
    struct sockaddr_un addr;

    if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) {
230
        virReportSystemError(errno,
231 232
                             _("failed to create server socket '%s'"),
                             sockpath);
233 234 235 236 237 238
        goto error;
    }

    unlink(sockpath);
    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
C
Chris Lalancette 已提交
239
    if (virStrcpyStatic(addr.sun_path, sockpath) == NULL) {
240
        lxcError(VIR_ERR_INTERNAL_ERROR,
C
Chris Lalancette 已提交
241 242 243
                 _("Socket path %s too long for destination"), sockpath);
        goto error;
    }
244 245

    if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
246
        virReportSystemError(errno,
247 248
                             _("failed to bind server socket '%s'"),
                             sockpath);
249 250 251
        goto error;
    }
    if (listen(fd, 30 /* backlog */ ) < 0) {
252
        virReportSystemError(errno,
253 254
                             _("failed to listen server socket %s"),
                             sockpath);
255 256 257 258 259 260
        goto error;
    }

    return fd;

error:
261
    VIR_FORCE_CLOSE(fd);
262 263
    return -1;
}
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284

/**
 * lxcFdForward:
 * @readFd: file descriptor to read
 * @writeFd: file descriptor to write
 *
 * Reads 1 byte of data from readFd and writes it to writeFd.
 *
 * Returns 0 on success, EAGAIN if the read failed with errno set to
 * EAGAIN, or -1 in case of any other error (which is reported)
 */
static int lxcFdForward(int readFd, int writeFd)
{
    char byte;

    if (saferead(readFd, &byte, 1) != 1) {
        /* NOTE(review): a short read of 0 bytes (EOF) presumably leaves
         * errno untouched, so this test may see a stale value - confirm
         * against saferead() */
        if (errno == EAGAIN)
            return EAGAIN;

        virReportSystemError(errno,
                             _("read of fd %d failed"),
                             readFd);
        return -1;
    }

    if (safewrite(writeFd, &byte, 1) != 1) {
        virReportSystemError(errno,
                             _("write to fd %d failed"),
                             writeFd);
        return -1;
    }

    return 0;
}

D
Daniel P. Berrange 已提交
304 305 306 307 308 309 310 311 312

/**
 * lxcControllerClearCapabilities
 *
 * When built with libcap-ng, clears the capability sets selected by
 * CAPNG_SELECT_BOTH and applies the result to this process.  Without
 * libcap-ng only a warning is logged.
 *
 * Returns 0 on success (or when libcap-ng is unavailable), -1 if the
 * cleared capabilities could not be applied
 */
static int lxcControllerClearCapabilities(void)
{
#if HAVE_CAPNG
    int status;

    capng_clear(CAPNG_SELECT_BOTH);

    status = capng_apply(CAPNG_SELECT_BOTH);
    if (status < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("failed to apply capabilities: %d"), status);
        return -1;
    }

    return 0;
#else
    VIR_WARN0("libcap-ng support not compiled in, unable to clear capabilities");
    return 0;
#endif
}

323 324 325 326 327
/* One endpoint of the pty pair that lxcControllerMain() forwards
 * data between. */
typedef struct _lxcTtyForwardFd_t {
    int fd;      /* file descriptor of this endpoint */
    int active;  /* set to 1 on an EPOLLIN event, reset to 0 on EAGAIN or EPOLLHUP */
} lxcTtyForwardFd_t;

328 329 330 331 332 333 334 335 336 337 338
/* Decide whether an accept() that failed right after epoll reported the
   listening socket may simply be ignored, based on its errno value.
   Returns true for EINVAL, ECONNABORTED, EAGAIN and EWOULDBLOCK,
   false for everything else.  */
static bool
ignorable_epoll_accept_errno(int errnum)
{
    if (errnum == EINVAL || errnum == ECONNABORTED)
        return true;

    /* Kept as two comparisons: the two constants may or may not share
       a value depending on the platform.  */
    if (errnum == EAGAIN || errnum == EWOULDBLOCK)
        return true;

    return false;
}

339 340 341 342 343 344 345 346 347 348 349 350
/* Reap @container without blocking if it has already exited, then probe
   the pid with signal 0.  Returns true only when that probe fails with
   ESRCH, false in every other case.  */
static bool
lxcPidGone(pid_t container)
{
    int probe;

    /* Result deliberately ignored: this call only collects a zombie */
    waitpid(container, NULL, WNOHANG);

    probe = kill(container, 0);

    return probe < 0 && errno == ESRCH;
}

351
/**
352 353 354 355 356
 * lxcControllerMain
 * @monitor: server socket fd to accept client requests
 * @client: initial client which is the libvirtd daemon
 * @appPty: open fd for application facing Pty
 * @contPty: open fd for container facing Pty
357 358 359 360 361 362 363 364
 *
 * Forwards traffic between fds.  Data read from appPty will be written to contPty
 * This process loops forever.
 * This uses epoll in edge triggered mode to avoid a hard loop on POLLHUP
 * events when the user disconnects the virsh console via ctrl-]
 *
 * Returns 0 on success or -1 in case of error
 */
365 366 367
static int lxcControllerMain(int monitor,
                             int client,
                             int appPty,
368 369
                             int contPty,
                             pid_t container)
370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
{
    int rc = -1;
    int epollFd;
    struct epoll_event epollEvent;
    int numEvents;
    int numActive = 0;
    lxcTtyForwardFd_t fdArray[2];
    int timeout = -1;
    int curFdOff = 0;
    int writeFdOff = 0;

    fdArray[0].fd = appPty;
    fdArray[0].active = 0;
    fdArray[1].fd = contPty;
    fdArray[1].active = 0;
385 386 387 388

    VIR_DEBUG("monitor=%d client=%d appPty=%d contPty=%d",
              monitor, client, appPty, contPty);

389 390 391
    /* create the epoll fild descriptor */
    epollFd = epoll_create(2);
    if (0 > epollFd) {
392
        virReportSystemError(errno, "%s",
393
                             _("epoll_create(2) failed"));
394 395 396 397 398 399 400 401
        goto cleanup;
    }

    /* add the file descriptors the epoll fd */
    memset(&epollEvent, 0x00, sizeof(epollEvent));
    epollEvent.events = EPOLLIN|EPOLLET;    /* edge triggered */
    epollEvent.data.fd = appPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, appPty, &epollEvent)) {
402
        virReportSystemError(errno, "%s",
403
                             _("epoll_ctl(appPty) failed"));
404 405 406 407
        goto cleanup;
    }
    epollEvent.data.fd = contPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, contPty, &epollEvent)) {
408
        virReportSystemError(errno, "%s",
409
                             _("epoll_ctl(contPty) failed"));
410 411 412
        goto cleanup;
    }

413 414 415
    epollEvent.events = EPOLLIN;
    epollEvent.data.fd = monitor;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, monitor, &epollEvent)) {
416
        virReportSystemError(errno, "%s",
417
                             _("epoll_ctl(monitor) failed"));
418 419 420 421 422 423
        goto cleanup;
    }

    epollEvent.events = EPOLLHUP;
    epollEvent.data.fd = client;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
424
        virReportSystemError(errno, "%s",
425
                             _("epoll_ctl(client) failed"));
426 427 428
        goto cleanup;
    }

429 430 431 432
    while (1) {
        /* if active fd's, return if no events, else wait forever */
        timeout = (numActive > 0) ? 0 : -1;
        numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout);
433 434 435
        if (numEvents > 0) {
            if (epollEvent.data.fd == monitor) {
                int fd = accept(monitor, NULL, 0);
436 437 438 439 440 441 442 443 444 445 446 447
                if (fd < 0) {
                    /* First reflex may be simply to declare accept failure
                       to be a fatal error.  However, accept may fail when
                       a client quits between the above epoll_wait and here.
                       That case is not fatal, but rather to be expected,
                       if not common, so ignore it.  */
                    if (ignorable_epoll_accept_errno(errno))
                        continue;
                    virReportSystemError(errno, "%s",
                                         _("accept(monitor,...) failed"));
                    goto cleanup;
                }
448
                if (client != -1) { /* Already connected, so kick new one out */
449
                    VIR_FORCE_CLOSE(fd);
450
                    continue;
451
                }
452 453 454 455
                client = fd;
                epollEvent.events = EPOLLHUP;
                epollEvent.data.fd = client;
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
456
                    virReportSystemError(errno, "%s",
457
                                         _("epoll_ctl(client) failed"));
458 459 460 461
                    goto cleanup;
                }
            } else if (client != -1 && epollEvent.data.fd == client) {
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_DEL, client, &epollEvent)) {
462
                    virReportSystemError(errno, "%s",
463
                                         _("epoll_ctl(client) failed"));
464 465
                    goto cleanup;
                }
466
                VIR_FORCE_CLOSE(client);
467
            } else {
468 469 470 471 472 473 474
                if (epollEvent.events & EPOLLIN) {
                    curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
                    if (!fdArray[curFdOff].active) {
                        fdArray[curFdOff].active = 1;
                        ++numActive;
                    }
                } else if (epollEvent.events & EPOLLHUP) {
475 476 477 478 479 480 481
                    if (lxcPidGone(container))
                        goto cleanup;
                    curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
                    if (fdArray[curFdOff].active) {
                        fdArray[curFdOff].active = 0;
                        --numActive;
                    }
482 483
                    continue;
                } else {
484
                    lxcError(VIR_ERR_INTERNAL_ERROR,
485 486 487
                             _("error event %d"), epollEvent.events);
                    goto cleanup;
                }
488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
            }
        } else if (0 == numEvents) {
            if (2 == numActive) {
                /* both fds active, toggle between the two */
                curFdOff ^= 1;
            } else {
                /* only one active, if current is active, use it, else it */
                /* must be the other one (ie. curFd just went inactive) */
                curFdOff = fdArray[curFdOff].active ? curFdOff : curFdOff ^ 1;
            }

        } else  {
            if (EINTR == errno) {
                continue;
            }

            /* error */
505
            virReportSystemError(errno, "%s",
506
                                 _("epoll_wait() failed"));
507 508 509 510 511 512 513 514 515 516 517 518 519
            goto cleanup;

        }

        if (0 < numActive) {
            writeFdOff = curFdOff ^ 1;
            rc = lxcFdForward(fdArray[curFdOff].fd, fdArray[writeFdOff].fd);

            if (EAGAIN == rc) {
                /* this fd no longer has data, set it as inactive */
                --numActive;
                fdArray[curFdOff].active = 0;
            } else if (-1 == rc) {
520 521 522
                if (lxcPidGone(container))
                    goto cleanup;
                continue;
523 524 525 526 527 528 529 530 531
            }

        }

    }

    rc = 0;

cleanup:
532 533 534
    VIR_FORCE_CLOSE(appPty);
    VIR_FORCE_CLOSE(contPty);
    VIR_FORCE_CLOSE(epollFd);
535 536 537
    return rc;
}

538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555


/**
 * lxcControllerMoveInterfaces
 * @nveths: number of interfaces
 * @veths: interface names
 * @container: pid of container
 *
 * Moves each named network interface into the container's namespace,
 * stopping at the first interface that cannot be moved.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcControllerMoveInterfaces(unsigned int nveths,
                                       char **veths,
                                       pid_t container)
{
    unsigned int idx = 0;

    while (idx < nveths) {
        if (moveInterfaceToNetNs(veths[idx], container) < 0)
            return -1;
        idx++;
    }

    return 0;
}


/**
 * lxcControllerCleanupInterfaces:
 * @nveths: number of interfaces
 * @veths: interface names
 *
 * Cleans up the container interfaces by calling vethDelete() on every
 * name in @veths.  The result of each deletion is ignored.
 *
 * Always returns 0
 */
static int lxcControllerCleanupInterfaces(unsigned int nveths,
                                          char **veths)
{
    unsigned int idx = 0;

    while (idx < nveths) {
        vethDelete(veths[idx]);
        idx++;
    }

    return 0;
}

582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600
/**
 * lxcSetPersonality
 * @def: domain definition whose os.arch is compared with the host
 *
 * Looks up the alternate 32-bit architecture of the host machine and,
 * if the domain asks for exactly that architecture, switches the
 * process to the PER_LINUX32 personality.
 *
 * Returns 0 on success (including when no switch is needed) or -1 in
 * case of error
 */
static int lxcSetPersonality(virDomainDefPtr def)
{
    struct utsname utsname;
    const char *altArch;

    /* Without a successful uname() the machine field is indeterminate */
    if (uname(&utsname) < 0) {
        virReportSystemError(errno, "%s",
                             _("Unable to determine host architecture"));
        return -1;
    }

    altArch = lxcContainerGetAlt32bitArch(utsname.machine);
    if (altArch &&
        STREQ(def->os.arch, altArch)) {
        if (personality(PER_LINUX32) < 0) {
            virReportSystemError(errno, _("Unable to request personality for %s on %s"),
                                 altArch, utsname.machine);
            return -1;
        }
    }
    return 0;
}

601
/* Fallback values for mount flags, used only when the headers included
 * above have not already defined them. */
#ifndef MS_REC
# define MS_REC          (1<<14)
#endif

#ifndef MS_SLAVE
# define MS_SLAVE        (1<<19)
#endif
608 609

static int
610
lxcControllerRun(virDomainDefPtr def,
611 612 613 614 615 616 617 618
                 unsigned int nveths,
                 char **veths,
                 int monitor,
                 int client,
                 int appPty)
{
    int rc = -1;
    int control[2] = { -1, -1};
619
    int containerPty = -1;
620
    char *containerPtyPath = NULL;
621
    pid_t container = -1;
622 623 624
    virDomainFSDefPtr root;
    char *devpts = NULL;
    char *devptmx = NULL;
625 626

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
627
        virReportSystemError(errno, "%s",
628
                             _("sockpair failed"));
629 630 631
        goto cleanup;
    }

632 633
    root = virDomainGetRootFilesystem(def);

634 635 636
    if (lxcSetContainerResources(def) < 0)
        goto cleanup;

637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659
    /*
     * If doing a chroot style setup, we need to prepare
     * a private /dev/pts for the child now, which they
     * will later move into position.
     *
     * This is complex because 'virsh console' needs to
     * use /dev/pts from the host OS, and the guest OS
     * needs to use /dev/pts from the guest.
     *
     * This means that we (libvirt_lxc) need to see and
     * use both /dev/pts instances. We're running in the
     * host OS context though and don't want to expose
     * the guest OS /dev/pts there.
     *
     * Thus we call unshare(CLONE_NS) so that we can see
     * the guest's new /dev/pts, without it becoming
     * visible to the host OS. We also put the root FS
     * into slave mode, just in case it was currently
     * marked as shared
     */
    if (root) {
        VIR_DEBUG0("Setting up private /dev/pts");
        if (unshare(CLONE_NEWNS) < 0) {
660
            virReportSystemError(errno, "%s",
661
                                 _("Cannot unshare mount namespace"));
662 663 664 665
            goto cleanup;
        }

        if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
666
            virReportSystemError(errno, "%s",
667
                                 _("Failed to switch root mount into slave mode"));
668 669 670 671 672
            goto cleanup;
        }

        if (virAsprintf(&devpts, "%s/dev/pts", root->src) < 0 ||
            virAsprintf(&devptmx, "%s/dev/pts/ptmx", root->src) < 0) {
673
            virReportOOMError();
674 675 676
            goto cleanup;
        }

L
Laine Stump 已提交
677
        if (virFileMakePath(devpts) != 0) {
678
            virReportSystemError(errno,
679
                                 _("Failed to make path %s"),
680 681 682 683 684
                                 devpts);
            goto cleanup;
        }

        VIR_DEBUG("Mouting 'devpts' on %s", devpts);
685 686
        if (mount("devpts", devpts, "devpts", 0,
                  "newinstance,ptmxmode=0666,mode=0620,gid=5") < 0) {
687
            virReportSystemError(errno,
688
                                 _("Failed to mount devpts on %s"),
689 690 691 692 693
                                 devpts);
            goto cleanup;
        }

        if (access(devptmx, R_OK) < 0) {
694
            VIR_WARN0("Kernel does not support private devpts, using shared devpts");
695 696
            VIR_FREE(devptmx);
        }
697 698
    }

699 700 701 702 703 704
    if (devptmx) {
        VIR_DEBUG("Opening tty on private %s", devptmx);
        if (virFileOpenTtyAt(devptmx,
                             &containerPty,
                             &containerPtyPath,
                             0) < 0) {
705
            virReportSystemError(errno, "%s",
706
                                 _("Failed to allocate tty"));
707 708 709 710 711 712 713
            goto cleanup;
        }
    } else {
        VIR_DEBUG0("Opening tty on shared /dev/ptmx");
        if (virFileOpenTty(&containerPty,
                           &containerPtyPath,
                           0) < 0) {
714
            virReportSystemError(errno, "%s",
715
                                 _("Failed to allocate tty"));
716 717 718 719
            goto cleanup;
        }
    }

720 721
    if (lxcSetPersonality(def) < 0)
        goto cleanup;
722

723 724 725 726 727 728
    if ((container = lxcContainerStart(def,
                                       nveths,
                                       veths,
                                       control[1],
                                       containerPtyPath)) < 0)
        goto cleanup;
729
    VIR_FORCE_CLOSE(control[1]);
730 731 732 733 734 735 736

    if (lxcControllerMoveInterfaces(nveths, veths, container) < 0)
        goto cleanup;

    if (lxcContainerSendContinue(control[0]) < 0)
        goto cleanup;

D
Daniel P. Berrange 已提交
737 738 739 740 741
    /* Now the container is running, there's no need for us to keep
       any elevated capabilities */
    if (lxcControllerClearCapabilities() < 0)
        goto cleanup;

742
    rc = lxcControllerMain(monitor, client, appPty, containerPty, container);
743 744

cleanup:
745 746
    VIR_FREE(devptmx);
    VIR_FREE(devpts);
747 748
    VIR_FORCE_CLOSE(control[0]);
    VIR_FORCE_CLOSE(control[1]);
749
    VIR_FREE(containerPtyPath);
750
    VIR_FORCE_CLOSE(containerPty);
751

752
    if (container > 1) {
753
        int status;
754
        kill(container, SIGTERM);
755 756 757
        if (!(waitpid(container, &status, WNOHANG) == 0 &&
            WIFEXITED(status)))
            kill(container, SIGKILL);
758 759
        waitpid(container, NULL, 0);
    }
760 761 762 763
    return rc;
}


764
int main(int argc, char *argv[])
765 766
{
    pid_t pid;
767
    int rc = 1;
768
    int client;
769 770 771 772 773 774 775 776 777 778
    char *name = NULL;
    int nveths = 0;
    char **veths = NULL;
    int monitor = -1;
    int appPty = -1;
    int bg = 0;
    virCapsPtr caps = NULL;
    virDomainDefPtr def = NULL;
    char *configFile = NULL;
    char *sockpath = NULL;
779
    const struct option options[] = {
780 781 782 783 784 785 786
        { "background", 0, NULL, 'b' },
        { "name",   1, NULL, 'n' },
        { "veth",   1, NULL, 'v' },
        { "console", 1, NULL, 'c' },
        { "help", 0, NULL, 'h' },
        { 0, 0, 0, 0 },
    };
787

E
Eric Blake 已提交
788 789 790 791 792 793 794
    if (setlocale(LC_ALL, "") == NULL ||
        bindtextdomain(PACKAGE, LOCALEDIR) == NULL ||
        textdomain(PACKAGE) == NULL) {
        fprintf(stderr, _("%s: initialization failed\n"), argv[0]);
        exit(EXIT_FAILURE);
    }

795 796
    while (1) {
        int c;
797

798 799 800 801 802 803 804 805 806 807 808 809 810
        c = getopt_long(argc, argv, "dn:v:m:c:h",
                       options, NULL);

        if (c == -1)
            break;

        switch (c) {
        case 'b':
            bg = 1;
            break;

        case 'n':
            if ((name = strdup(optarg)) == NULL) {
811
                virReportOOMError();
812
                goto cleanup;
813
            }
814 815 816 817
            break;

        case 'v':
            if (VIR_REALLOC_N(veths, nveths+1) < 0) {
818
                virReportOOMError();
819
                goto cleanup;
820
            }
821
            if ((veths[nveths++] = strdup(optarg)) == NULL) {
822
                virReportOOMError();
823
                goto cleanup;
824
            }
825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847
            break;

        case 'c':
            if (virStrToLong_i(optarg, NULL, 10, &appPty) < 0) {
                fprintf(stderr, "malformed --console argument '%s'", optarg);
                goto cleanup;
            }
            break;

        case 'h':
        case '?':
            fprintf(stderr, "\n");
            fprintf(stderr, "syntax: %s [OPTIONS]\n", argv[0]);
            fprintf(stderr, "\n");
            fprintf(stderr, "Options\n");
            fprintf(stderr, "\n");
            fprintf(stderr, "  -b, --background\n");
            fprintf(stderr, "  -n NAME, --name NAME\n");
            fprintf(stderr, "  -c FD, --console FD\n");
            fprintf(stderr, "  -v VETH, --veth VETH\n");
            fprintf(stderr, "  -h, --help\n");
            fprintf(stderr, "\n");
            goto cleanup;
848 849 850 851
        }
    }


852 853 854 855 856 857 858 859 860 861
    if (name == NULL) {
        fprintf(stderr, "%s: missing --name argument for configuration\n", argv[0]);
        goto cleanup;
    }

    if (appPty < 0) {
        fprintf(stderr, "%s: missing --console argument for container PTY\n", argv[0]);
        goto cleanup;
    }

862
    if (getuid() != 0) {
863 864 865
        fprintf(stderr, "%s: must be run as the 'root' user\n", argv[0]);
        goto cleanup;
    }
866

867 868
    if ((caps = lxcCapsInit()) == NULL)
        goto cleanup;
869

870
    if ((configFile = virDomainConfigFile(LXC_STATE_DIR,
871 872
                                          name)) == NULL)
        goto cleanup;
873

874
    if ((def = virDomainDefParseFile(caps, configFile,
875
                                     VIR_DOMAIN_XML_INACTIVE)) == NULL)
876
        goto cleanup;
877

878
    if (def->nnets != nveths) {
879
        fprintf(stderr, "%s: expecting %d veths, but got %d\n",
880
                argv[0], def->nnets, nveths);
881
        goto cleanup;
882 883
    }

884 885
    if ((sockpath = lxcMonitorPath(def)) == NULL)
        goto cleanup;
886

887 888
    if ((monitor = lxcMonitorServer(sockpath)) < 0)
        goto cleanup;
889

890 891 892
    if (bg) {
        if ((pid = fork()) < 0)
            goto cleanup;
893

894 895
        if (pid > 0) {
            if ((rc = virFileWritePid(LXC_STATE_DIR, name, pid)) != 0) {
896
                virReportSystemError(rc,
897 898
                                     _("Unable to write pid file '%s/%s.pid'"),
                                     LXC_STATE_DIR, name);
899 900
                _exit(1);
            }
901

902 903 904 905
            /* First child now exits, allowing original caller
             * (ie libvirtd's LXC driver to complete their
             * waitpid & continue */
            _exit(0);
906 907
        }

908 909
        /* Don't hold onto any cwd we inherit from libvirtd either */
        if (chdir("/") < 0) {
910
            virReportSystemError(errno, "%s",
911
                                 _("Unable to change to root dir"));
912 913 914 915
            goto cleanup;
        }

        if (setsid() < 0) {
916
            virReportSystemError(errno, "%s",
917
                                 _("Unable to become session leader"));
918 919 920
            goto cleanup;
        }
    }
921

A
Amy Griffis 已提交
922 923 924
    /* Initialize logging */
    virLogSetFromEnv();

925
    /* Accept initial client which is the libvirtd daemon */
926
    if ((client = accept(monitor, NULL, 0)) < 0) {
927
        virReportSystemError(errno, "%s",
928
                             _("Failed to accept a connection from driver"));
929
        goto cleanup;
930 931
    }

932
    rc = lxcControllerRun(def, nveths, veths, monitor, client, appPty);
933 934


935
cleanup:
936 937
    if (def)
        virFileDeletePid(LXC_STATE_DIR, def->name);
938
    lxcControllerCleanupInterfaces(nveths, veths);
J
Jim Meyering 已提交
939 940
    if (sockpath)
        unlink(sockpath);
941 942 943 944
    VIR_FREE(sockpath);

    return rc;
}