/*
 * Copyright (C) 2010-2016 Red Hat, Inc.
 * Copyright IBM Corp. 2008
 *
 * lxc_controller.c: linux container process controller
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#include <sys/epoll.h>
25
#include <sys/wait.h>
26

27
#ifdef __linux__
28 29 30
# include <sys/sysmacros.h>
#endif

31
#include <sys/personality.h>
32
#include <unistd.h>
33 34
#include <fcntl.h>
#include <signal.h>
35
#include <getopt.h>
36
#include <sys/mount.h>
37 38
#include <grp.h>
#include <sys/stat.h>
39
#include <time.h>
40

41
#if WITH_CAPNG
42
# include <cap-ng.h>
D
Daniel P. Berrange 已提交
43 44
#endif

45
#include "virerror.h"
46
#include "virlog.h"
47 48

#include "lxc_conf.h"
49
#include "lxc_container.h"
50
#include "lxc_cgroup.h"
51
#include "lxc_monitor_protocol.h"
G
Gao feng 已提交
52
#include "lxc_fuse.h"
53 54
#include "virnetdev.h"
#include "virnetdevveth.h"
55
#include "viralloc.h"
E
Eric Blake 已提交
56
#include "virfile.h"
57
#include "virpidfile.h"
58
#include "vircommand.h"
59
#include "virhostcpu.h"
60
#include "virrandom.h"
61
#include "virprocess.h"
62
#include "virnuma.h"
63
#include "virdbus.h"
64
#include "rpc/virnetdaemon.h"
65
#include "virstring.h"
66
#include "virgettext.h"
67
#include "virsocket.h"
J
Ján Tomko 已提交
68
#include "virutil.h"
69

70 71
#define VIR_FROM_THIS VIR_FROM_LXC

72 73
VIR_LOG_INIT("lxc.lxc_controller");

74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
typedef struct _virLXCControllerConsole virLXCControllerConsole;
typedef virLXCControllerConsole *virLXCControllerConsolePtr;
struct _virLXCControllerConsole {
    int hostWatch;
    int hostFd;  /* PTY FD in the host OS */
    bool hostClosed;
    int hostEpoll;

    int contWatch;
    int contFd;  /* PTY FD in the container */
    bool contClosed;
    int contEpoll;

    int epollWatch;
    int epollFd; /* epoll FD for dealing with EOF */

    size_t fromHostLen;
    char fromHostBuf[1024];
    size_t fromContLen;
    char fromContBuf[1024];
94

95
    virNetDaemonPtr daemon;
96 97
};

98 99 100 101
typedef struct _virLXCController virLXCController;
typedef virLXCController *virLXCControllerPtr;
struct _virLXCController {
    char *name;
102
    virDomainObjPtr vm;
103
    virDomainDefPtr def;
104

105 106
    int handshakeFd;

107 108
    pid_t initpid;

109 110 111
    size_t nnbdpids;
    pid_t *nbdpids;

112 113
    size_t nveths;
    char **veths;
114

115 116 117
    size_t nnicindexes;
    int *nicindexes;

118 119 120
    size_t npassFDs;
    int *passFDs;

I
ik.nitk 已提交
121 122
    int *nsFDs;

123 124
    size_t nconsoles;
    virLXCControllerConsolePtr consoles;
125
    char *devptmx;
126 127 128

    size_t nloopDevs;
    int *loopDevFds;
129 130

    virSecurityManagerPtr securityManager;
131

132
    virNetDaemonPtr daemon;
133
    bool firstClient;
134 135 136 137
    virNetServerClientPtr client;
    virNetServerProgramPtr prog;
    bool inShutdown;
    int timerShutdown;
G
Gao feng 已提交
138

139 140
    virCgroupPtr cgroup;

G
Gao feng 已提交
141
    virLXCFusePtr fuse;
142 143
};

144 145
#include "lxc_controller_dispatch.h"

146
static void virLXCControllerFree(virLXCControllerPtr ctrl);
147 148
static int virLXCControllerEventSendInit(virLXCControllerPtr ctrl,
                                         pid_t initpid);
149

J
Ján Tomko 已提交
150
/*
 * Timer callback: asks the controller's daemon to leave its event loop.
 * @opaque is the virLXCControllerPtr that was registered with the timer.
 */
static void virLXCControllerQuitTimer(int timer G_GNUC_UNUSED, void *opaque)
{
    virLXCControllerPtr controller = opaque;

    VIR_DEBUG("Triggering event loop quit");

    virNetDaemonQuit(controller->daemon);
}


159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
/*
 * Builds a minimal driver object (lock, capabilities and XML options).
 *
 * Returns the new driver, or NULL if the mutex could not be initialised.
 */
static virLXCDriverPtr
virLXCControllerDriverNew(void)
{
    virLXCDriverPtr drv = g_new0(virLXCDriver, 1);

    if (virMutexInit(&drv->lock) >= 0) {
        drv->caps = virLXCDriverCapsInit(NULL);
        drv->xmlopt = lxcDomainXMLConfInit(drv);
        return drv;
    }

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   "%s", _("cannot initialize mutex"));
    g_free(drv);
    return NULL;
}


/*
 * Releases a driver created by virLXCControllerDriverNew().
 * A NULL @driver is accepted and ignored.
 */
static void
virLXCControllerDriverFree(virLXCDriverPtr driver)
{
    if (driver != NULL) {
        virObjectUnref(driver->xmlopt);
        virObjectUnref(driver->caps);
        virMutexDestroy(&driver->lock);
        g_free(driver);
    }
}


190 191 192
static virLXCControllerPtr virLXCControllerNew(const char *name)
{
    virLXCControllerPtr ctrl = NULL;
193
    virLXCDriverPtr driver = NULL;
194 195
    char *configFile = NULL;

196
    if (VIR_ALLOC(ctrl) < 0)
197
        goto error;
198

199
    ctrl->timerShutdown = -1;
200
    ctrl->firstClient = true;
201

202
    ctrl->name = g_strdup(name);
203

204
    if (!(driver = virLXCControllerDriverNew()))
205 206
        goto error;

207 208 209 210
    if ((configFile = virDomainConfigFile(LXC_STATE_DIR,
                                          ctrl->name)) == NULL)
        goto error;

211
    if ((ctrl->vm = virDomainObjParseFile(configFile,
212
                                          driver->xmlopt,
213
                                          0)) == NULL)
214
        goto error;
215
    ctrl->def = ctrl->vm->def;
216

217 218 219 220 221
    if ((ctrl->timerShutdown = virEventAddTimeout(-1,
                                                  virLXCControllerQuitTimer, ctrl,
                                                  NULL)) < 0)
        goto error;

222
 cleanup:
223
    VIR_FREE(configFile);
224
    virLXCControllerDriverFree(driver);
225 226
    return ctrl;

227
 error:
228 229 230 231 232
    virLXCControllerFree(ctrl);
    ctrl = NULL;
    goto cleanup;
}

233

234
/*
 * Closes every loop device descriptor recorded in @ctrl.
 * Always returns 0.
 */
static int virLXCControllerCloseLoopDevices(virLXCControllerPtr ctrl)
{
    size_t idx = 0;

    while (idx < ctrl->nloopDevs) {
        VIR_FORCE_CLOSE(ctrl->loopDevFds[idx]);
        idx++;
    }

    return 0;
}


245 246 247 248 249
/*
 * Stops the container init process, if one is recorded: the loop
 * devices are closed first, then the process is aborted and the
 * recorded pid is reset to 0. Does nothing when initpid is already 0.
 */
static void virLXCControllerStopInit(virLXCControllerPtr ctrl)
{
    if (ctrl->initpid != 0) {
        virLXCControllerCloseLoopDevices(ctrl);
        virProcessAbort(ctrl->initpid);
        ctrl->initpid = 0;
    }
}


256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
/*
 * Tears down one console: for the host, container and epoll endpoints
 * (in that order) the event handle is removed when one is registered
 * and the descriptor is closed.
 */
static void virLXCControllerConsoleClose(virLXCControllerConsolePtr console)
{
    struct {
        int *watch;
        int *fd;
    } endpoints[] = {
        { &console->hostWatch, &console->hostFd },
        { &console->contWatch, &console->contFd },
        { &console->epollWatch, &console->epollFd },
    };
    size_t n;

    for (n = 0; n < G_N_ELEMENTS(endpoints); n++) {
        if (*endpoints[n].watch != -1)
            virEventRemoveHandle(*endpoints[n].watch);
        VIR_FORCE_CLOSE(*endpoints[n].fd);
    }
}


G
Gao feng 已提交
272 273 274 275 276 277 278
/*
 * Releases the FUSE state attached to @ctrl by handing the address of
 * ctrl->fuse to lxcFreeFuse().
 */
static void
virLXCControllerFreeFuse(virLXCControllerPtr ctrl)
{
    /* No 'return <expr>;' here: ISO C forbids a return statement with
     * an expression in a function returning void. */
    lxcFreeFuse(&ctrl->fuse);
}


279 280
/*
 * Releases a controller and everything it owns. A NULL @ctrl is
 * accepted and ignored.
 *
 * The init process is stopped first; the handshake descriptor is
 * deliberately the last descriptor to be closed.
 */
static void virLXCControllerFree(virLXCControllerPtr ctrl)
{
    size_t i;

    if (!ctrl)
        return;

    virLXCControllerStopInit(ctrl);

    /* virLXCControllerStopInit() skips the loop devices when no init
     * process is recorded, so close whatever is still open and release
     * the array, which was previously leaked. */
    virLXCControllerCloseLoopDevices(ctrl);
    VIR_FREE(ctrl->loopDevFds);

    virObjectUnref(ctrl->securityManager);

    for (i = 0; i < ctrl->nveths; i++)
        VIR_FREE(ctrl->veths[i]);
    VIR_FREE(ctrl->veths);
    VIR_FREE(ctrl->nicindexes);

    for (i = 0; i < ctrl->npassFDs; i++)
        VIR_FORCE_CLOSE(ctrl->passFDs[i]);
    VIR_FREE(ctrl->passFDs);

    for (i = 0; i < ctrl->nconsoles; i++)
        virLXCControllerConsoleClose(&(ctrl->consoles[i]));
    VIR_FREE(ctrl->consoles);

    VIR_FREE(ctrl->devptmx);

    virDomainObjEndAPI(&ctrl->vm);
    VIR_FREE(ctrl->name);

    if (ctrl->timerShutdown != -1)
        virEventRemoveTimeout(ctrl->timerShutdown);

    virObjectUnref(ctrl->daemon);
    virLXCControllerFreeFuse(ctrl);

    VIR_FREE(ctrl->nbdpids);

    VIR_FREE(ctrl->nsFDs);
    virCgroupFree(&ctrl->cgroup);

    /* This must always be the last thing to be closed */
    VIR_FORCE_CLOSE(ctrl->handshakeFd);
    VIR_FREE(ctrl);
}

324

325 326 327
/*
 * Appends a console entry for @hostFd to @ctrl.
 *
 * The new entry shares the controller's daemon pointer; its container
 * and epoll descriptors and all of its watches start out as -1.
 *
 * Returns 0 on success, -1 if the console array could not be grown.
 */
static int virLXCControllerAddConsole(virLXCControllerPtr ctrl,
                                      int hostFd)
{
    virLXCControllerConsolePtr slot;

    if (VIR_EXPAND_N(ctrl->consoles, ctrl->nconsoles, 1) < 0)
        return -1;

    slot = &ctrl->consoles[ctrl->nconsoles - 1];

    slot->daemon = ctrl->daemon;

    slot->hostFd = hostFd;
    slot->hostWatch = -1;

    slot->contFd = -1;
    slot->contWatch = -1;

    slot->epollFd = -1;
    slot->epollWatch = -1;

    return 0;
}


/*
 * Switches both ends of @console (host first, then container) to
 * non-blocking mode. The container end is not touched if the host end
 * fails.
 *
 * Returns 0 on success, -1 with an error reported otherwise.
 */
static int virLXCControllerConsoleSetNonblocking(virLXCControllerConsolePtr console)
{
    int rc = virSetBlocking(console->hostFd, false);

    if (rc >= 0)
        rc = virSetBlocking(console->contFd, false);

    if (rc >= 0)
        return 0;

    virReportSystemError(errno, "%s",
                         _("Unable to set console file descriptor non-blocking"));
    return -1;
}


356 357 358 359 360 361 362 363 364 365 366 367
/*
 * Sends the "continue" message over the handshake descriptor and, on
 * success, closes that descriptor.
 *
 * Returns 0 on success, -1 with an error reported otherwise (the
 * descriptor is left open in that case).
 */
static int virLXCControllerDaemonHandshake(virLXCControllerPtr ctrl)
{
    if (lxcContainerSendContinue(ctrl->handshakeFd) >= 0) {
        VIR_FORCE_CLOSE(ctrl->handshakeFd);
        return 0;
    }

    virReportSystemError(errno, "%s",
                         _("error sending continue signal to daemon"));
    return -1;
}


368 369 370
/*
 * Checks that the number of veths handed to the controller matches the
 * number of network interfaces in the domain definition.
 *
 * Returns 0 when they match, -1 with an error reported otherwise.
 */
static int virLXCControllerValidateNICs(virLXCControllerPtr ctrl)
{
    if (ctrl->def->nnets == ctrl->nveths)
        return 0;

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("expecting %zu veths, but got %zu"),
                   ctrl->def->nnets, ctrl->nveths);
    return -1;
}


381 382 383 384
/*
 * Collects into ctrl->nicindexes the ifindex of every bridge, network
 * or ethernet interface of the domain that has an ifname set.
 *
 * Only the "parent" veths of interfaces implemented with a veth pair
 * belong in this list: it is later used when calling
 * virCgroupNewMachine (and eventually the dbus method
 * CreateMachineWithNetwork). The child veths and macvlan ("direct")
 * interfaces *should not* be listed because they are moved into the
 * container; only interfaces that stay outside the container but are
 * used to talk to it are recorded.
 *
 * Returns 0 on success, -1 on lookup/allocation failure or when an
 * interface type is unsupported.
 */
static int virLXCControllerGetNICIndexes(virLXCControllerPtr ctrl)
{
    size_t n;

    VIR_DEBUG("Getting nic indexes");

    for (n = 0; n < ctrl->def->nnets; n++) {
        virDomainNetDefPtr net = ctrl->def->nets[n];
        virDomainNetType actualType = virDomainNetGetActualType(net);
        int ifindex = -1;

        switch (actualType) {
        case VIR_DOMAIN_NET_TYPE_BRIDGE:
        case VIR_DOMAIN_NET_TYPE_NETWORK:
        case VIR_DOMAIN_NET_TYPE_ETHERNET:
            /* Nothing to look up without an interface name */
            if (!net->ifname)
                continue;

            if (virNetDevGetIndex(net->ifname, &ifindex) < 0)
                return -1;

            if (VIR_EXPAND_N(ctrl->nicindexes, ctrl->nnicindexes, 1) < 0)
                return -1;

            VIR_DEBUG("Index %d for %s", ifindex, net->ifname);
            ctrl->nicindexes[ctrl->nnicindexes - 1] = ifindex;
            break;

        case VIR_DOMAIN_NET_TYPE_DIRECT:
            /* Deliberately not recorded */
            break;

        case VIR_DOMAIN_NET_TYPE_USER:
        case VIR_DOMAIN_NET_TYPE_VHOSTUSER:
        case VIR_DOMAIN_NET_TYPE_SERVER:
        case VIR_DOMAIN_NET_TYPE_CLIENT:
        case VIR_DOMAIN_NET_TYPE_MCAST:
        case VIR_DOMAIN_NET_TYPE_UDP:
        case VIR_DOMAIN_NET_TYPE_INTERNAL:
        case VIR_DOMAIN_NET_TYPE_HOSTDEV:
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Unsupported net type %s"),
                           virDomainNetTypeToString(actualType));
            return -1;

        case VIR_DOMAIN_NET_TYPE_LAST:
        default:
            virReportEnumRangeError(virDomainNetType, actualType);
            return -1;
        }
    }

    return 0;
}


444 445 446
/*
 * Checks that the number of console descriptors handed to the
 * controller matches the number of consoles in the domain definition.
 *
 * Returns 0 when they match, -1 with an error reported otherwise.
 */
static int virLXCControllerValidateConsoles(virLXCControllerPtr ctrl)
{
    if (ctrl->def->nconsoles == ctrl->nconsoles)
        return 0;

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("expecting %zu consoles, but got %zu tty file handlers"),
                   ctrl->def->nconsoles, ctrl->nconsoles);
    return -1;
}


457
/*
 * Associates the file backing @fs with a loop device and rewrites @fs
 * in place as a block-type filesystem whose source is that device.
 *
 * Returns the loop device descriptor on success, -1 on failure.
 */
static int virLXCControllerSetupLoopDeviceFS(virDomainFSDefPtr fs)
{
    char *loopPath = NULL;
    int loopFd = virFileLoopDeviceAssociate(fs->src->path, &loopPath);

    if (loopFd < 0)
        return -1;

    VIR_DEBUG("Changing fs %s to use type=block for dev %s",
              fs->src->path, loopPath);

    /* From here on the filesystem is presented as a block device so
     * that the remaining container setup needs no special casing. */
    fs->type = VIR_DOMAIN_FS_TYPE_BLOCK;
    VIR_FREE(fs->src->path);
    fs->src->path = loopPath; /* ownership moves to fs */

    return loopFd;
}


480 481 482 483
/*
 * Associates the file backing @disk with a loop device and rewrites
 * @disk in place as a block-type disk whose source is that device.
 *
 * Returns the loop device descriptor on success, -1 on failure (the
 * descriptor is closed again if the disk source cannot be updated).
 */
static int virLXCControllerSetupLoopDeviceDisk(virDomainDiskDefPtr disk)
{
    const char *src = virDomainDiskGetSource(disk);
    char *loopPath = NULL;
    int loopFd = virFileLoopDeviceAssociate(src, &loopPath);

    if (loopFd < 0)
        return -1;

    VIR_DEBUG("Changing disk %s to use type=block for dev %s",
              src, loopPath);

    /* From here on the disk is presented as a block device so that
     * the remaining container setup needs no special casing. */
    virDomainDiskSetType(disk, VIR_STORAGE_TYPE_BLOCK);
    if (virDomainDiskSetSource(disk, loopPath) < 0) {
        /* The callers treat a negative return as failure, which relies
         * on VIR_FORCE_CLOSE leaving loopFd negative. */
        VIR_FORCE_CLOSE(loopFd);
    }

    VIR_FREE(loopPath);

    return loopFd;
}


513 514 515 516 517 518 519 520 521 522
/*
 * Associates the file backing @fs with an NBD device and rewrites @fs
 * in place as a block-type filesystem whose source is that device.
 * An explicit image format is mandatory.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupNBDDeviceFS(virDomainFSDefPtr fs)
{
    char *nbdPath;

    if (fs->format <= VIR_STORAGE_FILE_NONE) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("An explicit disk format must be specified"));
        return -1;
    }

    if (virFileNBDDeviceAssociate(fs->src->path, fs->format,
                                  fs->readonly, &nbdPath) < 0)
        return -1;

    VIR_DEBUG("Changing fs %s to use type=block for dev %s",
              fs->src->path, nbdPath);

    /* From here on the filesystem is presented as a block device so
     * that the remaining container setup needs no special casing. */
    fs->type = VIR_DOMAIN_FS_TYPE_BLOCK;
    VIR_FREE(fs->src->path);
    fs->src->path = nbdPath; /* ownership moves to fs */

    return 0;
}


/*
 * Associates the file backing @disk with an NBD device and rewrites
 * @disk in place as a block-type disk whose source is that device.
 * An explicit image format is mandatory.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupNBDDeviceDisk(virDomainDiskDefPtr disk)
{
    const char *src = virDomainDiskGetSource(disk);
    int format = virDomainDiskGetFormat(disk);
    char *nbdPath;
    int rc;

    if (format <= VIR_STORAGE_FILE_NONE) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("An explicit disk format must be specified"));
        return -1;
    }

    if (virFileNBDDeviceAssociate(src, format,
                                  disk->src->readonly, &nbdPath) < 0)
        return -1;

    VIR_DEBUG("Changing disk %s to use type=block for dev %s",
              src, nbdPath);

    /* From here on the disk is presented as a block device so that
     * the remaining container setup needs no special casing. */
    virDomainDiskSetType(disk, VIR_STORAGE_TYPE_BLOCK);
    rc = virDomainDiskSetSource(disk, nbdPath);

    /* The local string is released on both outcomes */
    VIR_FREE(nbdPath);

    return rc < 0 ? -1 : 0;
}

577 578 579 580
/*
 * virLXCControllerAppendNBDPids:
 * @ctrl: controller state
 * @dev: NBD device path; must start with "/dev/"
 *
 * Reads the pid published in /sys/devices/virtual/block/<dev>/pid,
 * asks virProcessGetPids() for the pids belonging to it and appends
 * them to ctrl->nbdpids.
 *
 * The sysfs file is polled every 100ms, up to 10 times, while it does
 * not exist yet.
 *
 * Returns 0 on success, -1 with an error reported on failure.
 */
static int virLXCControllerAppendNBDPids(virLXCControllerPtr ctrl,
                                         const char *dev)
{
    char *pidpath = NULL;
    pid_t *pids = NULL;
    size_t npids = 0;
    size_t i;
    int ret = -1;
    size_t loops = 0;
    pid_t pid;

    if (!STRPREFIX(dev, "/dev/")) {
        /* Previously this failed without reporting any error, leaving
         * the caller with a failure of unknown cause. */
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unexpected NBD device path '%s'"), dev);
        goto cleanup;
    }

    /* dev + 5 skips the "/dev/" prefix checked above */
    pidpath = g_strdup_printf("/sys/devices/virtual/block/%s/pid", dev + 5);

    /* Wait for the pid file to appear */
    while (!virFileExists(pidpath)) {
        /* wait for 100ms before checking again, but don't do it for ever */
        if (errno == ENOENT && loops < 10) {
            g_usleep(100 * 1000);
            loops++;
        } else {
            virReportSystemError(errno,
                                 _("Cannot check NBD device %s pid"),
                                 dev + 5);
            goto cleanup;
        }
    }

    if (virPidFileReadPath(pidpath, &pid) < 0)
        goto cleanup;

    if (virProcessGetPids(pid, &npids, &pids) < 0)
        goto cleanup;

    for (i = 0; i < npids; i++) {
        if (VIR_APPEND_ELEMENT(ctrl->nbdpids, ctrl->nnbdpids, pids[i]) < 0)
            goto cleanup;
    }

    ret = 0;

 cleanup:
    VIR_FREE(pids);
    VIR_FREE(pidpath);
    return ret;
}
625

626
/*
 * Converts every file-backed filesystem and disk of the domain into a
 * block device, using either a loop device or an NBD device.
 *
 * When no driver is requested, raw/unspecified formats use "loop" and
 * every other format uses "nbd". Loop descriptors are stored in
 * ctrl->loopDevFds; the pids serving NBD devices are stored in
 * ctrl->nbdpids.
 *
 * Returns 0 on success, -1 on the first failure.
 */
static int virLXCControllerSetupLoopDevices(virLXCControllerPtr ctrl)
{
    size_t n;

    VIR_DEBUG("Setting up loop devices for filesystems");

    for (n = 0; n < ctrl->def->nfss; n++) {
        virDomainFSDefPtr fs = ctrl->def->fss[n];
        bool rawOrNone;
        int loopFd;

        if (fs->type != VIR_DOMAIN_FS_TYPE_FILE)
            continue;

        rawOrNone = (fs->format == VIR_STORAGE_FILE_RAW ||
                     fs->format == VIR_STORAGE_FILE_NONE);

        /* Pick a driver when the config did not name one */
        if (fs->fsdriver == VIR_DOMAIN_FS_DRIVER_TYPE_DEFAULT) {
            if (rawOrNone)
                fs->fsdriver = VIR_DOMAIN_FS_DRIVER_TYPE_LOOP;
            else
                fs->fsdriver = VIR_DOMAIN_FS_DRIVER_TYPE_NBD;
        }

        if (fs->fsdriver != VIR_DOMAIN_FS_DRIVER_TYPE_LOOP &&
            fs->fsdriver != VIR_DOMAIN_FS_DRIVER_TYPE_NBD) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("fs driver %s is not supported"),
                           virDomainFSDriverTypeToString(fs->fsdriver));
            return -1;
        }

        if (fs->fsdriver == VIR_DOMAIN_FS_DRIVER_TYPE_NBD) {
            if (virLXCControllerSetupNBDDeviceFS(fs) < 0)
                return -1;

            /* The NBD device goes away when the cgroup ends, so the
             * qemu-nbd pid is remembered here to be added to the
             * cgroup later. */
            if (virLXCControllerAppendNBDPids(ctrl, fs->src->path) < 0)
                return -1;

            continue;
        }

        /* Loop driver: only raw/unspecified formats are accepted */
        if (!rawOrNone) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("fs format %s is not supported"),
                           virStorageFileFormatTypeToString(fs->format));
            return -1;
        }

        if ((loopFd = virLXCControllerSetupLoopDeviceFS(fs)) < 0)
            return -1;

        VIR_DEBUG("Saving loop fd %d", loopFd);
        if (VIR_EXPAND_N(ctrl->loopDevFds, ctrl->nloopDevs, 1) < 0) {
            VIR_FORCE_CLOSE(loopFd);
            return -1;
        }
        ctrl->loopDevFds[ctrl->nloopDevs - 1] = loopFd;
    }

    VIR_DEBUG("Setting up loop devices for disks");

    for (n = 0; n < ctrl->def->ndisks; n++) {
        virDomainDiskDefPtr disk = ctrl->def->disks[n];
        const char *driver = virDomainDiskGetDriver(disk);
        int format = virDomainDiskGetFormat(disk);
        bool rawOrNone = (format == VIR_STORAGE_FILE_RAW ||
                          format == VIR_STORAGE_FILE_NONE);
        bool wantLoop;
        int loopFd;

        if (virDomainDiskGetType(disk) != VIR_STORAGE_TYPE_FILE)
            continue;

        /* With no driverName set, 'loop' is preferred for raw or
         * undefined formats and 'nbd' for everything else. */
        if (driver)
            wantLoop = STREQ(driver, "loop");
        else
            wantLoop = rawOrNone;

        if (wantLoop) {
            if (!rawOrNone) {
                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                               _("disk format %s is not supported"),
                               virStorageFileFormatTypeToString(format));
                return -1;
            }

            /* 'none' is treated as 'raw': auto-probing the format is
             * avoided for security reasons. */
            if ((loopFd = virLXCControllerSetupLoopDeviceDisk(disk)) < 0)
                return -1;

            VIR_DEBUG("Saving loop fd %d", loopFd);
            if (VIR_EXPAND_N(ctrl->loopDevFds, ctrl->nloopDevs, 1) < 0) {
                VIR_FORCE_CLOSE(loopFd);
                return -1;
            }
            ctrl->loopDevFds[ctrl->nloopDevs - 1] = loopFd;
        } else if (!driver || STREQ(driver, "nbd")) {
            if (disk->cachemode != VIR_DOMAIN_DISK_CACHE_DEFAULT &&
                disk->cachemode != VIR_DOMAIN_DISK_CACHE_DISABLE) {
                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                               _("Disk cache mode %s is not supported"),
                               virDomainDiskCacheTypeToString(disk->cachemode));
                return -1;
            }

            if (virLXCControllerSetupNBDDeviceDisk(disk) < 0)
                return -1;

            /* The NBD device goes away when the cgroup ends, so the
             * qemu-nbd pid is remembered here to be added to the
             * cgroup later. */
            if (virLXCControllerAppendNBDPids(ctrl,
                                              virDomainDiskGetSource(disk)) < 0)
                return -1;
        } else {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("disk driver %s is not supported"),
                           driver);
            return -1;
        }
    }

    VIR_DEBUG("Setup all loop devices");

    return 0;
}

753 754 755 756

/*
 * To be run while still single threaded
 */
757
static int virLXCControllerSetupCpuAffinity(virLXCControllerPtr ctrl)
758
{
H
Hu Tao 已提交
759 760
    int hostcpus, maxcpu = CPU_SETSIZE;
    virBitmapPtr cpumap, cpumapToSet;
761 762 763 764 765

    VIR_DEBUG("Setting CPU affinity");

    /* setaffinity fails if you set bits for CPUs which
     * aren't present, so we have to limit ourselves */
766
    if ((hostcpus = virHostCPUGetCount()) < 0)
767 768
        return -1;

769 770 771
    if (maxcpu > hostcpus)
        maxcpu = hostcpus;

772 773
    cpumap = virBitmapNew(maxcpu);
    if (!cpumap)
774 775
        return -1;

H
Hu Tao 已提交
776 777
    cpumapToSet = cpumap;

778
    if (ctrl->def->cpumask) {
H
Hu Tao 已提交
779
        cpumapToSet = ctrl->def->cpumask;
780 781 782 783 784
    } else {
        /* You may think this is redundant, but we can't assume libvirtd
         * itself is running on all pCPUs, so we need to explicitly set
         * the spawned LXC instance to all pCPUs if no map is given in
         * its config file */
785
        virBitmapSetAll(cpumap);
786 787
    }

788
    /* We are presuming we are running between fork/exec of LXC
789 790 791
     * so use '0' to indicate our own process ID. No threads are
     * running at this point
     */
792
    if (virProcessSetAffinity(0 /* Self */, cpumapToSet) < 0) {
793
        virBitmapFree(cpumap);
794 795
        return -1;
    }
796
    virBitmapFree(cpumap);
797 798 799 800 801

    return 0;
}


802 803 804 805
/*
 * Fetches the advisory nodeset from numad when the domain needs
 * placement advice (i.e. 'placement' of either <vcpu> or <numatune>
 * is 'auto').
 *
 * On success *@mask receives the parsed nodeset, or NULL when no
 * advice is needed. On failure *@mask is left untouched.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerGetNumadAdvice(virLXCControllerPtr ctrl,
                                          virBitmapPtr *mask)
{
    virBitmapPtr parsed = NULL;
    char *advice = NULL;
    int rc = -1;

    if (!virDomainDefNeedsPlacementAdvice(ctrl->def)) {
        *mask = NULL;
        return 0;
    }

    advice = virNumaGetAutoPlacementAdvice(virDomainDefGetVcpus(ctrl->def),
                                           ctrl->def->mem.cur_balloon);
    if (!advice)
        return -1;

    VIR_DEBUG("Nodeset returned from numad: %s", advice);

    if (virBitmapParse(advice, &parsed, VIR_DOMAIN_CPUMASK_LEN) < 0)
        goto done;

    *mask = parsed;
    rc = 0;

 done:
    VIR_FREE(advice);
    return rc;
}


833
/**
834 835
 * virLXCControllerSetupResourceLimits
 * @ctrl: the controller state
836
 *
837 838 839
 * Sets up the non-cgroup based resource limits that need
 * to be inherited by the child process across clone()/exec().
 * The cgroup limits are setup later
840 841 842
 *
 * Returns 0 on success or -1 in case of error
 */
843
static int virLXCControllerSetupResourceLimits(virLXCControllerPtr ctrl)
844
{
845
    virBitmapPtr auto_nodeset = NULL;
846
    int ret = -1;
847 848 849
    virBitmapPtr nodeset = NULL;
    virDomainNumatuneMemMode mode;

850 851 852 853 854 855 856 857
    if (virDomainNumatuneGetMode(ctrl->def->numa, -1, &mode) == 0) {
        if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
            virCgroupControllerAvailable(VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Use virNuma* API iff necessary. Once set and child is exec()-ed,
             * there's no way for us to change it. Rely on cgroups (if available
             * and enabled in the config) rather than virNuma*. */
            VIR_DEBUG("Relying on CGroups for memory binding");
        } else {
858

859
            VIR_DEBUG("Setting up process resource limits");
860

861 862
            if (virLXCControllerGetNumadAdvice(ctrl, &auto_nodeset) < 0)
                goto cleanup;
863

864
            nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, auto_nodeset, -1);
865

866 867 868
            if (virNumaSetupMemoryPolicy(mode, nodeset) < 0)
                goto cleanup;
        }
869
    }
870

871
    if (virLXCControllerSetupCpuAffinity(ctrl) < 0)
872
        goto cleanup;
873

874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889
    ret = 0;
 cleanup:
    virBitmapFree(auto_nodeset);
    return ret;
}


/*
 * Creates the container's cgroup, moves this process and every
 * recorded qemu-nbd process into it, and applies the limits associated
 * with the domain definition.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupCgroupLimits(virLXCControllerPtr ctrl)
{
    virBitmapPtr advised = NULL;
    virBitmapPtr nodeset = NULL;
    size_t n = 0;
    int rc = -1;

    VIR_DEBUG("Setting up cgroup resource limits");

    if (virLXCControllerGetNumadAdvice(ctrl, &advised) < 0)
        goto done;

    nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, advised, -1);

    ctrl->cgroup = virLXCCgroupCreate(ctrl->def,
                                      ctrl->initpid,
                                      ctrl->nnicindexes,
                                      ctrl->nicindexes);
    if (!ctrl->cgroup)
        goto done;

    if (virCgroupAddMachineProcess(ctrl->cgroup, getpid()) < 0)
        goto done;

    /* Add all qemu-nbd tasks to the cgroup */
    while (n < ctrl->nnbdpids) {
        if (virCgroupAddMachineProcess(ctrl->cgroup, ctrl->nbdpids[n]) < 0)
            goto done;
        n++;
    }

    if (virLXCCgroupSetup(ctrl->def, ctrl->cgroup, nodeset) < 0)
        goto done;

    rc = 0;

 done:
    virBitmapFree(advised);
    return rc;
}

923

924 925 926 927 928 929 930 931 932 933 934 935 936
/*
 * Close hook for monitor clients: forgets @client if it is the one the
 * controller is tracking and, when a shutdown is in progress, fires
 * the shutdown timer immediately so the event loop quits.
 */
static void virLXCControllerClientCloseHook(virNetServerClientPtr client)
{
    virLXCControllerPtr controller = virNetServerClientGetPrivateData(client);

    VIR_DEBUG("Client %p has closed", client);

    if (controller->client == client)
        controller->client = NULL;

    if (!controller->inShutdown)
        return;

    VIR_DEBUG("Arm timer to quit event loop");
    virEventUpdateTimeout(controller->timerShutdown, 0);
}

937 938
static void virLXCControllerClientPrivateFree(void *data)
{
939 940
    virLXCControllerPtr ctrl = data;
    VIR_DEBUG("Got private data free %p", ctrl);
941 942 943 944
}

/*
 * Private-data constructor for monitor clients. Installs the close
 * hook, records @client as the controller's current client and, for
 * the first client only, sends the init event if an init pid is
 * already known.
 *
 * Returns the controller (@opaque), which serves as the private data.
 */
static void *virLXCControllerClientPrivateNew(virNetServerClientPtr client,
                                              void *opaque)
{
    virLXCControllerPtr controller = opaque;
    bool announceInit;

    virNetServerClientSetCloseHook(client, virLXCControllerClientCloseHook);
    VIR_DEBUG("Got new client %p", client);
    controller->client = client;

    announceInit = controller->initpid && controller->firstClient;
    if (announceInit)
        virLXCControllerEventSendInit(controller, controller->initpid);

    controller->firstClient = false;

    return controller;
}

959 960

/**
 * virLXCControllerSetupServer:
 * @ctrl: the controller to attach the monitor server to
 *
 * Creates the RPC server, a UNIX socket service listening on
 * LXC_STATE_DIR/<ctrl->name>.sock, the LXC monitor RPC program and
 * the daemon object. The program and daemon are stored in
 * @ctrl->prog and @ctrl->daemon. The socket is created between
 * virSecurityManagerSetSocketLabel() and
 * virSecurityManagerClearSocketLabel(), presumably so that it is
 * created with the domain's socket label.
 *
 * On failure the server, service and daemon references taken here
 * are dropped and @ctrl->daemon is reset to NULL.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerSetupServer(virLXCControllerPtr ctrl)
{
    virNetServerPtr srv = NULL;
    virNetServerServicePtr svc = NULL;
    char *sockpath;

    sockpath = g_strdup_printf("%s/%s.sock", LXC_STATE_DIR, ctrl->name);

    if (!(srv = virNetServerNew("LXC", 1,
                                0, 0, 0, 1,
                                0, -1, 0,
                                virLXCControllerClientPrivateNew,
                                NULL,
                                virLXCControllerClientPrivateFree,
                                ctrl)))
        goto error;

    if (virSecurityManagerSetSocketLabel(ctrl->securityManager, ctrl->def) < 0)
        goto error;

    if (!(svc = virNetServerServiceNewUNIX(sockpath,
                                           0700,
                                           0,
                                           0,
                                           NULL,
                                           false,
                                           0,
                                           5)))
        goto error;

    if (virSecurityManagerClearSocketLabel(ctrl->securityManager, ctrl->def) < 0)
        goto error;

    if (virNetServerAddService(srv, svc) < 0)
        goto error;
    /* Drop the local reference and clear the pointer so the error
     * path below does not unref the service a second time */
    virObjectUnref(svc);
    svc = NULL;

    if (!(ctrl->prog = virNetServerProgramNew(VIR_LXC_MONITOR_PROGRAM,
                                              VIR_LXC_MONITOR_PROGRAM_VERSION,
                                              virLXCMonitorProcs,
                                              virLXCMonitorNProcs)))
        goto error;

    if (!(ctrl->daemon = virNetDaemonNew()) ||
        virNetDaemonAddServer(ctrl->daemon, srv) < 0)
        goto error;

    virNetDaemonUpdateServices(ctrl->daemon, true);
    VIR_FREE(sockpath);
    return 0;

 error:
    VIR_FREE(sockpath);
    virObjectUnref(srv);
    virObjectUnref(ctrl->daemon);
    ctrl->daemon = NULL;
    virObjectUnref(svc);
    return -1;
}
1020

D
Daniel P. Berrange 已提交
1021 1022 1023

/**
 * lxcControllerClearCapabilities:
 *
 * Clears and applies an empty capability set via libcap-ng for
 * CAPNG_SELECT_BOTH. When built without libcap-ng support only a
 * warning is logged and the call still succeeds.
 *
 * Returns 0 on success or -1 if the capabilities could not be applied
 */
static int lxcControllerClearCapabilities(void)
{
#if WITH_CAPNG
    int rc;

    capng_clear(CAPNG_SELECT_BOTH);

    rc = capng_apply(CAPNG_SELECT_BOTH);
    if (rc < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("failed to apply capabilities: %d"), rc);
        return -1;
    }
#else
    VIR_WARN("libcap-ng support not compiled in, unable to clear capabilities");
#endif
    return 0;
}

1040
static bool wantReboot;
1041
static virMutex lock = VIR_MUTEX_INITIALIZER;
1042 1043


M
Martin Kletzander 已提交
1044
static void virLXCControllerSignalChildIO(virNetDaemonPtr dmn,
J
Ján Tomko 已提交
1045
                                          siginfo_t *info G_GNUC_UNUSED,
1046
                                          void *opaque)
1047
{
1048
    virLXCControllerPtr ctrl = opaque;
1049
    int ret;
1050
    int status;
1051

1052
    ret = waitpid(-1, &status, WNOHANG);
1053
    VIR_DEBUG("Got sig child %d vs %lld", ret, (long long)ctrl->initpid);
1054
    if (ret == ctrl->initpid) {
M
Martin Kletzander 已提交
1055
        virNetDaemonQuit(dmn);
1056
        virMutexLock(&lock);
1057
        if (WIFSIGNALED(status) &&
1058 1059
            WTERMSIG(status) == SIGHUP) {
            VIR_DEBUG("Status indicates reboot");
1060
            wantReboot = true;
1061
        }
1062 1063
        virMutexUnlock(&lock);
    }
1064 1065 1066
}


1067
/**
 * virLXCControllerConsoleUpdateWatch:
 * @console: console whose event registrations should be refreshed
 *
 * Recomputes the events of interest for the host and container
 * sides of @console and pushes them to the event loop watches:
 * a side is readable while its inbound buffer has free space and
 * writable while the opposite buffer holds pending data. A side
 * flagged as closed gets no events on its watch; instead its FD is
 * added to (or modified in) the console's epoll set so that
 * virLXCControllerConsoleEPoll() can notice when it becomes usable
 * again. A side that is open again is removed from the epoll set.
 *
 * Any epoll_ctl() failure is reported and makes the daemon quit.
 */
static void virLXCControllerConsoleUpdateWatch(virLXCControllerConsolePtr console)
{
    int hostEvents = 0;
    int contEvents = 0;

    /* If host console is open, then we can look to read/write */
    if (!console->hostClosed) {
        if (console->fromHostLen < sizeof(console->fromHostBuf))
            hostEvents |= VIR_EVENT_HANDLE_READABLE;
        if (console->fromContLen)
            hostEvents |= VIR_EVENT_HANDLE_WRITABLE;
    }

    /* If cont console is open, then we can look to read/write */
    if (!console->contClosed) {
        if (console->fromContLen < sizeof(console->fromContBuf))
            contEvents |= VIR_EVENT_HANDLE_READABLE;
        if (console->fromHostLen)
            contEvents |= VIR_EVENT_HANDLE_WRITABLE;
    }

    VIR_DEBUG("Container watch=%d, events=%d closed=%d; host watch=%d events=%d closed=%d",
              console->contWatch, contEvents, console->contClosed,
              console->hostWatch, hostEvents, console->hostClosed);
    virEventUpdateHandle(console->contWatch, contEvents);
    virEventUpdateHandle(console->hostWatch, hostEvents);

    if (console->hostClosed) {
        /* Must setup an epoll to detect when host becomes accessible again */
        int events = EPOLLIN | EPOLLET;
        if (console->fromContLen)
            events |= EPOLLOUT;

        if (events != console->hostEpoll) {
            struct epoll_event event;
            int action = EPOLL_CTL_ADD;
            /* A non-zero mask means the FD is already in the set */
            if (console->hostEpoll)
                action = EPOLL_CTL_MOD;

            VIR_DEBUG("newHostEvents=%x oldHostEvents=%x", events, console->hostEpoll);

            event.events = events;
            event.data.fd = console->hostFd;
            if (epoll_ctl(console->epollFd, action, console->hostFd, &event) < 0) {
                virReportSystemError(errno, "%s",
                                     _("Unable to add epoll fd"));
                VIR_DEBUG(":fail");
                virNetDaemonQuit(console->daemon);
                return;
            }
            console->hostEpoll = events;
            VIR_DEBUG("newHostEvents=%x oldHostEvents=%x", events, console->hostEpoll);
        }
    } else if (console->hostEpoll) {
        VIR_DEBUG("Stop epoll oldHostEvents=%x", console->hostEpoll);
        if (epoll_ctl(console->epollFd, EPOLL_CTL_DEL, console->hostFd, NULL) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to remove epoll fd"));
            VIR_DEBUG(":fail");
            virNetDaemonQuit(console->daemon);
            return;
        }
        console->hostEpoll = 0;
    }

    if (console->contClosed) {
        /* Must setup an epoll to detect when guest becomes accessible again */
        int events = EPOLLIN | EPOLLET;
        if (console->fromHostLen)
            events |= EPOLLOUT;

        if (events != console->contEpoll) {
            struct epoll_event event;
            int action = EPOLL_CTL_ADD;
            /* A non-zero mask means the FD is already in the set */
            if (console->contEpoll)
                action = EPOLL_CTL_MOD;

            VIR_DEBUG("newContEvents=%x oldContEvents=%x", events, console->contEpoll);

            event.events = events;
            event.data.fd = console->contFd;
            if (epoll_ctl(console->epollFd, action, console->contFd, &event) < 0) {
                virReportSystemError(errno, "%s",
                                     _("Unable to add epoll fd"));
                VIR_DEBUG(":fail");
                virNetDaemonQuit(console->daemon);
                return;
            }
            console->contEpoll = events;
            VIR_DEBUG("newContEvents=%x oldContEvents=%x", events, console->contEpoll);
        }
    } else if (console->contEpoll) {
        VIR_DEBUG("Stop epoll oldContEvents=%x", console->contEpoll);
        if (epoll_ctl(console->epollFd, EPOLL_CTL_DEL, console->contFd, NULL) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to remove epoll fd"));
            VIR_DEBUG(":fail");
            virNetDaemonQuit(console->daemon);
            return;
        }
        console->contEpoll = 0;
    }
}
1170 1171


1172
static void virLXCControllerConsoleEPoll(int watch, int fd, int events, void *opaque)
1173
{
1174
    virLXCControllerConsolePtr console = opaque;
1175

1176 1177 1178 1179 1180 1181 1182 1183 1184 1185
    virMutexLock(&lock);
    VIR_DEBUG("IO event watch=%d fd=%d events=%d fromHost=%zu fromcont=%zu",
              watch, fd, events,
              console->fromHostLen,
              console->fromContLen);

    while (1) {
        struct epoll_event event;
        int ret;
        ret = epoll_wait(console->epollFd, &event, 1, 0);
1186
        if (ret < 0) {
S
Stefan Berger 已提交
1187
            if (errno == EINTR)
1188 1189 1190
                continue;
            virReportSystemError(errno, "%s",
                                 _("Unable to wait on epoll"));
1191
            virNetDaemonQuit(console->daemon);
1192 1193 1194
            goto cleanup;
        }

1195 1196 1197 1198 1199 1200 1201
        if (ret == 0)
            break;

        VIR_DEBUG("fd=%d hostFd=%d contFd=%d hostEpoll=%x contEpoll=%x",
                  event.data.fd, console->hostFd, console->contFd,
                  console->hostEpoll, console->contEpoll);

1202 1203
        /* If we get HUP+dead PID, we just re-enable the main loop
         * which will see the PID has died and exit */
1204
        if ((event.events & (EPOLLIN|EPOLLOUT))) {
1205 1206
            if (event.data.fd == console->hostFd) {
                console->hostClosed = false;
1207
            } else {
1208
                console->contClosed = false;
1209
            }
1210
            virLXCControllerConsoleUpdateWatch(console);
1211 1212 1213 1214
            break;
        }
    }

1215
 cleanup:
1216
    virMutexUnlock(&lock);
1217 1218
}

1219
static void virLXCControllerConsoleIO(int watch, int fd, int events, void *opaque)
1220
{
1221
    virLXCControllerConsolePtr console = opaque;
1222 1223

    virMutexLock(&lock);
1224 1225 1226 1227
    VIR_DEBUG("IO event watch=%d fd=%d events=%d fromHost=%zu fromcont=%zu",
              watch, fd, events,
              console->fromHostLen,
              console->fromContLen);
1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241
    if (events & VIR_EVENT_HANDLE_READABLE) {
        char *buf;
        size_t *len;
        size_t avail;
        ssize_t done;
        if (watch == console->hostWatch) {
            buf = console->fromHostBuf;
            len = &console->fromHostLen;
            avail = sizeof(console->fromHostBuf) - *len;
        } else {
            buf = console->fromContBuf;
            len = &console->fromContLen;
            avail = sizeof(console->fromContBuf) - *len;
        }
1242
     reread:
1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269
        done = read(fd, buf + *len, avail);
        if (done == -1 && errno == EINTR)
            goto reread;
        if (done == -1 && errno != EAGAIN) {
            virReportSystemError(errno, "%s",
                                 _("Unable to read container pty"));
            goto error;
        }
        if (done > 0) {
            *len += done;
        } else {
            VIR_DEBUG("Read fd %d done %d errno %d", fd, (int)done, errno);
        }
    }

    if (events & VIR_EVENT_HANDLE_WRITABLE) {
        char *buf;
        size_t *len;
        ssize_t done;
        if (watch == console->hostWatch) {
            buf = console->fromContBuf;
            len = &console->fromContLen;
        } else {
            buf = console->fromHostBuf;
            len = &console->fromHostLen;
        }

1270
     rewrite:
1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295
        done = write(fd, buf, *len);
        if (done == -1 && errno == EINTR)
            goto rewrite;
        if (done == -1 && errno != EAGAIN) {
            virReportSystemError(errno, "%s",
                                 _("Unable to write to container pty"));
            goto error;
        }
        if (done > 0) {
            memmove(buf, buf + done, (*len - done));
            *len -= done;
        } else {
            VIR_DEBUG("Write fd %d done %d errno %d", fd, (int)done, errno);
        }
    }

    if (events & VIR_EVENT_HANDLE_HANGUP) {
        if (watch == console->hostWatch) {
            console->hostClosed = true;
        } else {
            console->contClosed = true;
        }
        VIR_DEBUG("Got EOF on %d %d", watch, fd);
    }

1296
    virLXCControllerConsoleUpdateWatch(console);
1297 1298 1299
    virMutexUnlock(&lock);
    return;

1300
 error:
1301 1302 1303
    virEventRemoveHandle(console->contWatch);
    virEventRemoveHandle(console->hostWatch);
    console->contWatch = console->hostWatch = -1;
1304
    virNetDaemonQuit(console->daemon);
1305 1306 1307 1308
    virMutexUnlock(&lock);
}


1309
/**
1310
 * lxcControllerMain
1311 1312
 * @serverFd: server socket fd to accept client requests
 * @clientFd: initial client which is the libvirtd daemon
1313
 *
1314
 * Processes I/O on consoles and the monitor
1315 1316 1317
 *
 * Returns 0 on success or -1 in case of error
 */
1318
static int virLXCControllerMain(virLXCControllerPtr ctrl)
1319 1320
{
    int rc = -1;
1321
    size_t i;
1322

1323
    if (virNetDaemonAddSignalHandler(ctrl->daemon,
1324 1325 1326
                                     SIGCHLD,
                                     virLXCControllerSignalChildIO,
                                     ctrl) < 0)
1327 1328
        goto cleanup;

1329 1330
    virResetLastError();

1331
    for (i = 0; i < ctrl->nconsoles; i++) {
1332
        if ((ctrl->consoles[i].epollFd = epoll_create1(EPOLL_CLOEXEC)) < 0) {
1333 1334 1335 1336 1337
            virReportSystemError(errno, "%s",
                                 _("Unable to create epoll fd"));
            goto cleanup;
        }

1338 1339 1340 1341 1342
        if ((ctrl->consoles[i].epollWatch = virEventAddHandle(ctrl->consoles[i].epollFd,
                                                              VIR_EVENT_HANDLE_READABLE,
                                                              virLXCControllerConsoleEPoll,
                                                              &(ctrl->consoles[i]),
                                                              NULL)) < 0) {
1343 1344
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to watch epoll FD"));
1345 1346 1347
            goto cleanup;
        }

1348 1349 1350 1351 1352
        if ((ctrl->consoles[i].hostWatch = virEventAddHandle(ctrl->consoles[i].hostFd,
                                                             VIR_EVENT_HANDLE_READABLE,
                                                             virLXCControllerConsoleIO,
                                                             &(ctrl->consoles[i]),
                                                             NULL)) < 0) {
1353 1354
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to watch host console PTY"));
1355 1356 1357
            goto cleanup;
        }

1358 1359 1360 1361 1362
        if ((ctrl->consoles[i].contWatch = virEventAddHandle(ctrl->consoles[i].contFd,
                                                             VIR_EVENT_HANDLE_READABLE,
                                                             virLXCControllerConsoleIO,
                                                             &(ctrl->consoles[i]),
                                                             NULL)) < 0) {
1363 1364
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to watch host console PTY"));
1365 1366
            goto cleanup;
        }
1367
    }
1368

1369
    virNetDaemonRun(ctrl->daemon);
1370

1371
    if (virGetLastErrorCode() == VIR_ERR_OK)
1372
        rc = wantReboot ? 1 : 0;
1373

1374
 cleanup:
1375
    for (i = 0; i < ctrl->nconsoles; i++)
1376
        virLXCControllerConsoleClose(&(ctrl->consoles[i]));
1377

1378 1379 1380
    return rc;
}

1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
/**
 * virLXCControllerLookupUsernsMap:
 * @map: array of id map entries
 * @num: number of entries in @map
 * @src: id to translate
 *
 * Looks for the entry whose range [start, start + count) contains
 * @src and translates @src by the same offset into the entry's
 * target range.
 *
 * Returns the translated id, or @src unchanged if no entry covers it
 */
static unsigned int
virLXCControllerLookupUsernsMap(virDomainIdMapEntryPtr map,
                                int num,
                                unsigned int src)
{
    size_t i;

    /* A non-positive count must not turn into a huge unsigned bound */
    if (num <= 0)
        return src;

    for (i = 0; i < (size_t)num; i++) {
        /* The range includes its first id; comparing the offset
         * against count avoids wrap-around in start + count */
        if (src >= map[i].start && src - map[i].start < map[i].count)
            return map[i].target + (src - map[i].start);
    }

    return src;
}
1395

1396 1397 1398 1399 1400 1401
/**
 * virLXCControllerSetupUsernsMap:
 * @map: array of id map entries
 * @num: number of entries in @map
 * @path: file to write the map to
 *
 * Formats every entry of @map as a "start target count" line and
 * writes the result to @path in a single call. More than 340
 * entries are rejected.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupUsernsMap(virDomainIdMapEntryPtr map,
                               int num,
                               char *path)
{
    virBuffer lines = VIR_BUFFER_INITIALIZER;
    size_t idx = 0;
    int rc = -1;

    /* The kernel supports up to 340 lines in /proc/<pid>/{g,u}id_map */
    if (num > 340) {
        virReportError(VIR_ERR_INVALID_ARG, "%s",
                       _("Too many id mappings defined."));
        goto cleanup;
    }

    while (idx < num) {
        virBufferAsprintf(&lines, "%u %u %u\n",
                          map[idx].start, map[idx].target, map[idx].count);
        idx++;
    }

    VIR_DEBUG("Set '%s' to '%s'", path, virBufferCurrentContent(&lines));

    if (virFileWriteStr(path, virBufferCurrentContent(&lines), 0) < 0) {
        virReportSystemError(errno, _("unable write to %s"), path);
        goto cleanup;
    }

    rc = 0;

 cleanup:
    virBufferFreeAndReset(&lines);
    return rc;
}

/**
 * virLXCControllerSetupUserns:
 * @ctrl: the controller of the container being started
 *
 * Writes the domain's uid and gid maps to /proc/<initpid>/uid_map
 * and /proc/<initpid>/gid_map. Nothing is done when the domain has
 * no uid map entries.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerSetupUserns(virLXCControllerPtr ctrl)
{
    char *uidMapPath = NULL;
    char *gidMapPath = NULL;
    int rc = -1;

    /* User namespace is disabled for container */
    if (ctrl->def->idmap.nuidmap == 0) {
        VIR_DEBUG("No uid map, skipping userns setup");
        return 0;
    }

    VIR_DEBUG("Setting up userns maps");
    uidMapPath = g_strdup_printf("/proc/%d/uid_map", ctrl->initpid);

    if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.uidmap,
                                       ctrl->def->idmap.nuidmap,
                                       uidMapPath) >= 0) {
        /* The gid map is only attempted once the uid map is in place */
        gidMapPath = g_strdup_printf("/proc/%d/gid_map", ctrl->initpid);

        if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.gidmap,
                                           ctrl->def->idmap.ngidmap,
                                           gidMapPath) >= 0)
            rc = 0;
    }

    VIR_FREE(uidMapPath);
    VIR_FREE(gidMapPath);
    return rc;
}

1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481
/**
 * virLXCControllerSetupDev:
 * @ctrl: the controller of the container being started
 *
 * Sets up the container's private /dev at
 * /LXC_STATE_DIR/<name>.dev through virFileSetupDev(), using mount
 * options "mode=755,size=65536" followed by whatever the security
 * manager returns for the domain, then hands ownership of the
 * directory to the container via lxcContainerChown().
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerSetupDev(virLXCControllerPtr ctrl)
{
    char *secOpts = NULL;
    char *tmpfsOpts = NULL;
    char *devPath = NULL;
    int rc = -1;

    VIR_DEBUG("Setting up /dev/ for container");

    /* NOTE(review): the result is formatted with %s below without a
     * NULL check - confirm the security manager never returns NULL */
    secOpts = virSecurityManagerGetMountOptions(ctrl->securityManager,
                                                ctrl->def);

    devPath = g_strdup_printf("/%s/%s.dev", LXC_STATE_DIR, ctrl->def->name);

    /*
     * tmpfs is limited to 64kb, since we only have device nodes in there
     * and don't want to DOS the entire OS RAM usage
     */
    tmpfsOpts = g_strdup_printf("mode=755,size=65536%s", secOpts);

    if (virFileSetupDev(devPath, tmpfsOpts) >= 0 &&
        lxcContainerChown(ctrl->def, devPath) >= 0)
        rc = 0;

    VIR_FREE(tmpfsOpts);
    VIR_FREE(secOpts);
    VIR_FREE(devPath);
    return rc;
}

/**
 * virLXCControllerPopulateDevices:
 * @ctrl: the controller of the container being started
 *
 * Sets up the container's private /dev via
 * virLXCControllerSetupDev() and creates the basic character device
 * nodes in it: null, zero, full, random, urandom and tty. Each node
 * is created with mknod(), given its final permissions with chmod()
 * and then chowned to the container.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl)
{
    size_t idx;
    int rc = -1;
    char *nodePath = NULL;
    const struct {
        int maj;
        int min;
        mode_t mode;
        const char *path;
    } nodes[] = {
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/null" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/zero" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/full" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/random" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/urandom" },
        { LXC_DEV_MAJ_TTY, LXC_DEV_MIN_TTY, 0666, "/tty" },
    };

    if (virLXCControllerSetupDev(ctrl) < 0)
        goto cleanup;

    /* Populate /dev/ with a few important bits */
    for (idx = 0; idx < G_N_ELEMENTS(nodes); idx++) {
        dev_t devnum;

        nodePath = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, ctrl->def->name,
                                   nodes[idx].path);
        devnum = makedev(nodes[idx].maj, nodes[idx].min);

        /* The node is created without permission bits; chmod()
         * then applies the mode from the table */
        if (mknod(nodePath, S_IFCHR, devnum) < 0 ||
            chmod(nodePath, nodes[idx].mode)) {
            virReportSystemError(errno,
                                 _("Failed to make device %s"),
                                 nodePath);
            goto cleanup;
        }

        if (lxcContainerChown(ctrl->def, nodePath) < 0)
            goto cleanup;

        VIR_FREE(nodePath);
    }

    rc = 0;

 cleanup:
    VIR_FREE(nodePath);
    return rc;
}
1552

1553

1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565
/**
 * virLXCControllerSetupHostdevSubsysUSB:
 * @vmDef: domain definition
 * @def: USB host device to expose to the container
 * @securityDriver: security manager used to label the new node
 *
 * Mirrors the host node USB_DEVFS/<bus>/<device> inside the
 * container's private /dev, under bus/usb/<bus>/<device>. The
 * source must be a character device; the copy is created with mode
 * 0700, chowned to the container and labelled through the security
 * manager.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevSubsysUSB(virDomainDefPtr vmDef,
                                      virDomainHostdevDefPtr def,
                                      virSecurityManagerPtr securityDriver)
{
    virDomainHostdevSubsysUSBPtr usb = &def->source.subsys.u.usb;
    char *hostNode = NULL;
    char *usbRoot = NULL;
    char *busDir = NULL;
    char *contNode = NULL;
    struct stat st;
    int rc = -1;

    hostNode = g_strdup_printf(USB_DEVFS "/%03d/%03d", usb->bus, usb->device);
    usbRoot = g_strdup_printf("/%s/%s.dev/bus/usb/", LXC_STATE_DIR, vmDef->name);
    busDir = g_strdup_printf("%s/%03d/", usbRoot, usb->bus);
    contNode = g_strdup_printf("%s/%03d", busDir, usb->device);

    if (stat(hostNode, &st) < 0) {
        virReportSystemError(errno,
                             _("Unable to access %s"), hostNode);
        goto cleanup;
    }

    if (!S_ISCHR(st.st_mode)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("USB source %s was not a character device"),
                       hostNode);
        goto cleanup;
    }

    if (virFileMakePath(busDir) < 0) {
        virReportSystemError(errno,
                             _("Unable to create %s"), busDir);
        goto cleanup;
    }

    VIR_DEBUG("Creating dev %s (%d,%d)",
              contNode, major(st.st_rdev), minor(st.st_rdev));

    /* Same device number as the host node, owner-only access */
    if (mknod(contNode, 0700 | S_IFCHR, st.st_rdev) < 0) {
        virReportSystemError(errno,
                             _("Unable to create device %s"),
                             contNode);
        goto cleanup;
    }

    if (lxcContainerChown(vmDef, contNode) < 0)
        goto cleanup;

    if (virSecurityManagerSetHostdevLabel(securityDriver,
                                          vmDef, def, usbRoot) < 0)
        goto cleanup;

    rc = 0;

 cleanup:
    VIR_FREE(hostNode);
    VIR_FREE(contNode);
    VIR_FREE(busDir);
    VIR_FREE(usbRoot);
    return rc;
}


/**
 * virLXCControllerSetupHostdevCapsStorage:
 * @vmDef: domain definition
 * @def: storage capability host device to expose to the container
 * @securityDriver: security manager used to label the new node
 *
 * Creates a block device node inside the container's private /dev
 * that mirrors the host block device configured in @def. The path
 * of the new node is the host path with its first component
 * dropped, placed below /LXC_STATE_DIR/<name>.dev. The node is
 * created with mode 0700, chowned to the container and labelled.
 *
 * While the label is applied @def temporarily points at the new
 * node; the original host path is always restored before returning.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevCapsStorage(virDomainDefPtr vmDef,
                                        virDomainHostdevDefPtr def,
                                        virSecurityManagerPtr securityDriver)
{
    char *dst = NULL;
    char *path = NULL;
    const char *subpath;
    int len = 0;
    int ret = -1;
    struct stat sb;
    mode_t mode;
    char *dev = def->source.caps.u.storage.block;

    if (dev == NULL) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Missing storage host block path"));
        goto cleanup;
    }

    path = g_strdup(dev);

    /* Skip leading slashes, then drop the first path component */
    while (*(path + len) == '/')
        len++;

    /* A path with a single component has nothing left to place in
     * the container; a NULL must never reach the %s below */
    if (!(subpath = strchr(path + len, '/'))) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Unable to derive container device path from %s"),
                       dev);
        goto cleanup;
    }

    dst = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, vmDef->name,
                          subpath);

    if (stat(dev, &sb) < 0) {
        virReportSystemError(errno,
                             _("Unable to access %s"),
                             dev);
        goto cleanup;
    }

    if (!S_ISBLK(sb.st_mode)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Storage source %s must be a block device"),
                       dev);
        goto cleanup;
    }

    if (lxcContainerSetupHostdevCapsMakePath(dst) < 0) {
        /* errno is a system error number, not a libvirt error code */
        virReportSystemError(errno,
                             _("Failed to create directory for device %s"),
                             dev);
        goto cleanup;
    }

    mode = 0700 | S_IFBLK;

    VIR_DEBUG("Creating dev %s (%d,%d)", dst,
              major(sb.st_rdev), minor(sb.st_rdev));
    if (mknod(dst, mode, sb.st_rdev) < 0) {
        virReportSystemError(errno,
                             _("Unable to create device %s"),
                             dst);
        goto cleanup;
    }

    if (lxcContainerChown(vmDef, dst) < 0)
        goto cleanup;

    /* Label the node inside the container, not the host device */
    def->source.caps.u.storage.block = dst;
    if (virSecurityManagerSetHostdevLabel(securityDriver, vmDef, def, NULL) < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    def->source.caps.u.storage.block = dev;
    VIR_FREE(dst);
    VIR_FREE(path);
    return ret;
}


/**
 * virLXCControllerSetupHostdevCapsMisc:
 * @vmDef: domain definition
 * @def: misc capability host device to expose to the container
 * @securityDriver: security manager used to label the new node
 *
 * Creates a character device node inside the container's private
 * /dev that mirrors the host character device configured in @def.
 * The path of the new node is the host path with its first
 * component dropped, placed below /LXC_STATE_DIR/<name>.dev. The
 * node is created with mode 0700, chowned to the container and
 * labelled.
 *
 * While the label is applied @def temporarily points at the new
 * node; the original host path is always restored before returning.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevCapsMisc(virDomainDefPtr vmDef,
                                     virDomainHostdevDefPtr def,
                                     virSecurityManagerPtr securityDriver)
{
    char *dst = NULL;
    char *path = NULL;
    const char *subpath;
    int len = 0;
    int ret = -1;
    struct stat sb;
    mode_t mode;
    char *dev = def->source.caps.u.misc.chardev;

    if (dev == NULL) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Missing misc host char device path"));
        goto cleanup;
    }

    path = g_strdup(dev);

    /* Skip leading slashes, then drop the first path component */
    while (*(path + len) == '/')
        len++;

    /* A path with a single component has nothing left to place in
     * the container; a NULL must never reach the %s below */
    if (!(subpath = strchr(path + len, '/'))) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Unable to derive container device path from %s"),
                       dev);
        goto cleanup;
    }

    dst = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, vmDef->name,
                          subpath);

    if (stat(dev, &sb) < 0) {
        virReportSystemError(errno,
                             _("Unable to access %s"),
                             dev);
        goto cleanup;
    }

    if (!S_ISCHR(sb.st_mode)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Misc source %s must be a character device"),
                       dev);
        goto cleanup;
    }

    if (lxcContainerSetupHostdevCapsMakePath(dst) < 0) {
        /* errno is a system error number, not a libvirt error code */
        virReportSystemError(errno,
                             _("Failed to create directory for device %s"),
                             dst);
        goto cleanup;
    }

    mode = 0700 | S_IFCHR;

    VIR_DEBUG("Creating dev %s (%d,%d)", dst,
              major(sb.st_rdev), minor(sb.st_rdev));
    if (mknod(dst, mode, sb.st_rdev) < 0) {
        /* Name the node that could not be created, not its source */
        virReportSystemError(errno,
                             _("Unable to create device %s"),
                             dst);
        goto cleanup;
    }

    if (lxcContainerChown(vmDef, dst) < 0)
        goto cleanup;

    /* Label the node inside the container, not the host device */
    def->source.caps.u.misc.chardev = dst;
    if (virSecurityManagerSetHostdevLabel(securityDriver, vmDef, def, NULL) < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    def->source.caps.u.misc.chardev = dev;
    VIR_FREE(dst);
    VIR_FREE(path);
    return ret;
}

/**
 * virLXCControllerSetupHostdevSubsys:
 * @vmDef: domain definition
 * @def: subsystem host device to expose to the container
 * @securityDriver: security manager used to label the new node
 *
 * Dispatches on the subsystem type of @def. Only USB devices are
 * handled; every other type is reported as unsupported.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevSubsys(virDomainDefPtr vmDef,
                                   virDomainHostdevDefPtr def,
                                   virSecurityManagerPtr securityDriver)
{
    if (def->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB) {
        return virLXCControllerSetupHostdevSubsysUSB(vmDef,
                                                     def,
                                                     securityDriver);
    }

    virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                   _("Unsupported host device mode %s"),
                   virDomainHostdevSubsysTypeToString(def->source.subsys.type));
    return -1;
}


/**
 * virLXCControllerSetupHostdevCaps:
 * @vmDef: domain definition
 * @def: capability host device to expose to the container
 * @securityDriver: security manager used to label the new node
 *
 * Dispatches on the capability type of @def: storage and misc
 * devices get a node created in the container's /dev, network
 * devices need no work here, and any other type is reported as
 * unsupported.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevCaps(virDomainDefPtr vmDef,
                                 virDomainHostdevDefPtr def,
                                 virSecurityManagerPtr securityDriver)
{
    /* This is a capabilities device, so read the type through the
     * caps member rather than through the subsys one */
    switch (def->source.caps.type) {
    case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_STORAGE:
        return virLXCControllerSetupHostdevCapsStorage(vmDef,
                                                       def,
                                                       securityDriver);

    case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_MISC:
        return virLXCControllerSetupHostdevCapsMisc(vmDef,
                                                    def,
                                                    securityDriver);

    case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_NET:
        return 0; /* case is handled in virLXCControllerMoveInterfaces */

    default:
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Unsupported host device mode %s"),
                       virDomainHostdevCapsTypeToString(def->source.caps.type));
        return -1;
    }
}


/**
 * virLXCControllerSetupAllHostdevs:
 * @ctrl: the LXC controller
 *
 * Walks every host device of the controller's domain definition and
 * hands it to the subsystem or capabilities setup routine matching
 * its mode, stopping at the first failure.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupAllHostdevs(virLXCControllerPtr ctrl)
{
    virDomainDefPtr vmDef = ctrl->def;
    virSecurityManagerPtr securityDriver = ctrl->securityManager;
    size_t idx;

    VIR_DEBUG("Setting up hostdevs");

    for (idx = 0; idx < vmDef->nhostdevs; idx++) {
        virDomainHostdevDefPtr hostdev = vmDef->hostdevs[idx];
        int rv;

        if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {
            rv = virLXCControllerSetupHostdevSubsys(vmDef,
                                                    hostdev,
                                                    securityDriver);
        } else if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES) {
            rv = virLXCControllerSetupHostdevCaps(vmDef,
                                                  hostdev,
                                                  securityDriver);
        } else {
            /* Any other mode cannot be handled by the controller */
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Unsupported host device mode %s"),
                           virDomainHostdevModeTypeToString(hostdev->mode));
            return -1;
        }

        if (rv < 0)
            return -1;
    }

    VIR_DEBUG("Setup all hostdevs");
    return 0;
}


1859 1860 1861 1862 1863 1864 1865 1866
/**
 * virLXCControllerSetupDisk:
 * @ctrl: the LXC controller
 * @def: disk definition to expose to the container
 * @securityDriver: security manager used to label the new device node
 *
 * Creates a device node named after the disk target below the
 * container's "<name>.dev" state directory, using the device number
 * of the disk source, then passes it to lxcContainerChown() and
 * labels it. Only block-type disks that have a source path are
 * accepted, and the source must be a character or block device.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerSetupDisk(virLXCControllerPtr ctrl,
                                     virDomainDiskDefPtr def,
                                     virSecurityManagerPtr securityDriver)
{
    char *srcPath = def->src->path;
    char *devPath = NULL;
    struct stat st;
    mode_t devMode;
    int ret = -1;

    /* Nothing has been allocated or modified yet, so the two
     * configuration checks can bail out directly. */
    if (virDomainDiskGetType(def) != VIR_STORAGE_TYPE_BLOCK) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Can't setup disk for non-block device"));
        return -1;
    }

    if (!srcPath) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Can't setup disk without media"));
        return -1;
    }

    devPath = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, ctrl->def->name,
                              def->dst);

    if (stat(srcPath, &st) < 0) {
        virReportSystemError(errno,
                             _("Unable to access %s"), srcPath);
        goto cleanup;
    }

    if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Disk source %s must be a character/block device"),
                       srcPath);
        goto cleanup;
    }

    /* Owner-only permissions, same device class as the source */
    devMode = 0700 | (S_ISCHR(st.st_mode) ? S_IFCHR : S_IFBLK);

    /* The node name we create may not match the major:minor
     * number we give it, but there is no other option right
     * now. We just have to hope containerized apps do not mind
     * that the major:minor differs from what the device name
     * would normally imply.
     */
    VIR_DEBUG("Creating dev %s (%d,%d) from %s",
              devPath, major(st.st_rdev), minor(st.st_rdev), srcPath);
    if (mknod(devPath, devMode, st.st_rdev) < 0) {
        virReportSystemError(errno,
                             _("Unable to create device %s"),
                             devPath);
        goto cleanup;
    }

    if (lxcContainerChown(ctrl->def, devPath) < 0)
        goto cleanup;

    /* Labelling works on the source path, but the node that needs
     * the label is the one just created, so temporarily point the
     * config at it; the original path is restored in cleanup. */
    def->src->path = devPath;
    if (virSecurityManagerSetImageLabel(securityDriver, ctrl->def, def->src,
                                        VIR_SECURITY_DOMAIN_IMAGE_LABEL_BACKING_CHAIN) < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    def->src->path = srcPath;
    VIR_FREE(devPath);
    return ret;
}

/**
 * virLXCControllerSetupAllDisks:
 * @ctrl: the LXC controller
 *
 * Runs virLXCControllerSetupDisk() for each disk of the domain
 * definition, in order, giving up at the first failure.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerSetupAllDisks(virLXCControllerPtr ctrl)
{
    size_t idx = 0;

    VIR_DEBUG("Setting up disks");

    while (idx < ctrl->def->ndisks) {
        virDomainDiskDefPtr disk = ctrl->def->disks[idx];

        if (virLXCControllerSetupDisk(ctrl, disk, ctrl->securityManager) < 0)
            return -1;
        idx++;
    }

    VIR_DEBUG("Setup all disks");
    return 0;
}



1953
/**
1954
 * virLXCControllerMoveInterfaces
1955 1956 1957 1958 1959 1960 1961 1962
 * @nveths: number of interfaces
 * @veths: interface names
 * @container: pid of container
 *
 * Moves network interfaces into a container's namespace
 *
 * Returns 0 on success or -1 in case of error
 */
1963
static int virLXCControllerMoveInterfaces(virLXCControllerPtr ctrl)
1964
{
1965
    size_t i;
1966
    virDomainDefPtr def = ctrl->def;
1967

1968
    for (i = 0; i < ctrl->nveths; i++) {
1969
        if (virNetDevSetNamespace(ctrl->veths[i], ctrl->initpid) < 0)
1970
            return -1;
1971
    }
1972

1973 1974 1975 1976 1977 1978 1979 1980 1981
    for (i = 0; i < def->nhostdevs; i ++) {
        virDomainHostdevDefPtr hdev = def->hostdevs[i];

        if (hdev->mode != VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES)
            continue;

        virDomainHostdevCaps hdcaps = hdev->source.caps;

        if (hdcaps.type != VIR_DOMAIN_HOSTDEV_CAPS_TYPE_NET)
1982
            continue;
1983

1984
        if (virNetDevSetNamespace(hdcaps.u.net.ifname, ctrl->initpid) < 0)
1985 1986 1987
            return -1;
    }

1988 1989 1990 1991 1992
    return 0;
}


/**
1993 1994
 * virLXCControllerDeleteInterfaces:
 * @ctrl: the LXC controller
1995 1996 1997 1998 1999
 *
 * Cleans up the container interfaces by deleting the veth device pairs.
 *
 * Returns 0 on success or -1 in case of error
 */
2000
static int virLXCControllerDeleteInterfaces(virLXCControllerPtr ctrl)
2001
{
2002 2003
    size_t i;
    int ret = 0;
2004

2005
    for (i = 0; i < ctrl->nveths; i++) {
2006 2007 2008 2009 2010
        if (virNetDevVethDelete(ctrl->veths[i]) < 0)
            ret = -1;
    }

    return ret;
2011 2012
}

2013

2014 2015
/**
 * lxcSetPersonality:
 * @def: domain definition whose OS architecture is inspected
 *
 * Requests the PER_LINUX32 personality when the domain's architecture
 * equals the alternate 32-bit architecture that
 * lxcContainerGetAlt32bitArch() reports for the host.
 *
 * Returns 0 when no change is needed or the personality was set,
 * -1 if the personality() call failed
 */
static int lxcSetPersonality(virDomainDefPtr def)
{
    virArch alt32;

    VIR_DEBUG("Checking for 32-bit personality");
    alt32 = lxcContainerGetAlt32bitArch(virArchFromHost());

    /* No alternate arch for this host, or the guest does not use it */
    if (!alt32 || def->os.arch != alt32)
        return 0;

    VIR_DEBUG("Setting personality to %s",
              virArchToString(alt32));
    if (personality(PER_LINUX32) >= 0)
        return 0;

    virReportSystemError(errno, _("Unable to request personality for %s on %s"),
                         virArchToString(alt32),
                         virArchToString(virArchFromHost()));
    return -1;
}

2034
#ifndef MS_REC
2035
# define MS_REC          16384
2036 2037 2038
#endif

#ifndef MS_SLAVE
2039
# define MS_SLAVE              (1<<19)
2040
#endif
2041

2042 2043 2044 2045 2046 2047
/* Create a private tty using the private devpts at PTMX, returning
 * the master in *TTYMASTER and the name of the slave, _from the
 * perspective of the guest after remounting file systems_, in
 * *TTYNAME.  Heavily borrowed from glibc, but doesn't require that
 * devpts == "/dev/pts" */
static int
2048 2049
lxcCreateTty(virLXCControllerPtr ctrl, int *ttymaster,
             char **ttyName, char **ttyHostPath)
2050 2051 2052 2053 2054
{
    int ret = -1;
    int ptyno;
    int unlock = 0;

2055
    if ((*ttymaster = open(ctrl->devptmx, O_RDWR|O_NOCTTY|O_NONBLOCK)) < 0)
2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069
        goto cleanup;

    if (ioctl(*ttymaster, TIOCSPTLCK, &unlock) < 0)
        goto cleanup;

    if (ioctl(*ttymaster, TIOCGPTN, &ptyno) < 0)
        goto cleanup;

    /* If mount() succeeded at honoring newinstance, then the kernel
     * was new enough to also honor the mode=0620,gid=5 options, which
     * guarantee that the new pty already has correct permissions; so
     * while glibc has to fstat(), fchmod(), and fchown() for older
     * kernels, we can skip those steps.  ptyno shouldn't currently be
     * anything other than 0, but let's play it safe.  */
2070 2071
    *ttyName = g_strdup_printf("/dev/pts/%d", ptyno);
    *ttyHostPath = g_strdup_printf("/%s/%s.devpts/%d", LXC_STATE_DIR, ctrl->def->name, ptyno);
2072 2073 2074

    ret = 0;

2075
 cleanup:
2076 2077 2078 2079 2080 2081 2082 2083
    if (ret != 0) {
        VIR_FORCE_CLOSE(*ttymaster);
        VIR_FREE(*ttyName);
    }

    return ret;
}

2084

2085 2086 2087
/**
 * virLXCControllerSetupPrivateNS:
 *
 * Gives the controller a private mount namespace by calling
 * virProcessSetupPrivateMountNS().
 *
 * Returns whatever virProcessSetupPrivateMountNS() returns
 */
static int
virLXCControllerSetupPrivateNS(void)
{
    int rc;

    /*
     * For a chroot style setup a private /dev/pts has to be
     * prepared for the child now; the child moves it into
     * position later.
     *
     * The difficulty is that 'virsh console' must use the
     * host OS /dev/pts while the guest OS must use its own,
     * so we (libvirt_lxc) need to see and use both instances.
     * We run in the host OS context, however, and do not want
     * the guest /dev/pts to show up there.
     *
     * Hence the private mount namespace: it lets us see the
     * guest's new /dev/pts without making it visible to the
     * host OS. The root FS is also expected to be put into
     * slave mode, in case it was marked as shared.
     */
    rc = virProcessSetupPrivateMountNS();

    return rc;
}


2113
static int
2114
virLXCControllerSetupDevPTS(virLXCControllerPtr ctrl)
2115
{
2116
    char *mount_options = NULL;
2117
    char *opts = NULL;
2118 2119
    char *devpts = NULL;
    int ret = -1;
2120
    gid_t ptsgid = 5;
2121

2122
    VIR_DEBUG("Setting up private /dev/pts");
2123

2124 2125
    mount_options = virSecurityManagerGetMountOptions(ctrl->securityManager,
                                                      ctrl->def);
2126

2127 2128
    devpts = g_strdup_printf("%s/%s.devpts", LXC_STATE_DIR, ctrl->def->name);
    ctrl->devptmx = g_strdup_printf("%s/%s.devpts/ptmx", LXC_STATE_DIR, ctrl->def->name);
2129

2130 2131 2132 2133 2134 2135
    if (virFileMakePath(devpts) < 0) {
        virReportSystemError(errno,
                             _("Failed to make path %s"),
                             devpts);
        goto cleanup;
    }
2136

2137 2138 2139 2140 2141
    if (ctrl->def->idmap.ngidmap)
        ptsgid = virLXCControllerLookupUsernsMap(ctrl->def->idmap.gidmap,
                                                 ctrl->def->idmap.ngidmap,
                                                 ptsgid);

2142 2143
    /* XXX should we support gid=X for X!=5 for distros which use
     * a different gid for tty?  */
2144 2145
    opts = g_strdup_printf("newinstance,ptmxmode=0666,mode=0620,gid=%u%s", ptsgid,
                           NULLSTR_EMPTY(mount_options));
2146

2147
    VIR_DEBUG("Mount devpts on %s type=tmpfs flags=0x%x, opts=%s",
2148 2149 2150 2151 2152 2153 2154
              devpts, MS_NOSUID, opts);
    if (mount("devpts", devpts, "devpts", MS_NOSUID, opts) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount devpts on %s"),
                             devpts);
        goto cleanup;
    }
2155

2156
    if (access(ctrl->devptmx, R_OK) < 0) {
2157 2158 2159
        virReportSystemError(ENOSYS, "%s",
                             _("Kernel does not support private devpts"));
        goto cleanup;
2160 2161
    }

2162 2163
    if ((lxcContainerChown(ctrl->def, ctrl->devptmx) < 0) ||
        (lxcContainerChown(ctrl->def, devpts) < 0))
2164
        goto cleanup;
2165

2166 2167
    ret = 0;

2168
 cleanup:
2169 2170
    VIR_FREE(opts);
    VIR_FREE(devpts);
2171
    VIR_FREE(mount_options);
2172 2173 2174 2175
    return ret;
}


G
Gao feng 已提交
2176 2177 2178 2179 2180 2181
/**
 * virLXCControllerSetupFuse:
 * @ctrl: the LXC controller
 *
 * Thin wrapper that hands the controller's fuse slot and domain
 * definition to lxcSetupFuse().
 *
 * Returns the result of lxcSetupFuse()
 */
static int
virLXCControllerSetupFuse(virLXCControllerPtr ctrl)
{
    virDomainDefPtr def = ctrl->def;

    return lxcSetupFuse(&ctrl->fuse, def);
}

2182 2183 2184 2185 2186 2187
/**
 * virLXCControllerStartFuse:
 * @ctrl: the LXC controller
 *
 * Thin wrapper that passes the controller's fuse object to
 * lxcStartFuse().
 *
 * Returns the result of lxcStartFuse()
 */
static int
virLXCControllerStartFuse(virLXCControllerPtr ctrl)
{
    int rc = lxcStartFuse(ctrl->fuse);

    return rc;
}

2188 2189 2190 2191 2192
/**
 * virLXCControllerSetupConsoles:
 * @ctrl: the LXC controller
 * @containerTTYPaths: array with one slot per console, filled with
 *                     the tty path each console gets inside the
 *                     container
 *
 * Allocates a tty from the private devpts for every console, storing
 * the master fd in the console's contFd, and passes the host-side
 * path of each tty to lxcContainerChown().
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupConsoles(virLXCControllerPtr ctrl,
                              char **containerTTYPaths)
{
    char *hostPath = NULL;
    size_t idx;

    for (idx = 0; idx < ctrl->nconsoles; idx++) {
        VIR_DEBUG("Opening tty on private %s", ctrl->devptmx);
        if (lxcCreateTty(ctrl,
                         &ctrl->consoles[idx].contFd,
                         &containerTTYPaths[idx], &hostPath) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Failed to allocate tty"));
            goto error;
        }

        /* Change the owner of tty device to the root user of container */
        if (lxcContainerChown(ctrl->def, hostPath) < 0)
            goto error;

        /* The host path is only needed for the chown above */
        VIR_FREE(hostPath);
    }

    return 0;

 error:
    VIR_FREE(hostPath);
    return -1;
}


2220 2221 2222 2223 2224 2225 2226 2227
/**
 * virLXCControllerEventSend:
 * @ctrl: the LXC controller
 * @procnr: procedure number placed in the message header
 * @proc: XDR routine used to encode (and afterwards free) @data
 * @data: event payload
 *
 * Encodes @data into a new message and queues it on the controller's
 * client connection. If no client is connected the event is dropped
 * with a warning.
 *
 * On every path, including the dropped-event one, the contents of
 * @data are released with xdr_free(), so the caller must not rely on
 * them afterwards. Failures are not reported to the caller.
 */
static void
virLXCControllerEventSend(virLXCControllerPtr ctrl,
                          int procnr,
                          xdrproc_t proc,
                          void *data)
{
    virNetMessagePtr msg = NULL;

    if (!ctrl->client) {
        VIR_WARN("Dropping event %d because libvirtd is not connected", procnr);
        /* Still release the payload, as the other paths do */
        goto cleanup;
    }

    VIR_DEBUG("Send event %d client=%p", procnr, ctrl->client);
    if (!(msg = virNetMessageNew(false)))
        goto cleanup;

    msg->header.prog = virNetServerProgramGetID(ctrl->prog);
    msg->header.vers = virNetServerProgramGetVersion(ctrl->prog);
    msg->header.proc = procnr;
    msg->header.type = VIR_NET_MESSAGE;
    msg->header.serial = 1;
    msg->header.status = VIR_NET_OK;

    if (virNetMessageEncodeHeader(msg) < 0)
        goto cleanup;

    if (virNetMessageEncodePayload(msg, proc, data) < 0)
        goto cleanup;

    VIR_DEBUG("Queue event %d %zu", procnr, msg->bufferLength);
    if (virNetServerClientSendMessage(ctrl->client, msg) < 0)
        goto cleanup;

    /* The message was queued successfully, so it must not be freed
     * here; only the payload is released below. */
    msg = NULL;

 cleanup:
    virNetMessageFree(msg);
    xdr_free(proc, data);
}


/**
 * virLXCControllerEventSendExit:
 * @ctrl: the LXC controller
 * @exitstatus: 0 is reported as a shutdown, 1 as a reboot, anything
 *              else as an error
 *
 * Sends the exit event and, when a client is connected, marks the
 * controller as shutting down, requests a delayed close of the client
 * and runs the daemon so the event can be dispatched.
 *
 * Always returns 0
 */
static int
virLXCControllerEventSendExit(virLXCControllerPtr ctrl,
                              int exitstatus)
{
    virLXCMonitorExitEventMsg exitEvent;

    VIR_DEBUG("Exit status %d (client=%p)", exitstatus, ctrl->client);

    memset(&exitEvent, 0, sizeof(exitEvent));
    if (exitstatus == 0)
        exitEvent.status = VIR_LXC_MONITOR_EXIT_STATUS_SHUTDOWN;
    else if (exitstatus == 1)
        exitEvent.status = VIR_LXC_MONITOR_EXIT_STATUS_REBOOT;
    else
        exitEvent.status = VIR_LXC_MONITOR_EXIT_STATUS_ERROR;

    virLXCControllerEventSend(ctrl,
                              VIR_LXC_MONITOR_PROC_EXIT_EVENT,
                              (xdrproc_t)xdr_virLXCMonitorExitEventMsg,
                              (void*)&exitEvent);

    if (ctrl->client) {
        VIR_DEBUG("Waiting for client to complete dispatch");
        ctrl->inShutdown = true;
        virNetServerClientDelayedClose(ctrl->client);
        virNetDaemonRun(ctrl->daemon);
    }

    VIR_DEBUG("Client has gone away");
    return 0;
}


2299 2300 2301 2302
/**
 * virLXCControllerEventSendInit:
 * @ctrl: the LXC controller
 * @initpid: pid carried in the init event
 *
 * Sends the init event carrying @initpid through
 * virLXCControllerEventSend().
 *
 * Always returns 0
 */
static int
virLXCControllerEventSendInit(virLXCControllerPtr ctrl,
                              pid_t initpid)
{
    virLXCMonitorInitEventMsg initEvent;

    VIR_DEBUG("Init pid %lld", (long long)initpid);

    /* Zero the whole message first, then fill in its single field */
    memset(&initEvent, 0, sizeof(initEvent));
    initEvent.initpid = initpid;

    virLXCControllerEventSend(ctrl,
                              VIR_LXC_MONITOR_PROC_INIT_EVENT,
                              (xdrproc_t)xdr_virLXCMonitorInitEventMsg,
                              (void*)&initEvent);

    return 0;
}


2317
static int
2318
virLXCControllerRun(virLXCControllerPtr ctrl)
2319 2320 2321 2322 2323 2324 2325
{
    int rc = -1;
    int control[2] = { -1, -1};
    int containerhandshake[2] = { -1, -1 };
    char **containerTTYPaths = NULL;
    size_t i;

2326
    if (VIR_ALLOC_N(containerTTYPaths, ctrl->nconsoles) < 0)
2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340
        goto cleanup;

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
        virReportSystemError(errno, "%s",
                             _("sockpair failed"));
        goto cleanup;
    }

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, containerhandshake) < 0) {
        virReportSystemError(errno, "%s",
                             _("socketpair failed"));
        goto cleanup;
    }

2341 2342 2343
    if (virLXCControllerSetupPrivateNS() < 0)
        goto cleanup;

2344 2345 2346
    if (virLXCControllerSetupLoopDevices(ctrl) < 0)
        goto cleanup;

2347
    if (virLXCControllerSetupResourceLimits(ctrl) < 0)
2348 2349 2350 2351 2352
        goto cleanup;

    if (virLXCControllerSetupDevPTS(ctrl) < 0)
        goto cleanup;

2353 2354 2355
    if (virLXCControllerPopulateDevices(ctrl) < 0)
        goto cleanup;

2356 2357 2358
    if (virLXCControllerSetupAllDisks(ctrl) < 0)
        goto cleanup;

2359 2360 2361
    if (virLXCControllerSetupAllHostdevs(ctrl) < 0)
        goto cleanup;

G
Gao feng 已提交
2362 2363 2364
    if (virLXCControllerSetupFuse(ctrl) < 0)
        goto cleanup;

2365 2366
    if (virLXCControllerSetupConsoles(ctrl, containerTTYPaths) < 0)
        goto cleanup;
2367

2368
    if (lxcSetPersonality(ctrl->def) < 0)
2369
        goto cleanup;
2370

2371
    if ((ctrl->initpid = lxcContainerStart(ctrl->def,
2372
                                           ctrl->securityManager,
2373 2374
                                           ctrl->nveths,
                                           ctrl->veths,
2375 2376
                                           ctrl->npassFDs,
                                           ctrl->passFDs,
2377 2378
                                           control[1],
                                           containerhandshake[1],
I
ik.nitk 已提交
2379
                                           ctrl->nsFDs,
2380 2381
                                           ctrl->nconsoles,
                                           containerTTYPaths)) < 0)
2382
        goto cleanup;
2383
    VIR_FORCE_CLOSE(control[1]);
2384
    VIR_FORCE_CLOSE(containerhandshake[1]);
2385

2386 2387 2388
    for (i = 0; i < ctrl->npassFDs; i++)
        VIR_FORCE_CLOSE(ctrl->passFDs[i]);

I
ik.nitk 已提交
2389 2390 2391 2392
    if (ctrl->nsFDs)
        for (i = 0; i < VIR_LXC_DOMAIN_NAMESPACE_LAST; i++)
            VIR_FORCE_CLOSE(ctrl->nsFDs[i]);

2393 2394 2395
    if (virLXCControllerSetupCgroupLimits(ctrl) < 0)
        goto cleanup;

2396 2397 2398
    if (virLXCControllerSetupUserns(ctrl) < 0)
        goto cleanup;

2399
    if (virLXCControllerMoveInterfaces(ctrl) < 0)
2400 2401
        goto cleanup;

2402 2403 2404
    if (virLXCControllerStartFuse(ctrl) < 0)
        goto cleanup;

2405 2406 2407
    if (lxcContainerSendContinue(control[0]) < 0) {
        virReportSystemError(errno, "%s",
                             _("Unable to send container continue message"));
2408
        goto cleanup;
2409
    }
2410

2411 2412 2413 2414 2415 2416
    if (lxcContainerWaitForContinue(containerhandshake[0]) < 0) {
        virReportSystemError(errno, "%s",
                             _("error receiving signal from container"));
        goto cleanup;
    }

2417
    /* ...and reduce our privileges */
D
Daniel P. Berrange 已提交
2418 2419 2420
    if (lxcControllerClearCapabilities() < 0)
        goto cleanup;

2421
    for (i = 0; i < ctrl->nconsoles; i++)
2422
        if (virLXCControllerConsoleSetNonblocking(&(ctrl->consoles[i])) < 0)
2423
            goto cleanup;
2424

2425 2426 2427
    if (virLXCControllerDaemonHandshake(ctrl) < 0)
        goto cleanup;

2428 2429 2430 2431 2432 2433
    /* We must not hold open a dbus connection for life
     * of LXC instance, since dbus-daemon is limited to
     * only a few 100 connections by default
     */
    virDBusCloseSystemBus();

2434
    rc = virLXCControllerMain(ctrl);
2435

2436 2437
    virLXCControllerEventSendExit(ctrl, rc);

2438
 cleanup:
2439 2440
    VIR_FORCE_CLOSE(control[0]);
    VIR_FORCE_CLOSE(control[1]);
2441 2442
    VIR_FORCE_CLOSE(containerhandshake[0]);
    VIR_FORCE_CLOSE(containerhandshake[1]);
2443

2444
    for (i = 0; i < ctrl->nconsoles; i++)
2445 2446
        VIR_FREE(containerTTYPaths[i]);
    VIR_FREE(containerTTYPaths);
2447

2448
    virLXCControllerStopInit(ctrl);
2449

2450 2451 2452 2453
    return rc;
}


2454
int main(int argc, char *argv[])
2455 2456
{
    pid_t pid;
2457
    int rc = -1;
2458
    const char *name = NULL;
2459
    size_t nveths = 0;
2460
    char **veths = NULL;
I
ik.nitk 已提交
2461
    int ns_fd[VIR_LXC_DOMAIN_NAMESPACE_LAST];
2462
    int handshakeFd = -1;
2463
    bool bg = false;
2464
    const struct option options[] = {
2465 2466 2467 2468
        { "background", 0, NULL, 'b' },
        { "name",   1, NULL, 'n' },
        { "veth",   1, NULL, 'v' },
        { "console", 1, NULL, 'c' },
2469
        { "passfd", 1, NULL, 'p' },
2470
        { "handshakefd", 1, NULL, 's' },
2471
        { "security", 1, NULL, 'S' },
I
ik.nitk 已提交
2472 2473 2474
        { "share-net", 1, NULL, 'N' },
        { "share-ipc", 1, NULL, 'I' },
        { "share-uts", 1, NULL, 'U' },
2475 2476 2477
        { "help", 0, NULL, 'h' },
        { 0, 0, 0, 0 },
    };
2478 2479
    int *ttyFDs = NULL;
    size_t nttyFDs = 0;
2480 2481
    int *passFDs = NULL;
    size_t npassFDs = 0;
2482
    virLXCControllerPtr ctrl = NULL;
2483
    size_t i;
2484
    const char *securityDriver = "none";
2485

I
ik.nitk 已提交
2486 2487 2488
    for (i = 0; i < VIR_LXC_DOMAIN_NAMESPACE_LAST; i++)
        ns_fd[i] = -1;

2489
    if (virGettextInitialize() < 0 ||
2490
        virErrorInitialize() < 0) {
E
Eric Blake 已提交
2491 2492 2493 2494
        fprintf(stderr, _("%s: initialization failed\n"), argv[0]);
        exit(EXIT_FAILURE);
    }

2495 2496 2497
    /* Initialize logging */
    virLogSetFromEnv();

2498 2499
    while (1) {
        int c;
2500

I
ik.nitk 已提交
2501
        c = getopt_long(argc, argv, "dn:v:p:m:c:s:h:S:N:I:U:",
2502
                        options, NULL);
2503 2504 2505 2506 2507 2508

        if (c == -1)
            break;

        switch (c) {
        case 'b':
2509
            bg = true;
2510 2511 2512
            break;

        case 'n':
2513
            name = optarg;
2514 2515 2516
            break;

        case 'v':
2517
            if (VIR_REALLOC_N(veths, nveths+1) < 0)
2518
                goto cleanup;
2519
            veths[nveths++] = g_strdup(optarg);
2520 2521 2522
            break;

        case 'c':
2523
            if (VIR_REALLOC_N(ttyFDs, nttyFDs + 1) < 0)
2524 2525
                goto cleanup;
            if (virStrToLong_i(optarg, NULL, 10, &ttyFDs[nttyFDs++]) < 0) {
2526 2527 2528 2529 2530
                fprintf(stderr, "malformed --console argument '%s'", optarg);
                goto cleanup;
            }
            break;

2531 2532 2533 2534 2535 2536 2537 2538 2539
        case 'p':
            if (VIR_REALLOC_N(passFDs, npassFDs + 1) < 0)
                goto cleanup;
            if (virStrToLong_i(optarg, NULL, 10, &passFDs[npassFDs++]) < 0) {
                fprintf(stderr, "malformed --passfd argument '%s'", optarg);
                goto cleanup;
            }
            break;

2540
        case 's':
2541
            if (virStrToLong_i(optarg, NULL, 10, &handshakeFd) < 0) {
2542 2543 2544 2545 2546 2547
                fprintf(stderr, "malformed --handshakefd argument '%s'",
                        optarg);
                goto cleanup;
            }
            break;

I
ik.nitk 已提交
2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571
        case 'N':
            if (virStrToLong_i(optarg, NULL, 10, &ns_fd[VIR_LXC_DOMAIN_NAMESPACE_SHARENET]) < 0) {
                fprintf(stderr, "malformed --share-net argument '%s'",
                        optarg);
                goto cleanup;
            }
            break;

        case 'I':
            if (virStrToLong_i(optarg, NULL, 10, &ns_fd[VIR_LXC_DOMAIN_NAMESPACE_SHAREIPC]) < 0) {
                fprintf(stderr, "malformed --share-ipc argument '%s'",
                        optarg);
                goto cleanup;
            }
            break;

        case 'U':
            if (virStrToLong_i(optarg, NULL, 10, &ns_fd[VIR_LXC_DOMAIN_NAMESPACE_SHAREUTS]) < 0) {
                fprintf(stderr, "malformed --share-uts argument '%s'",
                        optarg);
                goto cleanup;
            }
            break;

2572
        case 'S':
2573
            securityDriver = optarg;
2574 2575
            break;

2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586
        case 'h':
        case '?':
            fprintf(stderr, "\n");
            fprintf(stderr, "syntax: %s [OPTIONS]\n", argv[0]);
            fprintf(stderr, "\n");
            fprintf(stderr, "Options\n");
            fprintf(stderr, "\n");
            fprintf(stderr, "  -b, --background\n");
            fprintf(stderr, "  -n NAME, --name NAME\n");
            fprintf(stderr, "  -c FD, --console FD\n");
            fprintf(stderr, "  -v VETH, --veth VETH\n");
2587
            fprintf(stderr, "  -s FD, --handshakefd FD\n");
2588
            fprintf(stderr, "  -S NAME, --security NAME\n");
I
ik.nitk 已提交
2589 2590 2591
            fprintf(stderr, "  -N FD, --share-net FD\n");
            fprintf(stderr, "  -I FD, --share-ipc FD\n");
            fprintf(stderr, "  -U FD, --share-uts FD\n");
2592 2593
            fprintf(stderr, "  -h, --help\n");
            fprintf(stderr, "\n");
2594
            rc = 0;
2595
            goto cleanup;
2596 2597 2598
        }
    }

2599 2600 2601 2602 2603
    if (name == NULL) {
        fprintf(stderr, "%s: missing --name argument for configuration\n", argv[0]);
        goto cleanup;
    }

2604
    if (handshakeFd < 0) {
2605
        fprintf(stderr, "%s: missing --handshakefd argument for container PTY\n",
2606 2607 2608 2609
                argv[0]);
        goto cleanup;
    }

2610
    if (geteuid() != 0) {
2611 2612 2613
        fprintf(stderr, "%s: must be run as the 'root' user\n", argv[0]);
        goto cleanup;
    }
2614

2615
    virEventRegisterDefaultImpl();
2616 2617

    virDBusSetSharedBus(false);
2618

2619
    if (!(ctrl = virLXCControllerNew(name)))
2620
        goto cleanup;
2621

2622 2623
    ctrl->handshakeFd = handshakeFd;

2624
    if (!(ctrl->securityManager = virSecurityManagerNew(securityDriver,
2625
                                                        LXC_DRIVER_NAME, 0)))
2626 2627
        goto cleanup;

2628 2629 2630 2631 2632 2633 2634 2635 2636
    if (ctrl->def->seclabels) {
        VIR_DEBUG("Security model %s type %s label %s imagelabel %s",
                  NULLSTR(ctrl->def->seclabels[0]->model),
                  virDomainSeclabelTypeToString(ctrl->def->seclabels[0]->type),
                  NULLSTR(ctrl->def->seclabels[0]->label),
                  NULLSTR(ctrl->def->seclabels[0]->imagelabel));
    } else {
        VIR_DEBUG("Security model not initialized");
    }
2637

2638 2639 2640
    ctrl->veths = veths;
    ctrl->nveths = nveths;

2641 2642 2643
    ctrl->passFDs = passFDs;
    ctrl->npassFDs = npassFDs;

I
ik.nitk 已提交
2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656
    for (i = 0; i < VIR_LXC_DOMAIN_NAMESPACE_LAST; i++) {
        if (ns_fd[i] != -1) {
            if (!ctrl->nsFDs) {/*allocate only once */
                size_t j = 0;
                if (VIR_ALLOC_N(ctrl->nsFDs, VIR_LXC_DOMAIN_NAMESPACE_LAST) < 0)
                    goto cleanup;
                for (j = 0; j < VIR_LXC_DOMAIN_NAMESPACE_LAST; j++)
                    ctrl->nsFDs[j] = -1;
            }
            ctrl->nsFDs[i] = ns_fd[i];
        }
    }

2657
    for (i = 0; i < nttyFDs; i++) {
2658 2659 2660 2661 2662
        if (virLXCControllerAddConsole(ctrl, ttyFDs[i]) < 0)
            goto cleanup;
        ttyFDs[i] = -1;
    }

2663
    if (virLXCControllerValidateNICs(ctrl) < 0)
2664
        goto cleanup;
2665

2666 2667 2668
    if (virLXCControllerGetNICIndexes(ctrl) < 0)
        goto cleanup;

2669 2670 2671
    if (virLXCControllerValidateConsoles(ctrl) < 0)
        goto cleanup;

2672
    if (virLXCControllerSetupServer(ctrl) < 0)
2673
        goto cleanup;
2674

2675 2676 2677
    if (bg) {
        if ((pid = fork()) < 0)
            goto cleanup;
2678

2679
        if (pid > 0) {
2680
            if ((rc = virPidFileWrite(LXC_STATE_DIR, name, pid)) < 0) {
2681
                virReportSystemError(-rc,
2682 2683
                                     _("Unable to write pid file '%s/%s.pid'"),
                                     LXC_STATE_DIR, name);
2684 2685
                _exit(1);
            }
2686

2687 2688 2689 2690
            /* First child now exits, allowing original caller
             * (ie libvirtd's LXC driver to complete their
             * waitpid & continue */
            _exit(0);
2691 2692
        }

E
Eric Blake 已提交
2693
        /* Don't hold on to any cwd we inherit from libvirtd either */
2694
        if (chdir("/") < 0) {
2695
            virReportSystemError(errno, "%s",
2696
                                 _("Unable to change to root dir"));
2697 2698 2699 2700
            goto cleanup;
        }

        if (setsid() < 0) {
2701
            virReportSystemError(errno, "%s",
2702
                                 _("Unable to become session leader"));
2703 2704 2705
            goto cleanup;
        }
    }
2706

2707
    rc = virLXCControllerRun(ctrl);
2708

2709
 cleanup:
2710
    if (rc < 0) {
2711 2712 2713
        fprintf(stderr,
                _("Failure in libvirt_lxc startup: %s\n"),
                virGetLastErrorMessage());
2714 2715
    }

2716
    virPidFileDelete(LXC_STATE_DIR, name);
2717 2718
    if (ctrl)
        virLXCControllerDeleteInterfaces(ctrl);
2719
    for (i = 0; i < nttyFDs; i++)
2720 2721
        VIR_FORCE_CLOSE(ttyFDs[i]);
    VIR_FREE(ttyFDs);
2722 2723 2724
    for (i = 0; i < npassFDs; i++)
        VIR_FORCE_CLOSE(passFDs[i]);
    VIR_FREE(passFDs);
2725

2726
    virLXCControllerFree(ctrl);
2727

2728
    return rc < 0? EXIT_FAILURE : EXIT_SUCCESS;
2729
}