提交 bb4e5296 编写于 作者: M Michal Privoznik

qemu: Spawn qemu under mount namespace

Prime time. When it comes to spawning qemu process and
relabelling all the devices it's going to touch, there's inherent
race with other applications in the system (e.g. udev). Instead
of trying convincing udev to not touch libvirt managed devices,
we can create a separate mount namespace for the qemu, and mount
our own /dev there. Of course this puts more work onto us as we
have to maintain /dev files on each domain start and device
hot(un-)plug. On the other hand, this enhances security also.

From technical POV, on domain startup process the parent
(libvirtd) creates:

  /var/lib/libvirt/qemu/$domain.dev
  /var/lib/libvirt/qemu/$domain.devpts

The child (which is going to be qemu eventually) calls unshare()
to create new mount namespace. From now on anything that child
does is invisible to the parent. Child then mounts tmpfs on
$domain.dev (so that it still sees original /dev from the host)
and creates some devices (as explained in one of the previous
patches). The devices have to be created exactly as they are in
the host (including perms, seclabels, ACLs, ...). After that it
moves $domain.dev mount to /dev.

What's the $domain.devpts mount there for then you ask? QEMU can
create PTYs for some chardevs. And historically we exposed the
host ends in our domain XML allowing users to connect to them.
Therefore we must preserve devpts mount to be shared with the
host's one.

To make this patch as small as possible, creating of devices
configured for domain in question is implemented in next patches.
Signed-off-by: NMichal Privoznik <mprivozn@redhat.com>
上级 a5896e8c
......@@ -55,6 +55,9 @@
#include <sys/time.h>
#include <fcntl.h>
#if defined(HAVE_SYS_MOUNT_H)
# include <sys/mount.h>
#endif
#include <libxml/xpathInternals.h>
......@@ -86,6 +89,20 @@ VIR_ENUM_IMPL(qemuDomainAsyncJob, QEMU_ASYNC_JOB_LAST,
"start",
);
VIR_ENUM_IMPL(qemuDomainNamespace, QEMU_DOMAIN_NS_LAST,
"mount",
);
static struct {
const char *path;
const char *suffix;
} devPreserveMounts[] = {
{"/dev/pts", "devpts"},
{"/dev/shm", "devshm"},
{"/dev/mqueue", "mqueue"},
};
struct _qemuDomainLogContext {
int refs;
......@@ -146,6 +163,70 @@ qemuDomainAsyncJobPhaseFromString(qemuDomainAsyncJob job,
}
bool
qemuDomainNamespaceEnabled(virDomainObjPtr vm,
qemuDomainNamespace ns)
{
qemuDomainObjPrivatePtr priv = vm->privateData;
return priv->namespaces &&
virBitmapIsBitSet(priv->namespaces, ns);
}
static int
qemuDomainEnableNamespace(virDomainObjPtr vm,
qemuDomainNamespace ns)
{
qemuDomainObjPrivatePtr priv = vm->privateData;
if (!priv->namespaces &&
!(priv->namespaces = virBitmapNew(QEMU_DOMAIN_NS_LAST)))
return -1;
if (virBitmapSetBit(priv->namespaces, ns) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Unable to enable namespace: %s"),
qemuDomainNamespaceTypeToString(ns));
return -1;
}
return 0;
}
static int
qemuDomainGetPreservedMounts(virQEMUDriverPtr driver,
virDomainObjPtr vm,
char ***devMountsPath,
size_t *ndevMountsPath)
{
virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
char **paths;
size_t i;
if (VIR_ALLOC_N(paths, ARRAY_CARDINALITY(devPreserveMounts)) < 0)
goto error;
for (i = 0; i < ARRAY_CARDINALITY(devPreserveMounts); i++) {
if (virAsprintf(&paths[i], "%s/%s.%s",
cfg->stateDir, vm->def->name,
devPreserveMounts[i].suffix) < 0)
goto error;
}
*devMountsPath = paths;
*ndevMountsPath = ARRAY_CARDINALITY(devPreserveMounts);
virObjectUnref(cfg);
return 0;
error:
virStringListFreeCount(paths, ARRAY_CARDINALITY(devPreserveMounts));
virObjectUnref(cfg);
return -1;
}
void qemuDomainEventQueue(virQEMUDriverPtr driver,
virObjectEventPtr event)
{
......@@ -1541,6 +1622,8 @@ qemuDomainObjPrivateFree(void *data)
virObjectUnref(priv->qemuCaps);
virBitmapFree(priv->namespaces);
virCgroupFree(&priv->cgroup);
virDomainPCIAddressSetFree(priv->pciaddrs);
virDomainUSBAddressSetFree(priv->usbaddrs);
......@@ -1627,6 +1710,17 @@ qemuDomainObjPrivateXMLFormat(virBufferPtr buf,
virDomainChrTypeToString(priv->monConfig->type));
}
if (priv->namespaces) {
ssize_t ns = -1;
virBufferAddLit(buf, "<namespaces>\n");
virBufferAdjustIndent(buf, 2);
while ((ns = virBitmapNextSetBit(priv->namespaces, ns)) >= 0)
virBufferAsprintf(buf, "<%s/>\n", qemuDomainNamespaceTypeToString(ns));
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</namespaces>\n");
}
qemuDomainObjPrivateXMLFormatVcpus(buf, vm->def);
if (priv->qemuCaps) {
......@@ -1771,6 +1865,7 @@ qemuDomainObjPrivateXMLParse(xmlXPathContextPtr ctxt,
int n;
size_t i;
xmlNodePtr *nodes = NULL;
xmlNodePtr node = NULL;
virQEMUCapsPtr qemuCaps = NULL;
virCapsPtr caps = NULL;
......@@ -1809,6 +1904,30 @@ qemuDomainObjPrivateXMLParse(xmlXPathContextPtr ctxt,
goto error;
}
if ((node = virXPathNode("./namespaces", ctxt))) {
xmlNodePtr next;
for (next = node->children; next; next = next->next) {
int ns = qemuDomainNamespaceTypeFromString((const char *) next->name);
if (ns < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("malformed namespace name: %s"),
next->name);
goto error;
}
if (qemuDomainEnableNamespace(vm, ns) < 0)
goto error;
}
}
if (priv->namespaces &&
virBitmapIsAllClear(priv->namespaces)) {
virBitmapFree(priv->namespaces);
priv->namespaces = NULL;
}
if ((n = virXPathNodeSet("./vcpus/vcpu", ctxt, &nodes)) < 0)
goto error;
......@@ -1959,10 +2078,12 @@ qemuDomainObjPrivateXMLParse(xmlXPathContextPtr ctxt,
return 0;
error:
virDomainChrSourceDefFree(priv->monConfig);
priv->monConfig = NULL;
VIR_FREE(nodes);
VIR_FREE(tmp);
virBitmapFree(priv->namespaces);
priv->namespaces = NULL;
virDomainChrSourceDefFree(priv->monConfig);
priv->monConfig = NULL;
virStringListFree(priv->qemuDevices);
priv->qemuDevices = NULL;
virObjectUnref(qemuCaps);
......@@ -6653,3 +6774,319 @@ qemuDomainSupportsVideoVga(virDomainVideoDefPtr video,
return true;
}
static int
qemuDomainCreateDevice(const char *device,
const char *path,
bool allow_noent)
{
char *devicePath = NULL;
struct stat sb;
int ret = -1;
if (!STRPREFIX(device, "/dev")) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("invalid device: %s"),
device);
goto cleanup;
}
if (virAsprintf(&devicePath, "%s/%s",
path, device + 4) < 0)
goto cleanup;
if (stat(device, &sb) < 0) {
if (errno == ENOENT && allow_noent) {
/* Ignore non-existent device. */
ret = 0;
goto cleanup;
}
virReportSystemError(errno, _("Unable to stat %s"), device);
goto cleanup;
}
if (virFileMakeParentPath(devicePath) < 0) {
virReportSystemError(errno,
_("Unable to create %s"),
devicePath);
goto cleanup;
}
if (mknod(devicePath, sb.st_mode, sb.st_rdev) < 0) {
virReportSystemError(errno,
_("Failed to make device %s"),
devicePath);
goto cleanup;
}
if (chown(devicePath, sb.st_uid, sb.st_gid) < 0) {
virReportSystemError(errno,
_("Failed to chown device %s"),
devicePath);
goto cleanup;
}
if (virFileCopyACLs(device, devicePath) < 0 &&
errno != ENOTSUP) {
virReportSystemError(errno,
_("Failed to copy ACLs on device %s"),
devicePath);
goto cleanup;
}
ret = 0;
cleanup:
VIR_FREE(devicePath);
return ret;
}
static int
qemuDomainPopulateDevices(virQEMUDriverPtr driver,
virDomainObjPtr vm ATTRIBUTE_UNUSED,
const char *path)
{
virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
const char *const *devices = (const char *const *) cfg->cgroupDeviceACL;
size_t i;
int ret = -1;
if (!devices)
devices = defaultDeviceACL;
for (i = 0; devices[i]; i++) {
if (qemuDomainCreateDevice(devices[i], path, true) < 0)
goto cleanup;
}
ret = 0;
cleanup:
virObjectUnref(cfg);
return ret;
}
static int
qemuDomainSetupDev(virQEMUDriverPtr driver,
virDomainObjPtr vm,
const char *path)
{
char *mount_options = NULL;
char *opts = NULL;
int ret = -1;
VIR_DEBUG("Setting up /dev/ for domain %s", vm->def->name);
mount_options = virSecurityManagerGetMountOptions(driver->securityManager,
vm->def);
if (!mount_options &&
VIR_STRDUP(mount_options, "") < 0)
goto cleanup;
/*
* tmpfs is limited to 64kb, since we only have device nodes in there
* and don't want to DOS the entire OS RAM usage
*/
if (virAsprintf(&opts,
"mode=755,size=65536%s", mount_options) < 0)
goto cleanup;
if (virFileSetupDev(path, opts) < 0)
goto cleanup;
if (qemuDomainPopulateDevices(driver, vm, path) < 0)
goto cleanup;
ret = 0;
cleanup:
VIR_FREE(opts);
VIR_FREE(mount_options);
return ret;
}
int
qemuDomainBuildNamespace(virQEMUDriverPtr driver,
virDomainObjPtr vm)
{
virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
const unsigned long mount_flags = MS_MOVE;
char *devPath = NULL;
char **devMountsPath = NULL;
size_t ndevMountsPath = 0, i;
int ret = -1;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT)) {
ret = 0;
goto cleanup;
}
if (virAsprintf(&devPath, "%s/%s.dev",
cfg->stateDir, vm->def->name) < 0)
goto cleanup;
if (qemuDomainGetPreservedMounts(driver, vm,
&devMountsPath, &ndevMountsPath) < 0)
goto cleanup;
if (qemuDomainSetupDev(driver, vm, devPath) < 0)
goto cleanup;
/* Save some mount points because we want to share them with the host */
for (i = 0; i < ndevMountsPath; i++) {
if (mount(devPreserveMounts[i].path, devMountsPath[i],
NULL, mount_flags, NULL) < 0) {
virReportSystemError(errno,
_("Unable to move %s mount"),
devPreserveMounts[i].path);
goto cleanup;
}
}
if (mount(devPath, "/dev", NULL, mount_flags, NULL) < 0) {
virReportSystemError(errno,
_("Failed to mount %s on /dev"),
devPath);
goto cleanup;
}
for (i = 0; i < ndevMountsPath; i++) {
if (virFileMakePath(devPreserveMounts[i].path) < 0) {
virReportSystemError(errno, _("Cannot create %s"),
devPreserveMounts[i].path);
goto cleanup;
}
if (mount(devMountsPath[i], devPreserveMounts[i].path,
NULL, mount_flags, NULL) < 0) {
virReportSystemError(errno,
_("Failed to mount %s on %s"),
devMountsPath[i],
devPreserveMounts[i].path);
goto cleanup;
}
}
ret = 0;
cleanup:
virObjectUnref(cfg);
VIR_FREE(devPath);
virStringListFreeCount(devMountsPath, ndevMountsPath);
return ret;
}
#if defined(__linux__)
int
qemuDomainCreateNamespace(virQEMUDriverPtr driver,
virDomainObjPtr vm)
{
virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
int ret = -1;
char *devPath = NULL;
char **devMountsPath = NULL;
size_t ndevMountsPath = 0, i;
if (!virQEMUDriverIsPrivileged(driver)) {
ret = 0;
goto cleanup;
}
if (virAsprintf(&devPath, "%s/%s.dev",
cfg->stateDir, vm->def->name) < 0)
goto cleanup;
if (qemuDomainGetPreservedMounts(driver, vm,
&devMountsPath, &ndevMountsPath) < 0)
goto cleanup;
if (virFileMakePath(devPath) < 0) {
virReportSystemError(errno,
_("Failed to create %s"),
devPath);
goto cleanup;
}
for (i = 0; i < ndevMountsPath; i++) {
if (virFileMakePath(devMountsPath[i]) < 0) {
virReportSystemError(errno,
_("Failed to create %s"),
devMountsPath[i]);
goto cleanup;
}
}
/* Enabling of the mount namespace goes here. */
ret = 0;
cleanup:
if (ret < 0) {
if (devPath)
unlink(devPath);
for (i = 0; i < ndevMountsPath; i++)
unlink(devMountsPath[i]);
}
virStringListFreeCount(devMountsPath, ndevMountsPath);
VIR_FREE(devPath);
virObjectUnref(cfg);
return ret;
}
#else /* !defined(__linux__) */
int
qemuDomainCreateNamespace(virQEMUDriverPtr driver ATTRIBUTE_UNUSED,
virDomainObjPtr vm ATTRIBUTE_UNUSED)
{
/* Namespaces are Linux specific. On other platforms just
* carry on with the old behaviour. */
return 0;
}
#endif /* !defined(__linux__) */
void
qemuDomainDeleteNamespace(virQEMUDriverPtr driver,
virDomainObjPtr vm)
{
virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
char *devPath = NULL;
char **devMountsPath = NULL;
size_t ndevMountsPath = 0, i;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return;
if (virAsprintf(&devPath, "%s/%s.dev",
cfg->stateDir, vm->def->name) < 0)
goto cleanup;
if (qemuDomainGetPreservedMounts(driver, vm,
&devMountsPath, &ndevMountsPath) < 0)
goto cleanup;
if (rmdir(devPath) < 0) {
virReportSystemError(errno,
_("Unable to remove %s"),
devPath);
/* Bet effort. Fall through. */
}
for (i = 0; i < ndevMountsPath; i++) {
if (rmdir(devMountsPath[i]) < 0) {
virReportSystemError(errno,
_("Unable to remove %s"),
devMountsPath[i]);
/* Bet effort. Fall through. */
}
}
cleanup:
virObjectUnref(cfg);
virStringListFreeCount(devMountsPath, ndevMountsPath);
VIR_FREE(devPath);
}
......@@ -165,11 +165,23 @@ struct _qemuDomainUnpluggingDevice {
qemuDomainUnpluggingDeviceStatus status;
};
typedef enum {
QEMU_DOMAIN_NS_MOUNT = 0,
QEMU_DOMAIN_NS_LAST
} qemuDomainNamespace;
VIR_ENUM_DECL(qemuDomainNamespace)
bool qemuDomainNamespaceEnabled(virDomainObjPtr vm,
qemuDomainNamespace ns);
typedef struct _qemuDomainObjPrivate qemuDomainObjPrivate;
typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr;
struct _qemuDomainObjPrivate {
struct qemuDomainJobObj job;
virBitmapPtr namespaces;
qemuMonitorPtr mon;
virDomainChrSourceDefPtr monConfig;
bool monJSON;
......@@ -785,4 +797,12 @@ int qemuDomainCheckMonitor(virQEMUDriverPtr driver,
bool qemuDomainSupportsVideoVga(virDomainVideoDefPtr video,
virQEMUCapsPtr qemuCaps);
int qemuDomainBuildNamespace(virQEMUDriverPtr driver,
virDomainObjPtr vm);
int qemuDomainCreateNamespace(virQEMUDriverPtr driver,
virDomainObjPtr vm);
void qemuDomainDeleteNamespace(virQEMUDriverPtr driver,
virDomainObjPtr vm);
#endif /* __QEMU_DOMAIN_H__ */
......@@ -2673,6 +2673,12 @@ static int qemuProcessHook(void *data)
if (virSecurityManagerClearSocketLabel(h->driver->securityManager, h->vm->def) < 0)
goto cleanup;
if (virProcessSetupPrivateMountNS() < 0)
goto cleanup;
if (qemuDomainBuildNamespace(h->driver, h->vm) < 0)
goto cleanup;
if (virDomainNumatuneGetMode(h->vm->def->numa, -1, &mode) == 0) {
if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
h->cfg->cgroupControllers & (1 << VIR_CGROUP_CONTROLLER_CPUSET) &&
......@@ -5498,6 +5504,11 @@ qemuProcessLaunch(virConnectPtr conn,
qemuDomainLogContextMarkPosition(logCtxt);
VIR_DEBUG("Building mount namespace");
if (qemuDomainCreateNamespace(driver, vm) < 0)
goto cleanup;
VIR_DEBUG("Clear emulator capabilities: %d",
cfg->clearEmulatorCapabilities);
if (cfg->clearEmulatorCapabilities)
......@@ -6269,6 +6280,8 @@ void qemuProcessStop(virQEMUDriverPtr driver,
}
}
qemuDomainDeleteNamespace(driver, vm);
vm->taint = 0;
vm->pid = -1;
virDomainObjSetState(vm, VIR_DOMAIN_SHUTOFF, reason);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册