virpci.c 80.8 KB
Newer Older
1
/*
2 3
 * virpci.c: helper APIs for managing host PCI devices
 *
4
 * Copyright (C) 2009-2015 Red Hat, Inc.
5 6 7 8 9 10 11 12 13 14 15 16
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library.  If not, see
O
Osier Yang 已提交
18
 * <http://www.gnu.org/licenses/>.
19 20 21 22 23 24 25
 *
 * Authors:
 *     Mark McLoughlin <markmc@redhat.com>
 */

#include <config.h>

26
#include "virpci.h"
27 28 29 30 31 32 33 34 35 36

#include <dirent.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
37
#include <stdlib.h>
38

39
#include "dirname.h"
40
#include "virlog.h"
41
#include "viralloc.h"
42
#include "vircommand.h"
43
#include "virerror.h"
E
Eric Blake 已提交
44
#include "virfile.h"
45
#include "virkmod.h"
46 47
#include "virstring.h"
#include "virutil.h"
48

49 50
VIR_LOG_INIT("util.pci");

51 52 53 54
#define PCI_SYSFS "/sys/bus/pci/"
#define PCI_ID_LEN 10   /* "XXXX XXXX" */
#define PCI_ADDR_LEN 13 /* "XXXX:XX:XX.X" */

55 56 57
VIR_ENUM_IMPL(virPCIELinkSpeed, VIR_PCIE_LINK_SPEED_LAST,
              "", "2.5", "5", "8")

58 59 60 61 62 63 64
VIR_ENUM_IMPL(virPCIStubDriver, VIR_PCI_STUB_DRIVER_LAST,
              "none",
              "pciback", /* XEN */
              "pci-stub", /* KVM */
              "vfio-pci", /* VFIO */
);

65
struct _virPCIDevice {
66
    virPCIDeviceAddress address;
67 68 69

    char          name[PCI_ADDR_LEN]; /* domain:bus:slot.function */
    char          id[PCI_ID_LEN];     /* product vendor */
E
Eric Blake 已提交
70
    char          *path;
C
Chunyan Liu 已提交
71 72 73 74

    /* The driver:domain which uses the device */
    char          *used_by_drvname;
    char          *used_by_domname;
75

76 77
    unsigned int  pcie_cap_pos;
    unsigned int  pci_pm_cap_pos;
78 79
    bool          has_flr;
    bool          has_pm_reset;
80
    bool          managed;
81 82

    virPCIStubDriver stubDriver;
83 84

    /* used by reattach function */
85 86 87
    bool          unbind_from_stub;
    bool          remove_slot;
    bool          reprobe;
88 89
};

90
struct _virPCIDeviceList {
91 92
    virObjectLockable parent;

93
    size_t count;
94
    virPCIDevicePtr *devs;
95 96 97
};


98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
/* For virReportOOMError()  and virReportSystemError() */
#define VIR_FROM_THIS VIR_FROM_NONE

/* Specifications referenced in comments:
 *  PCI30  - PCI Local Bus Specification 3.0
 *  PCIe20 - PCI Express Base Specification 2.0
 *  BR12   - PCI-to-PCI Bridge Architecture Specification 1.2
 *  PM12   - PCI Bus Power Management Interface Specification 1.2
 *  ECN_AF - Advanced Capabilities for Conventional PCI ECN
 */

/* Type 0 config space header length; PCI30 Section 6.1 Configuration Space Organization */
#define PCI_CONF_LEN            0x100
#define PCI_CONF_HEADER_LEN     0x40

/* PCI30 6.2.1 */
#define PCI_HEADER_TYPE         0x0e    /* Header type */
115 116 117
#define PCI_HEADER_TYPE_BRIDGE 0x1
#define PCI_HEADER_TYPE_MASK   0x7f
#define PCI_HEADER_TYPE_MULTI  0x80
118 119 120 121 122 123 124 125 126

/* PCI30 6.2.1  Device Identification */
#define PCI_CLASS_DEVICE        0x0a    /* Device class */

/* Class Code for bridge; PCI30 D.7  Base Class 06h */
#define PCI_CLASS_BRIDGE_PCI    0x0604

/* PCI30 6.2.3  Device Status */
#define PCI_STATUS              0x06    /* 16 bits */
127
#define PCI_STATUS_CAP_LIST    0x10    /* Support Capability List */
128 129 130

/* PCI30 6.7  Capabilities List */
#define PCI_CAPABILITY_LIST     0x34    /* Offset of first capability list entry */
131
#define PCI_CAP_FLAGS           2       /* Capability defined flags (16 bits) */
132 133 134 135 136 137 138 139 140 141

/* PM12 3.2.1  Capability Identifier */
#define PCI_CAP_ID_PM           0x01    /* Power Management */
/* PCI30 H Capability IDs */
#define PCI_CAP_ID_EXP          0x10    /* PCI Express */
/* ECN_AF 6.x.1.1  Capability ID for AF */
#define PCI_CAP_ID_AF           0x13    /* Advanced Features */

/* PCIe20 7.8.3  Device Capabilities Register (Offset 04h) */
#define PCI_EXP_DEVCAP          0x4     /* Device capabilities */
142 143 144 145 146 147 148
#define PCI_EXP_DEVCAP_FLR     (1<<28)  /* Function Level Reset */
#define PCI_EXP_LNKCAP          0xc     /* Link Capabilities */
#define PCI_EXP_LNKCAP_SPEED    0x0000f /* Maximum Link Speed */
#define PCI_EXP_LNKCAP_WIDTH    0x003f0 /* Maximum Link Width */
#define PCI_EXP_LNKSTA          0x12    /* Link Status */
#define PCI_EXP_LNKSTA_SPEED    0x000f  /* Negotiated Link Speed */
#define PCI_EXP_LNKSTA_WIDTH    0x03f0  /* Negotiated Link Width */
149 150 151 152 153 154 155

/* Header type 1 BR12 3.2 PCI-to-PCI Bridge Configuration Space Header Format */
#define PCI_PRIMARY_BUS         0x18    /* BR12 3.2.5.2 Primary bus number */
#define PCI_SECONDARY_BUS       0x19    /* BR12 3.2.5.3 Secondary bus number */
#define PCI_SUBORDINATE_BUS     0x1a    /* BR12 3.2.5.4 Highest bus number behind the bridge */
#define PCI_BRIDGE_CONTROL      0x3e
/* BR12 3.2.5.18  Bridge Control Register */
156
#define PCI_BRIDGE_CTL_RESET   0x40    /* Secondary bus reset */
157 158 159

/* PM12 3.2.4  Power Management Control/Status (Offset = 4) */
#define PCI_PM_CTRL                4    /* PM control and status register */
160 161 162 163
#define PCI_PM_CTRL_STATE_MASK    0x3  /* Current power state (D0 to D3) */
#define PCI_PM_CTRL_STATE_D0      0x0  /* D0 state */
#define PCI_PM_CTRL_STATE_D3hot   0x3  /* D3 state */
#define PCI_PM_CTRL_NO_SOFT_RESET 0x8  /* No reset for D3hot->D0 */
164 165 166

/* ECN_AF 6.x.1  Advanced Features Capability Structure */
#define PCI_AF_CAP              0x3     /* Advanced features capabilities */
167
#define PCI_AF_CAP_FLR         0x2     /* Function Level Reset */
168

J
Jiri Denemark 已提交
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
#define PCI_EXP_FLAGS           0x2
#define PCI_EXP_FLAGS_TYPE      0x00f0
#define PCI_EXP_TYPE_DOWNSTREAM 0x6

#define PCI_EXT_CAP_BASE          0x100
#define PCI_EXT_CAP_LIMIT         0x1000
#define PCI_EXT_CAP_ID_MASK       0x0000ffff
#define PCI_EXT_CAP_OFFSET_SHIFT  20
#define PCI_EXT_CAP_OFFSET_MASK   0x00000ffc

#define PCI_EXT_CAP_ID_ACS      0x000d
#define PCI_EXT_ACS_CTRL        0x06

#define PCI_EXT_CAP_ACS_SV      0x01
#define PCI_EXT_CAP_ACS_RR      0x04
#define PCI_EXT_CAP_ACS_CR      0x08
#define PCI_EXT_CAP_ACS_UF      0x10
186 187 188
#define PCI_EXT_CAP_ACS_ENABLED (PCI_EXT_CAP_ACS_SV |   \
                                 PCI_EXT_CAP_ACS_RR |   \
                                 PCI_EXT_CAP_ACS_CR |   \
J
Jiri Denemark 已提交
189 190
                                 PCI_EXT_CAP_ACS_UF)

191 192 193
#define PCI_EXP_TYPE_ROOT_INT_EP 0x9    /* Root Complex Integrated Endpoint */
#define PCI_EXP_TYPE_ROOT_EC 0xa        /* Root Complex Event Collector */

194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
static virClassPtr virPCIDeviceListClass;

static void virPCIDeviceListDispose(void *obj);

/* One-time initializer: creates the virPCIDeviceList class as a child
 * of the lockable object class.
 *
 * Returns 0 on success, -1 if the class could not be created.
 */
static int virPCIOnceInit(void)
{
    virPCIDeviceListClass = virClassNew(virClassForObjectLockable(),
                                        "virPCIDeviceList",
                                        sizeof(virPCIDeviceList),
                                        virPCIDeviceListDispose);

    return virPCIDeviceListClass ? 0 : -1;
}

VIR_ONCE_GLOBAL_INIT(virPCI)

L
Laine Stump 已提交
211

212 213
static char *
virPCIDriverDir(const char *driver)
L
Laine Stump 已提交
214
{
215
    char *buffer;
L
Laine Stump 已提交
216

217 218
    ignore_value(virAsprintf(&buffer, PCI_SYSFS "drivers/%s", driver));
    return buffer;
L
Laine Stump 已提交
219 220 221
}


222 223
static char *
virPCIDriverFile(const char *driver, const char *file)
L
Laine Stump 已提交
224
{
225
    char *buffer;
L
Laine Stump 已提交
226

227 228
    ignore_value(virAsprintf(&buffer, PCI_SYSFS "drivers/%s/%s", driver, file));
    return buffer;
L
Laine Stump 已提交
229 230 231
}


232 233
static char *
virPCIFile(const char *device, const char *file)
L
Laine Stump 已提交
234
{
235
    char *buffer;
L
Laine Stump 已提交
236

237 238
    ignore_value(virAsprintf(&buffer, PCI_SYSFS "devices/%s/%s", device, file));
    return buffer;
L
Laine Stump 已提交
239 240 241 242 243 244 245 246 247 248
}


/* virPCIDeviceGetDriverPathAndName - put the path to the driver
 * directory of the driver in use for this device in @path and the
 * name of the driver in @name. Both could be NULL if it's not bound
 * to any driver.
 *
 * Return 0 for success, -1 for error.
 */
249
int
L
Laine Stump 已提交
250 251 252 253 254 255 256
virPCIDeviceGetDriverPathAndName(virPCIDevicePtr dev, char **path, char **name)
{
    int ret = -1;
    char *drvlink = NULL;

    *path = *name = NULL;
    /* drvlink = "/sys/bus/pci/dddd:bb:ss.ff/driver" */
257
    if (!(drvlink = virPCIFile(dev->name, "driver")))
L
Laine Stump 已提交
258 259
        goto cleanup;

260 261 262 263 264
    if (!virFileExists(drvlink)) {
        ret = 0;
        goto cleanup;
    }

L
Laine Stump 已提交
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
    if (virFileIsLink(drvlink) != 1) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid device %s driver file %s is not a symlink"),
                       dev->name, drvlink);
        goto cleanup;
    }
    if (virFileResolveLink(drvlink, path) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s driver symlink %s"),
                       dev->name, drvlink);
        goto cleanup;
    }
    /* path = "/sys/bus/pci/drivers/${drivername}" */

    if (VIR_STRDUP(*name, last_component(*path)) < 0)
        goto cleanup;
    /* name = "${drivername}" */

    ret = 0;
284
 cleanup:
L
Laine Stump 已提交
285 286 287 288 289 290 291 292 293
    VIR_FREE(drvlink);
    if (ret < 0) {
        VIR_FREE(*path);
        VIR_FREE(*name);
    }
    return ret;
}


294
static int
295
virPCIDeviceConfigOpen(virPCIDevicePtr dev, bool fatal)
296 297 298 299
{
    int fd;

    fd = open(dev->path, O_RDWR);
300

301
    if (fd < 0) {
302 303 304 305 306 307 308 309 310
        if (fatal) {
            virReportSystemError(errno,
                                 _("Failed to open config space file '%s'"),
                                 dev->path);
        } else {
            char ebuf[1024];
            VIR_WARN("Failed to open config space file '%s': %s",
                     dev->path, virStrerror(errno, ebuf, sizeof(ebuf)));
        }
311 312
        return -1;
    }
313

314
    VIR_DEBUG("%s %s: opened %s", dev->id, dev->name, dev->path);
315
    return fd;
316 317
}

318
static void
319
virPCIDeviceConfigClose(virPCIDevicePtr dev, int cfgfd)
320
{
321 322 323 324 325
    if (VIR_CLOSE(cfgfd) < 0) {
        char ebuf[1024];
        VIR_WARN("Failed to close config space file '%s': %s",
                 dev->path, virStrerror(errno, ebuf, sizeof(ebuf)));
    }
326 327
}

328

329
static int
330 331
virPCIDeviceRead(virPCIDevicePtr dev,
                 int cfgfd,
332
                 unsigned int pos,
333
                 uint8_t *buf,
334
                 unsigned int buflen)
335 336 337
{
    memset(buf, 0, buflen);

338 339
    if (lseek(cfgfd, pos, SEEK_SET) != pos ||
        saferead(cfgfd, buf, buflen) != buflen) {
340
        char ebuf[1024];
341
        VIR_WARN("Failed to read from '%s' : %s", dev->path,
342 343 344 345 346 347 348
                 virStrerror(errno, ebuf, sizeof(ebuf)));
        return -1;
    }
    return 0;
}

static uint8_t
349
virPCIDeviceRead8(virPCIDevicePtr dev, int cfgfd, unsigned int pos)
350 351
{
    uint8_t buf;
352
    virPCIDeviceRead(dev, cfgfd, pos, &buf, sizeof(buf));
353 354 355 356
    return buf;
}

static uint16_t
357
virPCIDeviceRead16(virPCIDevicePtr dev, int cfgfd, unsigned int pos)
358 359
{
    uint8_t buf[2];
360
    virPCIDeviceRead(dev, cfgfd, pos, &buf[0], sizeof(buf));
361 362 363 364
    return (buf[0] << 0) | (buf[1] << 8);
}

static uint32_t
365
virPCIDeviceRead32(virPCIDevicePtr dev, int cfgfd, unsigned int pos)
366 367
{
    uint8_t buf[4];
368
    virPCIDeviceRead(dev, cfgfd, pos, &buf[0], sizeof(buf));
369 370 371
    return (buf[0] << 0) | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
}

372 373 374 375 376 377 378 379
/* Read the "class" sysfs attribute of @dev and store in @device_class
 * the 16 bits that remain after dropping the low 8 bits of the parsed
 * hexadecimal value.
 *
 * Returns 0 on success, -1 if the path cannot be built, the file cannot
 * be read, or its content does not parse as a hexadecimal number.
 */
static int
virPCIDeviceReadClass(virPCIDevicePtr dev, uint16_t *device_class)
{
    char *class_file;
    char *contents = NULL;
    unsigned int parsed;
    int rc = -1;

    class_file = virPCIFile(dev->name, "class");
    if (class_file == NULL)
        return rc;

    /* class string is '0xNNNNNN\n' ... i.e. 9 bytes */
    if (virFileReadAll(class_file, 9, &contents) < 0)
        goto cleanup;

    /* cut the string after the eight "0xNNNNNN" characters */
    contents[8] = '\0';
    if (virStrToLong_ui(contents, NULL, 16, &parsed) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unusual value in %s/devices/%s/class: %s"),
                       PCI_SYSFS, dev->name, contents);
        goto cleanup;
    }

    *device_class = (parsed >> 8) & 0xFFFF;
    rc = 0;

 cleanup:
    VIR_FREE(contents);
    VIR_FREE(class_file);
    return rc;
}

403
static int
404 405
virPCIDeviceWrite(virPCIDevicePtr dev,
                  int cfgfd,
406
                  unsigned int pos,
407
                  uint8_t *buf,
408
                  unsigned int buflen)
409
{
410 411
    if (lseek(cfgfd, pos, SEEK_SET) != pos ||
        safewrite(cfgfd, buf, buflen) != buflen) {
412
        char ebuf[1024];
413
        VIR_WARN("Failed to write to '%s' : %s", dev->path,
414 415 416 417 418 419 420
                 virStrerror(errno, ebuf, sizeof(ebuf)));
        return -1;
    }
    return 0;
}

static void
421
virPCIDeviceWrite16(virPCIDevicePtr dev, int cfgfd, unsigned int pos, uint16_t val)
422 423
{
    uint8_t buf[2] = { (val >> 0), (val >> 8) };
424
    virPCIDeviceWrite(dev, cfgfd, pos, &buf[0], sizeof(buf));
425 426 427
}

static void
428
virPCIDeviceWrite32(virPCIDevicePtr dev, int cfgfd, unsigned int pos, uint32_t val)
429
{
430
    uint8_t buf[4] = { (val >> 0), (val >> 8), (val >> 16), (val >> 24) };
431
    virPCIDeviceWrite(dev, cfgfd, pos, &buf[0], sizeof(buf));
432 433
}

E
Eric Blake 已提交
434 435
typedef int (*virPCIDeviceIterPredicate)(virPCIDevicePtr, virPCIDevicePtr,
                                         void *);
436 437 438 439 440 441 442

/* Iterate over available PCI devices calling @predicate
 * to compare each one to @dev.
 * Return -1 on error since we don't want to assume it is
 * safe to reset if there is an error.
 */
static int
443 444 445 446
virPCIDeviceIterDevices(virPCIDeviceIterPredicate predicate,
                        virPCIDevicePtr dev,
                        virPCIDevicePtr *matched,
                        void *data)
447 448 449
{
    DIR *dir;
    struct dirent *entry;
450
    int ret = 0;
451
    int rc;
452 453 454 455 456 457 458

    *matched = NULL;

    VIR_DEBUG("%s %s: iterating over " PCI_SYSFS "devices", dev->id, dev->name);

    dir = opendir(PCI_SYSFS "devices");
    if (!dir) {
459
        VIR_WARN("Failed to open " PCI_SYSFS "devices");
460 461 462
        return -1;
    }

E
Eric Blake 已提交
463
    while ((ret = virDirRead(dir, &entry, PCI_SYSFS "devices")) > 0) {
464
        unsigned int domain, bus, slot, function;
465
        virPCIDevicePtr check;
466
        char *tmp;
467 468 469 470 471

        /* Ignore '.' and '..' */
        if (entry->d_name[0] == '.')
            continue;

472 473 474 475 476 477 478 479 480
        /* expected format: <domain>:<bus>:<slot>.<function> */
        if (/* domain */
            virStrToLong_ui(entry->d_name, &tmp, 16, &domain) < 0 || *tmp != ':' ||
            /* bus */
            virStrToLong_ui(tmp + 1, &tmp, 16, &bus) < 0 || *tmp != ':' ||
            /* slot */
            virStrToLong_ui(tmp + 1, &tmp, 16, &slot) < 0 || *tmp != '.' ||
            /* function */
            virStrToLong_ui(tmp + 1, NULL, 16, &function) < 0) {
481 482 483 484
            VIR_WARN("Unusual entry in " PCI_SYSFS "devices: %s", entry->d_name);
            continue;
        }

485
        check = virPCIDeviceNew(domain, bus, slot, function);
486
        if (!check) {
487 488 489
            ret = -1;
            break;
        }
490

491 492 493
        rc = predicate(dev, check, data);
        if (rc < 0) {
            /* the predicate returned an error, bail */
494
            virPCIDeviceFree(check);
495 496
            ret = -1;
            break;
497
        } else if (rc == 1) {
498 499
            VIR_DEBUG("%s %s: iter matched on %s", dev->id, dev->name, check->name);
            *matched = check;
500
            ret = 1;
501 502
            break;
        }
503

504
        virPCIDeviceFree(check);
505 506
    }
    closedir(dir);
507
    return ret;
508 509 510
}

static uint8_t
511 512 513
virPCIDeviceFindCapabilityOffset(virPCIDevicePtr dev,
                                 int cfgfd,
                                 unsigned int capability)
514 515 516 517
{
    uint16_t status;
    uint8_t pos;

518
    status = virPCIDeviceRead16(dev, cfgfd, PCI_STATUS);
519 520 521
    if (!(status & PCI_STATUS_CAP_LIST))
        return 0;

522
    pos = virPCIDeviceRead8(dev, cfgfd, PCI_CAPABILITY_LIST);
523 524 525 526 527 528 529 530 531

    /* Zero indicates last capability, capabilities can't
     * be in the config space header and 0xff is returned
     * by the kernel if we don't have access to this region
     *
     * Note: we're not handling loops or extended
     * capabilities here.
     */
    while (pos >= PCI_CONF_HEADER_LEN && pos != 0xff) {
532
        uint8_t capid = virPCIDeviceRead8(dev, cfgfd, pos);
533 534 535 536 537 538
        if (capid == capability) {
            VIR_DEBUG("%s %s: found cap 0x%.2x at 0x%.2x",
                      dev->id, dev->name, capability, pos);
            return pos;
        }

539
        pos = virPCIDeviceRead8(dev, cfgfd, pos + 1);
540 541 542 543 544 545 546
    }

    VIR_DEBUG("%s %s: failed to find cap 0x%.2x", dev->id, dev->name, capability);

    return 0;
}

J
Jiri Denemark 已提交
547
static unsigned int
548 549
virPCIDeviceFindExtendedCapabilityOffset(virPCIDevicePtr dev,
                                         int cfgfd,
550
                                         unsigned int capability)
J
Jiri Denemark 已提交
551 552 553 554 555 556 557 558 559 560
{
    int ttl;
    unsigned int pos;
    uint32_t header;

    /* minimum 8 bytes per capability */
    ttl = (PCI_EXT_CAP_LIMIT - PCI_EXT_CAP_BASE) / 8;
    pos = PCI_EXT_CAP_BASE;

    while (ttl > 0 && pos >= PCI_EXT_CAP_BASE) {
561
        header = virPCIDeviceRead32(dev, cfgfd, pos);
J
Jiri Denemark 已提交
562 563 564 565 566 567 568 569 570 571 572

        if ((header & PCI_EXT_CAP_ID_MASK) == capability)
            return pos;

        pos = (header >> PCI_EXT_CAP_OFFSET_SHIFT) & PCI_EXT_CAP_OFFSET_MASK;
        ttl--;
    }

    return 0;
}

573 574 575 576
/* detects whether this device has FLR.  Returns 0 if the device does
 * not have FLR, 1 if it does, and -1 on error
 */
static int
577
virPCIDeviceDetectFunctionLevelReset(virPCIDevicePtr dev, int cfgfd)
578
{
M
Mark McLoughlin 已提交
579
    uint32_t caps;
580
    uint8_t pos;
581 582
    char *path;
    int found;
583 584 585 586 587 588 589 590

    /* The PCIe Function Level Reset capability allows
     * individual device functions to be reset without
     * affecting any other functions on the device or
     * any other devices on the bus. This is only common
     * on SR-IOV NICs at the moment.
     */
    if (dev->pcie_cap_pos) {
591
        caps = virPCIDeviceRead32(dev, cfgfd, dev->pcie_cap_pos + PCI_EXP_DEVCAP);
592 593 594 595 596 597 598 599 600 601
        if (caps & PCI_EXP_DEVCAP_FLR) {
            VIR_DEBUG("%s %s: detected PCIe FLR capability", dev->id, dev->name);
            return 1;
        }
    }

    /* The PCI AF Function Level Reset capability is
     * the same thing, except for conventional PCI
     * devices. This is not common yet.
     */
602
    pos = virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_AF);
603
    if (pos) {
604
        caps = virPCIDeviceRead16(dev, cfgfd, pos + PCI_AF_CAP);
605 606 607 608 609 610
        if (caps & PCI_AF_CAP_FLR) {
            VIR_DEBUG("%s %s: detected PCI FLR capability", dev->id, dev->name);
            return 1;
        }
    }

611 612 613 614 615 616
    /* there are some buggy devices that do support FLR, but forget to
     * advertise that fact in their capabilities.  However, FLR is *required*
     * to be present for virtual functions (VFs), so if we see that this
     * device is a VF, we just assume FLR works
     */

617
    if (virAsprintf(&path, PCI_SYSFS "devices/%s/physfn", dev->name) < 0)
618 619 620 621 622 623 624 625 626 627
        return -1;

    found = virFileExists(path);
    VIR_FREE(path);
    if (found) {
        VIR_DEBUG("%s %s: buggy device didn't advertise FLR, but is a VF; forcing flr on",
                  dev->id, dev->name);
        return 1;
    }

628 629 630 631 632 633 634 635 636
    VIR_DEBUG("%s %s: no FLR capability found", dev->id, dev->name);

    return 0;
}

/* Require the device has the PCI Power Management capability
 * and that a D3hot->D0 transition will results in a full
 * internal reset, not just a soft reset.
 */
637
static unsigned int
638
virPCIDeviceDetectPowerManagementReset(virPCIDevicePtr dev, int cfgfd)
639 640 641 642 643
{
    if (dev->pci_pm_cap_pos) {
        uint32_t ctl;

        /* require the NO_SOFT_RESET bit is clear */
644
        ctl = virPCIDeviceRead32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL);
645 646 647 648 649 650 651 652 653 654 655
        if (!(ctl & PCI_PM_CTRL_NO_SOFT_RESET)) {
            VIR_DEBUG("%s %s: detected PM reset capability", dev->id, dev->name);
            return 1;
        }
    }

    VIR_DEBUG("%s %s: no PM reset capability found", dev->id, dev->name);

    return 0;
}

656
/* Any active devices on the same domain/bus ? */
657
static int
658
virPCIDeviceSharesBusWithActive(virPCIDevicePtr dev, virPCIDevicePtr check, void *data)
659
{
660
    virPCIDeviceList *inactiveDevs = data;
661

662
    /* Different domain, different bus, or simply identical device */
663 664 665 666
    if (dev->address.domain != check->address.domain ||
        dev->address.bus != check->address.bus ||
        (dev->address.slot == check->address.slot &&
         dev->address.function == check->address.function))
667 668
        return 0;

669
    /* same bus, but inactive, i.e. about to be assigned to guest */
670
    if (inactiveDevs && virPCIDeviceListFind(inactiveDevs, check))
671
        return 0;
672

673
    return 1;
674 675
}

676 677 678
/* Search for an active device on the same domain/bus as @dev, treating
 * members of @inactiveDevs (may be NULL) as not active.
 *
 * Returns the first such device found by the iteration, or NULL if
 * there is none or the iteration failed.
 */
static virPCIDevicePtr
virPCIDeviceBusContainsActiveDevices(virPCIDevicePtr dev,
                                     virPCIDeviceList *inactiveDevs)
{
    virPCIDevicePtr found = NULL;
    int rc;

    rc = virPCIDeviceIterDevices(virPCIDeviceSharesBusWithActive,
                                 dev, &found, inactiveDevs);

    return rc < 0 ? NULL : found;
}

/* Is @check the parent of @dev ? */
688
static int
689
virPCIDeviceIsParent(virPCIDevicePtr dev, virPCIDevicePtr check, void *data)
690 691 692
{
    uint16_t device_class;
    uint8_t header_type, secondary, subordinate;
693
    virPCIDevicePtr *best = data;
694 695
    int ret = 0;
    int fd;
696

697
    if (dev->address.domain != check->address.domain)
698 699
        return 0;

700
    if ((fd = virPCIDeviceConfigOpen(check, false)) < 0)
701 702
        return 0;

703
    /* Is it a bridge? */
704 705
    ret = virPCIDeviceReadClass(check, &device_class);
    if (ret < 0 || device_class != PCI_CLASS_BRIDGE_PCI)
706
        goto cleanup;
707 708

    /* Is it a plane? */
709
    header_type = virPCIDeviceRead8(check, fd, PCI_HEADER_TYPE);
710
    if ((header_type & PCI_HEADER_TYPE_MASK) != PCI_HEADER_TYPE_BRIDGE)
711
        goto cleanup;
712

713 714
    secondary   = virPCIDeviceRead8(check, fd, PCI_SECONDARY_BUS);
    subordinate = virPCIDeviceRead8(check, fd, PCI_SUBORDINATE_BUS);
715

716
    VIR_DEBUG("%s %s: found parent device %s", dev->id, dev->name, check->name);
717

718 719 720
    /* if the secondary bus exactly equals the device's bus, then we found
     * the direct parent.  No further work is necessary
     */
721
    if (dev->address.bus == secondary) {
722 723 724
        ret = 1;
        goto cleanup;
    }
725

726
    /* otherwise, SRIOV allows VFs to be on different buses than their PFs.
727 728 729
     * In this case, what we need to do is look for the "best" match; i.e.
     * the most restrictive match that still satisfies all of the conditions.
     */
730
    if (dev->address.bus > secondary && dev->address.bus <= subordinate) {
731
        if (*best == NULL) {
732 733 734 735
            *best = virPCIDeviceNew(check->address.domain,
                                    check->address.bus,
                                    check->address.slot,
                                    check->address.function);
736 737 738 739 740
            if (*best == NULL) {
                ret = -1;
                goto cleanup;
            }
        } else {
741 742 743 744
            /* OK, we had already recorded a previous "best" match for the
             * parent.  See if the current device is more restrictive than the
             * best, and if so, make it the new best
             */
745 746 747
            int bestfd;
            uint8_t best_secondary;

748
            if ((bestfd = virPCIDeviceConfigOpen(*best, false)) < 0)
749
                goto cleanup;
750 751
            best_secondary = virPCIDeviceRead8(*best, bestfd, PCI_SECONDARY_BUS);
            virPCIDeviceConfigClose(*best, bestfd);
752 753

            if (secondary > best_secondary) {
754
                virPCIDeviceFree(*best);
755 756 757 758
                *best = virPCIDeviceNew(check->address.domain,
                                        check->address.bus,
                                        check->address.slot,
                                        check->address.function);
759 760 761 762
                if (*best == NULL) {
                    ret = -1;
                    goto cleanup;
                }
763 764 765 766
            }
        }
    }

767
 cleanup:
768
    virPCIDeviceConfigClose(check, fd);
769
    return ret;
770 771
}

772
static int
773
virPCIDeviceGetParent(virPCIDevicePtr dev, virPCIDevicePtr *parent)
774
{
775
    virPCIDevicePtr best = NULL;
776 777 778
    int ret;

    *parent = NULL;
779
    ret = virPCIDeviceIterDevices(virPCIDeviceIsParent, dev, parent, &best);
780
    if (ret == 1)
781
        virPCIDeviceFree(best);
782 783 784
    else if (ret == 0)
        *parent = best;
    return ret;
785 786 787 788 789 790
}

/* Secondary Bus Reset is our sledgehammer - it resets all
 * devices behind a bus.
 */
static int
791 792 793
virPCIDeviceTrySecondaryBusReset(virPCIDevicePtr dev,
                                 int cfgfd,
                                 virPCIDeviceList *inactiveDevs)
794
{
795
    virPCIDevicePtr parent, conflict;
796 797 798
    uint8_t config_space[PCI_CONF_LEN];
    uint16_t ctl;
    int ret = -1;
799
    int parentfd;
800

801 802 803
    /* Refuse to do a secondary bus reset if there are other
     * devices/functions behind the bus are used by the host
     * or other guests.
804
     */
805
    if ((conflict = virPCIDeviceBusContainsActiveDevices(dev, inactiveDevs))) {
806
        virReportError(VIR_ERR_INTERNAL_ERROR,
807 808
                       _("Active %s devices on bus with %s, not doing bus reset"),
                       conflict->name, dev->name);
W
Wang Rui 已提交
809
        virPCIDeviceFree(conflict);
810 811 812 813
        return -1;
    }

    /* Find the parent bus */
814
    if (virPCIDeviceGetParent(dev, &parent) < 0)
815
        return -1;
816
    if (!parent) {
817
        virReportError(VIR_ERR_INTERNAL_ERROR,
818 819
                       _("Failed to find parent device for %s"),
                       dev->name);
820 821
        return -1;
    }
822
    if ((parentfd = virPCIDeviceConfigOpen(parent, true)) < 0)
823
        goto out;
824 825 826 827 828 829 830

    VIR_DEBUG("%s %s: doing a secondary bus reset", dev->id, dev->name);

    /* Save and restore the device's config space; we only do this
     * for the supplied device since we refuse to do a reset if there
     * are multiple devices/functions
     */
831
    if (virPCIDeviceRead(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
832
        virReportError(VIR_ERR_INTERNAL_ERROR,
833
                       _("Failed to read PCI config space for %s"),
834
                       dev->name);
835 836 837 838 839 840
        goto out;
    }

    /* Read the control register, set the reset flag, wait 200ms,
     * unset the reset flag and wait 200ms.
     */
841
    ctl = virPCIDeviceRead16(dev, cfgfd, PCI_BRIDGE_CONTROL);
842

843 844
    virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL,
                        ctl | PCI_BRIDGE_CTL_RESET);
845 846 847

    usleep(200 * 1000); /* sleep 200ms */

848
    virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL, ctl);
849 850 851

    usleep(200 * 1000); /* sleep 200ms */

852
    if (virPCIDeviceWrite(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
853
        virReportError(VIR_ERR_INTERNAL_ERROR,
854 855 856 857
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        goto out;
    }
858
    ret = 0;
859

860
 out:
861 862
    virPCIDeviceConfigClose(parent, parentfd);
    virPCIDeviceFree(parent);
863 864 865 866 867 868 869 870
    return ret;
}

/* Power management reset attempts to reset a device using a
 * D-state transition from D3hot to D0. Note, in detect_pm_reset()
 * above we require the device supports a full internal reset.
 */
static int
871
virPCIDeviceTryPowerManagementReset(virPCIDevicePtr dev, int cfgfd)
872 873 874 875 876 877 878 879
{
    uint8_t config_space[PCI_CONF_LEN];
    uint32_t ctl;

    if (!dev->pci_pm_cap_pos)
        return -1;

    /* Save and restore the device's config space. */
880
    if (virPCIDeviceRead(dev, cfgfd, 0, &config_space[0], PCI_CONF_LEN) < 0) {
881
        virReportError(VIR_ERR_INTERNAL_ERROR,
882
                       _("Failed to read PCI config space for %s"),
883
                       dev->name);
884 885 886 887 888
        return -1;
    }

    VIR_DEBUG("%s %s: doing a power management reset", dev->id, dev->name);

889
    ctl = virPCIDeviceRead32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL);
890 891
    ctl &= ~PCI_PM_CTRL_STATE_MASK;

892 893
    virPCIDeviceWrite32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL,
                        ctl | PCI_PM_CTRL_STATE_D3hot);
894 895 896

    usleep(10 * 1000); /* sleep 10ms */

897 898
    virPCIDeviceWrite32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL,
                        ctl | PCI_PM_CTRL_STATE_D0);
899 900 901

    usleep(10 * 1000); /* sleep 10ms */

902
    if (virPCIDeviceWrite(dev, cfgfd, 0, &config_space[0], PCI_CONF_LEN) < 0) {
903
        virReportError(VIR_ERR_INTERNAL_ERROR,
904 905 906 907
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        return -1;
    }
908 909 910 911 912

    return 0;
}

static int
913
virPCIDeviceInit(virPCIDevicePtr dev, int cfgfd)
914
{
915 916
    int flr;

917 918 919
    dev->pcie_cap_pos   = virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_EXP);
    dev->pci_pm_cap_pos = virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_PM);
    flr = virPCIDeviceDetectFunctionLevelReset(dev, cfgfd);
920
    if (flr < 0)
921
        return flr;
922 923
    dev->has_flr        = !!flr;
    dev->has_pm_reset   = !!virPCIDeviceDetectPowerManagementReset(dev, cfgfd);
924

925 926 927 928
    return 0;
}

int
929 930 931
virPCIDeviceReset(virPCIDevicePtr dev,
                  virPCIDeviceList *activeDevs,
                  virPCIDeviceList *inactiveDevs)
932
{
933 934
    char *drvPath = NULL;
    char *drvName = NULL;
935
    int ret = -1;
936
    int fd = -1;
937

938
    if (activeDevs && virPCIDeviceListFind(activeDevs, dev)) {
939
        virReportError(VIR_ERR_INTERNAL_ERROR,
940 941 942 943
                       _("Not resetting active device %s"), dev->name);
        return -1;
    }

944 945 946 947 948 949 950 951
    /* If the device is currently bound to vfio-pci, ignore all
     * requests to reset it, since the vfio-pci driver will always
     * reset it whenever appropriate, so doing it ourselves would just
     * be redundant.
     */
    if (virPCIDeviceGetDriverPathAndName(dev, &drvPath, &drvName) < 0)
        goto cleanup;

952
    if (virPCIStubDriverTypeFromString(drvName) == VIR_PCI_STUB_DRIVER_VFIO) {
953 954 955 956 957 958 959
        VIR_DEBUG("Device %s is bound to vfio-pci - skip reset",
                  dev->name);
        ret = 0;
        goto cleanup;
    }
    VIR_DEBUG("Resetting device %s", dev->name);

960
    if ((fd = virPCIDeviceConfigOpen(dev, true)) < 0)
961
        goto cleanup;
962

963
    if (virPCIDeviceInit(dev, fd) < 0)
964 965
        goto cleanup;

966 967 968
    /* KVM will perform FLR when starting and stopping
     * a guest, so there is no need for us to do it here.
     */
969 970 971 972
    if (dev->has_flr) {
        ret = 0;
        goto cleanup;
    }
973

974 975 976 977 978
    /* If the device supports PCI power management reset,
     * that's the next best thing because it only resets
     * the function, not the whole device.
     */
    if (dev->has_pm_reset)
979
        ret = virPCIDeviceTryPowerManagementReset(dev, fd);
980

981
    /* Bus reset is not an option with the root bus */
982
    if (ret < 0 && dev->address.bus != 0)
983
        ret = virPCIDeviceTrySecondaryBusReset(dev, fd, inactiveDevs);
984

985 986
    if (ret < 0) {
        virErrorPtr err = virGetLastError();
987
        virReportError(VIR_ERR_INTERNAL_ERROR,
988 989
                       _("Unable to reset PCI device %s: %s"),
                       dev->name,
990 991
                       err ? err->message :
                       _("no FLR, PM reset or bus reset available"));
992 993
    }

994
 cleanup:
995 996
    VIR_FREE(drvPath);
    VIR_FREE(drvName);
997
    virPCIDeviceConfigClose(dev, fd);
998 999 1000
    return ret;
}

1001

1002
static int
1003
virPCIProbeStubDriver(virPCIStubDriver driver)
1004
{
1005
    const char *drvname = NULL;
1006
    char *drvpath = NULL;
1007
    bool probed = false;
1008

1009 1010 1011 1012 1013 1014 1015 1016
    if (driver == VIR_PCI_STUB_DRIVER_NONE ||
        !(drvname = virPCIStubDriverTypeToString(driver))) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       "%s",
                       _("Attempting to use unknown stub driver"));
        return -1;
    }

1017
 recheck:
1018
    if ((drvpath = virPCIDriverDir(drvname)) && virFileExists(drvpath)) {
1019
        /* driver already loaded, return */
1020
        VIR_FREE(drvpath);
1021
        return 0;
1022 1023 1024
    }

    VIR_FREE(drvpath);
1025 1026

    if (!probed) {
1027
        char *errbuf = NULL;
1028
        probed = true;
1029 1030
        if ((errbuf = virKModLoad(drvname, true))) {
            VIR_WARN("failed to load driver %s: %s", drvname, errbuf);
1031 1032
            VIR_FREE(errbuf);
            goto cleanup;
1033
        }
1034 1035

        goto recheck;
1036 1037
    }

1038
 cleanup:
1039 1040 1041
    /* If we know failure was because of blacklist, let's report that;
     * otherwise, report a more generic failure message
     */
1042
    if (virKModIsBlacklisted(drvname)) {
1043 1044 1045
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to load PCI stub module %s: "
                         "administratively prohibited"),
1046
                       drvname);
1047 1048 1049
    } else {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to load PCI stub module %s"),
1050
                       drvname);
1051 1052
    }

1053
    return -1;
1054 1055
}

1056
int
1057
virPCIDeviceUnbind(virPCIDevicePtr dev)
1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072
{
    char *path = NULL;
    char *drvpath = NULL;
    char *driver = NULL;
    int ret = -1;

    if (virPCIDeviceGetDriverPathAndName(dev, &drvpath, &driver) < 0)
        goto cleanup;

    if (!driver) {
        /* The device is not bound to any driver */
        ret = 0;
        goto cleanup;
    }

1073
    if (!(path = virPCIFile(dev->name, "driver/unbind")))
1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085
        goto cleanup;

    if (virFileExists(path)) {
        if (virFileWriteStr(path, dev->name, 0) < 0) {
            virReportSystemError(errno,
                                 _("Failed to unbind PCI device '%s' from %s"),
                                 dev->name, driver);
            goto cleanup;
        }
    }

    ret = 0;
1086
 cleanup:
1087 1088 1089 1090 1091 1092
    VIR_FREE(path);
    VIR_FREE(drvpath);
    VIR_FREE(driver);
    return ret;
}

1093
static int
1094
virPCIDeviceUnbindFromStub(virPCIDevicePtr dev)
1095 1096 1097 1098
{
    int result = -1;
    char *drvdir = NULL;
    char *path = NULL;
1099
    char *driver = NULL;
1100

1101 1102 1103
    /* If the device is currently bound to one of the "well known"
     * stub drivers, then unbind it, otherwise ignore it.
     */
L
Laine Stump 已提交
1104
    if (virPCIDeviceGetDriverPathAndName(dev, &drvdir, &driver) < 0)
1105
        goto cleanup;
E
Eric Blake 已提交
1106

1107 1108
    if (!driver) {
        /* The device is not bound to any driver and we are almost done. */
1109
        VIR_DEBUG("PCI device %s is not bound to any driver", dev->name);
1110 1111 1112
        goto reprobe;
    }

1113 1114
    if (!dev->unbind_from_stub) {
        VIR_DEBUG("Unbind from stub skipped for PCI device %s", dev->name);
1115
        goto remove_slot;
1116
    }
1117

1118
    /* If the device isn't bound to a known stub, skip the unbind. */
1119
    if (virPCIStubDriverTypeFromString(driver) < 0 ||
1120 1121 1122
        virPCIStubDriverTypeFromString(driver) == VIR_PCI_STUB_DRIVER_NONE) {
        VIR_DEBUG("Unbind from stub skipped for PCI device %s because of "
                  "unknown stub driver", dev->name);
1123
        goto remove_slot;
1124
    }
1125

1126 1127
    VIR_DEBUG("Unbinding PCI device %s from stub driver %s",
              dev->name, driver);
1128

1129
    if (virPCIDeviceUnbind(dev) < 0)
1130
        goto cleanup;
1131
    dev->unbind_from_stub = false;
1132

1133
 remove_slot:
1134 1135
    if (!dev->remove_slot) {
        VIR_DEBUG("Slot removal skipped for PCI device %s", dev->name);
1136
        goto reprobe;
1137 1138 1139
    }

    VIR_DEBUG("Removing slot for PCI device %s", dev->name);
1140 1141

    /* Xen's pciback.ko wants you to use remove_slot on the specific device */
1142
    if (!(path = virPCIDriverFile(driver, "remove_slot")))
1143 1144 1145 1146
        goto cleanup;

    if (virFileExists(path) && virFileWriteStr(path, dev->name, 0) < 0) {
        virReportSystemError(errno,
1147
                             _("Failed to remove slot for PCI device '%s' from %s"),
1148 1149 1150
                             dev->name, driver);
        goto cleanup;
    }
1151
    dev->remove_slot = false;
1152

1153
 reprobe:
1154
    if (!dev->reprobe) {
1155
        VIR_DEBUG("Reprobe skipped for PCI device %s", dev->name);
1156 1157 1158
        result = 0;
        goto cleanup;
    }
1159

1160 1161
    VIR_DEBUG("Reprobing for PCI device %s", dev->name);

1162 1163 1164 1165 1166
    /* Trigger a re-probe of the device is not in the stub's dynamic
     * ID table. If the stub is available, but 'remove_id' isn't
     * available, then re-probing would just cause the device to be
     * re-bound to the stub.
     */
1167 1168
    VIR_FREE(path);
    if (driver && !(path = virPCIDriverFile(driver, "remove_id")))
1169 1170
        goto cleanup;

1171
    if (!driver || !virFileExists(drvdir) || virFileExists(path)) {
1172 1173 1174 1175 1176 1177 1178 1179 1180 1181
        if (virFileWriteStr(PCI_SYSFS "drivers_probe", dev->name, 0) < 0) {
            virReportSystemError(errno,
                                 _("Failed to trigger a re-probe for PCI device '%s'"),
                                 dev->name);
            goto cleanup;
        }
    }

    result = 0;

1182
 cleanup:
1183
    /* do not do it again */
1184 1185 1186
    dev->unbind_from_stub = false;
    dev->remove_slot = false;
    dev->reprobe = false;
1187

1188 1189
    VIR_FREE(drvdir);
    VIR_FREE(path);
1190
    VIR_FREE(driver);
1191 1192 1193 1194

    return result;
}

1195 1196

static int
1197
virPCIDeviceBindToStub(virPCIDevicePtr dev)
1198
{
1199
    int result = -1;
E
Eric Blake 已提交
1200
    bool reprobe = false;
1201 1202 1203
    char *stubDriverPath = NULL;
    char *driverLink = NULL;
    char *path = NULL; /* reused for different purposes */
1204
    const char *stubDriverName = NULL;
J
Jiri Denemark 已提交
1205
    virErrorPtr err = NULL;
1206

1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219
    /* Check the device is configured to use one of the known stub drivers */
    if (dev->stubDriver == VIR_PCI_STUB_DRIVER_NONE) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("No stub driver configured for PCI device %s"),
                       dev->name);
        return -1;
    } else if (!(stubDriverName = virPCIStubDriverTypeToString(dev->stubDriver))) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unknown stub driver configured for PCI device %s"),
                       dev->name);
        return -1;
    }

1220
    if (!(stubDriverPath = virPCIDriverDir(stubDriverName))  ||
1221
        !(driverLink = virPCIFile(dev->name, "driver")))
1222 1223
        goto cleanup;

1224 1225 1226 1227 1228
    if (virFileExists(driverLink)) {
        if (virFileLinkPointsTo(driverLink, stubDriverPath)) {
            /* The device is already bound to the correct driver */
            VIR_DEBUG("Device %s is already bound to %s",
                      dev->name, stubDriverName);
1229 1230 1231
            result = 0;
            goto cleanup;
        }
1232
        reprobe = true;
1233
    }
1234 1235 1236 1237 1238 1239 1240 1241 1242

    /* Add the PCI device ID to the stub's dynamic ID table;
     * this is needed to allow us to bind the device to the stub.
     * Note: if the device is not currently bound to any driver,
     * stub will immediately be bound to the device. Also, note
     * that if a new device with this ID is hotplugged, or if a probe
     * is triggered for such a device, it will also be immediately
     * bound by the stub.
     */
1243
    if (!(path = virPCIDriverFile(stubDriverName, "new_id")))
1244
        goto cleanup;
1245

1246
    if (virFileWriteStr(path, dev->id, 0) < 0) {
1247
        virReportSystemError(errno,
1248
                             _("Failed to add PCI device ID '%s' to %s"),
1249
                             dev->id, stubDriverName);
1250
        goto cleanup;
1251 1252
    }

1253
    /* check whether the device is bound to pci-stub when we write dev->id to
1254
     * ${stubDriver}/new_id.
1255
     */
1256
    if (virFileLinkPointsTo(driverLink, stubDriverPath)) {
1257 1258
        dev->unbind_from_stub = true;
        dev->remove_slot = true;
J
Jiri Denemark 已提交
1259
        result = 0;
1260 1261 1262
        goto remove_id;
    }

1263
    if (virPCIDeviceUnbind(dev) < 0)
J
Jiri Denemark 已提交
1264
        goto remove_id;
1265

1266 1267 1268
    /* If the device was bound to a driver we'll need to reprobe later */
    dev->reprobe = reprobe;

1269 1270
    /* If the device isn't already bound to pci-stub, try binding it now.
     */
1271
    if (!virFileLinkPointsTo(driverLink, stubDriverPath)) {
1272
        /* Xen's pciback.ko wants you to use new_slot first */
1273 1274
        VIR_FREE(path);
        if (!(path = virPCIDriverFile(stubDriverName, "new_slot")))
1275
            goto remove_id;
1276

1277
        if (virFileExists(path) && virFileWriteStr(path, dev->name, 0) < 0) {
1278
            virReportSystemError(errno,
1279 1280 1281
                                 _("Failed to add slot for "
                                   "PCI device '%s' to %s"),
                                 dev->name, stubDriverName);
1282
            goto remove_id;
1283
        }
1284
        dev->remove_slot = true;
1285

1286 1287
        VIR_FREE(path);
        if (!(path = virPCIDriverFile(stubDriverName, "bind")))
1288
            goto remove_id;
1289

1290
        if (virFileWriteStr(path, dev->name, 0) < 0) {
1291
            virReportSystemError(errno,
1292
                                 _("Failed to bind PCI device '%s' to %s"),
1293
                                 dev->name, stubDriverName);
1294
            goto remove_id;
1295
        }
1296
        dev->unbind_from_stub = true;
1297 1298
    }

J
Jiri Denemark 已提交
1299 1300
    result = 0;

1301
 remove_id:
J
Jiri Denemark 已提交
1302 1303
    err = virSaveLastError();

1304 1305 1306
    /* If 'remove_id' exists, remove the device id from pci-stub's dynamic
     * ID table so that 'drivers_probe' works below.
     */
1307 1308
    VIR_FREE(path);
    if (!(path = virPCIDriverFile(stubDriverName, "remove_id"))) {
E
Eric Blake 已提交
1309
        /* We do not remove PCI ID from pci-stub, and we cannot reprobe it */
1310 1311
        if (dev->reprobe) {
            VIR_WARN("Could not remove PCI ID '%s' from %s, and the device "
1312
                     "cannot be probed again.", dev->id, stubDriverName);
1313
        }
1314
        dev->reprobe = false;
J
Jiri Denemark 已提交
1315
        result = -1;
1316 1317 1318
        goto cleanup;
    }

1319
    if (virFileExists(path) && virFileWriteStr(path, dev->id, 0) < 0) {
1320
        virReportSystemError(errno,
1321
                             _("Failed to remove PCI ID '%s' from %s"),
1322
                             dev->id, stubDriverName);
1323

E
Eric Blake 已提交
1324
        /* remove PCI ID from pci-stub failed, and we cannot reprobe it */
1325 1326
        if (dev->reprobe) {
            VIR_WARN("Failed to remove PCI ID '%s' from %s, and the device "
1327
                     "cannot be probed again.", dev->id, stubDriverName);
1328
        }
1329
        dev->reprobe = false;
J
Jiri Denemark 已提交
1330
        result = -1;
1331
        goto cleanup;
1332 1333
    }

1334
 cleanup:
1335 1336
    VIR_FREE(stubDriverPath);
    VIR_FREE(driverLink);
1337 1338
    VIR_FREE(path);

1339
    if (result < 0)
J
Jiri Denemark 已提交
1340 1341 1342 1343 1344
        virPCIDeviceUnbindFromStub(dev);

    if (err)
        virSetError(err);
    virFreeError(err);
1345

1346
    return result;
1347 1348
}

1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366
/* virPCIDeviceDetach:
 *
 * Detach this device from the host driver, attach it to the stub
 * driver (previously set with virPCIDeviceSetStubDriver(), and add *a
 * copy* of the object to the inactiveDevs list (if provided). This
 * function will *never* consume dev, so the caller should free it.
 *
 * Returns 0 on success, -1 on failure (will fail if the device is
 * already in the activeDevs list, but will be a NOP if the device is
 * already bound to the stub).
 *
 * GENERAL NOTE: activeDevs should be a list of all PCI devices
 * currently in use by a domain. inactiveDevs is a list of all PCI
 * devices that libvirt has detached from the host driver + attached
 * to the stub driver, but hasn't yet assigned to a domain. Any device
 * that is still attached to its host driver should not be on either
 * list.
 */
1367
int
1368 1369
virPCIDeviceDetach(virPCIDevicePtr dev,
                   virPCIDeviceList *activeDevs,
1370
                   virPCIDeviceList *inactiveDevs)
1371
{
1372
    if (virPCIProbeStubDriver(dev->stubDriver) < 0)
1373 1374
        return -1;

1375
    if (activeDevs && virPCIDeviceListFind(activeDevs, dev)) {
1376
        virReportError(VIR_ERR_INTERNAL_ERROR,
1377 1378 1379 1380
                       _("Not detaching active device %s"), dev->name);
        return -1;
    }

1381
    if (virPCIDeviceBindToStub(dev) < 0)
1382 1383
        return -1;

1384 1385 1386
    /* Add *a copy of* the dev into list inactiveDevs, if
     * it's not already there.
     */
1387 1388 1389 1390
    if (inactiveDevs && !virPCIDeviceListFind(inactiveDevs, dev)) {
        VIR_DEBUG("Adding PCI device %s to inactive list", dev->name);
        if (virPCIDeviceListAddCopy(inactiveDevs, dev) < 0)
            return -1;
1391 1392 1393
    }

    return 0;
1394 1395 1396
}

int
1397 1398
virPCIDeviceReattach(virPCIDevicePtr dev,
                     virPCIDeviceListPtr activeDevs,
1399
                     virPCIDeviceListPtr inactiveDevs)
1400
{
1401
    if (activeDevs && virPCIDeviceListFind(activeDevs, dev)) {
1402
        virReportError(VIR_ERR_INTERNAL_ERROR,
1403 1404 1405 1406
                       _("Not reattaching active device %s"), dev->name);
        return -1;
    }

1407
    if (virPCIDeviceUnbindFromStub(dev) < 0)
1408 1409 1410
        return -1;

    /* Steal the dev from list inactiveDevs */
1411 1412
    if (inactiveDevs) {
        VIR_DEBUG("Removing PCI device %s from inactive list", dev->name);
1413
        virPCIDeviceListDel(inactiveDevs, dev);
1414
    }
1415 1416

    return 0;
1417 1418
}

1419 1420 1421 1422 1423
/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
 * the host when doing device passthrough.  This can lead to a race
 * condition where the hypervisor is still cleaning up the device while
 * libvirt is trying to re-attach it to the host device driver.  To avoid
 * this situation, we look through /proc/iomem, and if the hypervisor is
E
Eric Blake 已提交
1424 1425
 * still holding on to the bar (denoted by the string in the matcher
 * variable), then we can wait around a bit for that to clear up.
1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445
 *
 * A typical /proc/iomem looks like this (snipped for brevity):
 * 00010000-0008efff : System RAM
 * 0008f000-0008ffff : reserved
 * ...
 * 00100000-cc9fcfff : System RAM
 *   00200000-00483d3b : Kernel code
 *   00483d3c-005c88df : Kernel data
 * cc9fd000-ccc71fff : ACPI Non-volatile Storage
 * ...
 * d0200000-d02fffff : PCI Bus #05
 *   d0200000-d021ffff : 0000:05:00.0
 *     d0200000-d021ffff : e1000e
 *   d0220000-d023ffff : 0000:05:00.0
 *     d0220000-d023ffff : e1000e
 * ...
 * f0000000-f0003fff : 0000:00:1b.0
 *   f0000000-f0003fff : kvm_assigned_device
 *
 * Returns 0 if we are clear to continue, and 1 if the hypervisor is still
E
Eric Blake 已提交
1446
 * holding on to the resource.
1447 1448
 */
int
1449
virPCIDeviceWaitForCleanup(virPCIDevicePtr dev, const char *matcher)
1450 1451 1452
{
    FILE *fp;
    char line[160];
1453
    char *tmp;
1454
    unsigned long long start, end;
1455
    unsigned int domain, bus, slot, function;
1456
    bool in_matching_device;
1457 1458 1459 1460 1461 1462 1463 1464 1465
    int ret;
    size_t match_depth;

    fp = fopen("/proc/iomem", "r");
    if (!fp) {
        /* If we failed to open iomem, we just basically ignore the error.  The
         * unbind might succeed anyway, and besides, it's very likely we have
         * no way to report the error
         */
1466
        VIR_DEBUG("Failed to open /proc/iomem, trying to continue anyway");
1467 1468 1469 1470
        return 0;
    }

    ret = 0;
1471
    in_matching_device = false;
1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482
    match_depth = 0;
    while (fgets(line, sizeof(line), fp) != 0) {
        /* the logic here is a bit confusing.  For each line, we look to
         * see if it matches the domain:bus:slot.function we were given.
         * If this line matches the DBSF, then any subsequent lines indented
         * by 2 spaces are the PCI regions for this device.  It's also
         * possible that none of the PCI regions are currently mapped, in
         * which case we have no indented regions.  This code handles all
         * of these situations
         */
        if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) {
1483 1484 1485 1486 1487 1488
            /* expected format: <start>-<end> : <suffix> */
            if (/* start */
                virStrToLong_ull(line, &tmp, 16, &start) < 0 || *tmp != '-' ||
                /* end */
                virStrToLong_ull(tmp + 1, &tmp, 16, &end) < 0 ||
                (tmp = STRSKIP(tmp, " : ")) == NULL)
1489 1490
                continue;

1491
            if (STRPREFIX(tmp, matcher)) {
1492 1493 1494
                ret = 1;
                break;
            }
1495
        } else {
1496
            in_matching_device = false;
1497

1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511
            /* expected format: <start>-<end> : <domain>:<bus>:<slot>.<function> */
            if (/* start */
                virStrToLong_ull(line, &tmp, 16, &start) < 0 || *tmp != '-' ||
                /* end */
                virStrToLong_ull(tmp + 1, &tmp, 16, &end) < 0 ||
                (tmp = STRSKIP(tmp, " : ")) == NULL ||
                /* domain */
                virStrToLong_ui(tmp, &tmp, 16, &domain) < 0 || *tmp != ':' ||
                /* bus */
                virStrToLong_ui(tmp + 1, &tmp, 16, &bus) < 0 || *tmp != ':' ||
                /* slot */
                virStrToLong_ui(tmp + 1, &tmp, 16, &slot) < 0 || *tmp != '.' ||
                /* function */
                virStrToLong_ui(tmp + 1, &tmp, 16, &function) < 0 || *tmp != '\n')
1512 1513
                continue;

1514 1515
            if (domain != dev->address.domain || bus != dev->address.bus ||
                slot != dev->address.slot || function != dev->address.function)
1516
                continue;
1517
            in_matching_device = true;
1518 1519 1520 1521
            match_depth = strspn(line, " ");
        }
    }

E
Eric Blake 已提交
1522
    VIR_FORCE_FCLOSE(fp);
1523 1524 1525 1526

    return ret;
}

1527
static char *
1528
virPCIDeviceReadID(virPCIDevicePtr dev, const char *id_name)
1529
{
1530
    char *path = NULL;
1531 1532
    char *id_str;

1533
    if (!(path = virPCIFile(dev->name, id_name)))
1534
        return NULL;
1535 1536

    /* ID string is '0xNNNN\n' ... i.e. 7 bytes */
1537 1538
    if (virFileReadAll(path, 7, &id_str) < 0) {
        VIR_FREE(path);
1539
        return NULL;
1540 1541 1542
    }

    VIR_FREE(path);
1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555

    /* Check for 0x suffix */
    if (id_str[0] != '0' || id_str[1] != 'x') {
        VIR_FREE(id_str);
        return NULL;
    }

    /* Chop off the newline; we know the string is 7 bytes */
    id_str[6] = '\0';

    return id_str;
}

1556
int
1557 1558 1559 1560
virPCIGetAddrString(unsigned int domain,
                    unsigned int bus,
                    unsigned int slot,
                    unsigned int function,
1561
                    char **pciConfigAddr)
1562
{
1563
    virPCIDevicePtr dev = NULL;
1564 1565
    int ret = -1;

1566
    dev = virPCIDeviceNew(domain, bus, slot, function);
1567
    if (dev != NULL) {
1568
        if (VIR_STRDUP(*pciConfigAddr, dev->name) < 0)
1569 1570 1571 1572
            goto cleanup;
        ret = 0;
    }

1573
 cleanup:
1574
    virPCIDeviceFree(dev);
1575 1576 1577
    return ret;
}

1578
virPCIDevicePtr
1579 1580 1581 1582
virPCIDeviceNew(unsigned int domain,
                unsigned int bus,
                unsigned int slot,
                unsigned int function)
1583
{
1584
    virPCIDevicePtr dev;
E
Eric Blake 已提交
1585 1586
    char *vendor = NULL;
    char *product = NULL;
1587

1588
    if (VIR_ALLOC(dev) < 0)
1589 1590
        return NULL;

1591 1592 1593 1594
    dev->address.domain = domain;
    dev->address.bus = bus;
    dev->address.slot = slot;
    dev->address.function = function;
1595

E
Eric Blake 已提交
1596
    if (snprintf(dev->name, sizeof(dev->name), "%.4x:%.2x:%.2x.%.1x",
1597
                 domain, bus, slot, function) >= sizeof(dev->name)) {
1598
        virReportError(VIR_ERR_INTERNAL_ERROR,
E
Eric Blake 已提交
1599
                       _("dev->name buffer overflow: %.4x:%.2x:%.2x.%.1x"),
1600
                       domain, bus, slot, function);
E
Eric Blake 已提交
1601
        goto error;
E
Eric Blake 已提交
1602 1603
    }
    if (virAsprintf(&dev->path, PCI_SYSFS "devices/%s/config",
1604
                    dev->name) < 0)
E
Eric Blake 已提交
1605
        goto error;
1606

1607
    if (!virFileExists(dev->path)) {
1608 1609 1610
        virReportSystemError(errno,
                             _("Device %s not found: could not access %s"),
                             dev->name, dev->path);
E
Eric Blake 已提交
1611
        goto error;
1612 1613
    }

1614 1615
    vendor  = virPCIDeviceReadID(dev, "vendor");
    product = virPCIDeviceReadID(dev, "device");
1616 1617

    if (!vendor || !product) {
1618
        virReportError(VIR_ERR_INTERNAL_ERROR,
1619 1620
                       _("Failed to read product/vendor ID for %s"),
                       dev->name);
E
Eric Blake 已提交
1621
        goto error;
1622 1623 1624
    }

    /* strings contain '0x' prefix */
E
Eric Blake 已提交
1625 1626
    if (snprintf(dev->id, sizeof(dev->id), "%s %s", &vendor[2],
                 &product[2]) >= sizeof(dev->id)) {
1627
        virReportError(VIR_ERR_INTERNAL_ERROR,
E
Eric Blake 已提交
1628 1629
                       _("dev->id buffer overflow: %s %s"),
                       &vendor[2], &product[2]);
E
Eric Blake 已提交
1630
        goto error;
E
Eric Blake 已提交
1631
    }
1632 1633 1634

    VIR_DEBUG("%s %s: initialized", dev->id, dev->name);

1635
 cleanup:
E
Eric Blake 已提交
1636 1637
    VIR_FREE(product);
    VIR_FREE(vendor);
1638
    return dev;
E
Eric Blake 已提交
1639

1640
 error:
1641
    virPCIDeviceFree(dev);
E
Eric Blake 已提交
1642 1643
    dev = NULL;
    goto cleanup;
1644 1645
}

L
Laine Stump 已提交
1646 1647 1648 1649 1650 1651

/* Create an independent copy of @dev.  Plain members are copied by
 * value; the path and used-by strings are duplicated so that the copy
 * owns its own memory.
 *
 * Returns the copy, or NULL on failure.
 */
virPCIDevicePtr
virPCIDeviceCopy(virPCIDevicePtr dev)
{
    virPCIDevicePtr copy;

    if (VIR_ALLOC(copy) < 0)
        return NULL;

    /* shallow copy to take care of most attributes */
    *copy = *dev;

    /* The string members must not alias the original's */
    copy->path = NULL;
    copy->used_by_drvname = NULL;
    copy->used_by_domname = NULL;

    if (VIR_STRDUP(copy->path, dev->path) >= 0 &&
        VIR_STRDUP(copy->used_by_drvname, dev->used_by_drvname) >= 0 &&
        VIR_STRDUP(copy->used_by_domname, dev->used_by_domname) >= 0)
        return copy;

    virPCIDeviceFree(copy);
    return NULL;
}


1672
void
1673
virPCIDeviceFree(virPCIDevicePtr dev)
1674
{
1675 1676
    if (!dev)
        return;
1677
    VIR_DEBUG("%s %s: freeing", dev->id, dev->name);
E
Eric Blake 已提交
1678
    VIR_FREE(dev->path);
C
Chunyan Liu 已提交
1679 1680
    VIR_FREE(dev->used_by_drvname);
    VIR_FREE(dev->used_by_domname);
1681 1682
    VIR_FREE(dev);
}
1683

1684 1685 1686 1687 1688
/**
 * virPCIDeviceGetAddress:
 * @dev: device to get address from
 *
 * Hand out the PCI address stored inside @dev. The address
 * belongs to the device object, so the caller must not free it.
 *
 * Returns: a pointer to the address, which can never be NULL.
 */
virPCIDeviceAddressPtr
virPCIDeviceGetAddress(virPCIDevicePtr dev)
{
    return &dev->address;
}

1699
const char *
1700
virPCIDeviceGetName(virPCIDevicePtr dev)
1701 1702 1703 1704
{
    return dev->name;
}

1705
/* Record in @dev whether it is a managed device. */
void
virPCIDeviceSetManaged(virPCIDevicePtr dev, bool managed)
{
    dev->managed = managed;
}

1710 1711
/* Return the 'managed' flag stored in @dev. */
unsigned int
virPCIDeviceGetManaged(virPCIDevicePtr dev)
{
    return dev->managed;
}

1716 1717
/* Select the stub driver that @dev is to be bound to. */
void
virPCIDeviceSetStubDriver(virPCIDevicePtr dev, virPCIStubDriver driver)
{
    dev->stubDriver = driver;
}

1722
virPCIStubDriver
1723 1724 1725 1726 1727
virPCIDeviceGetStubDriver(virPCIDevicePtr dev)
{
    return dev->stubDriver;
}

1728
unsigned int
1729
virPCIDeviceGetUnbindFromStub(virPCIDevicePtr dev)
1730 1731 1732 1733 1734
{
    return dev->unbind_from_stub;
}

void
1735
virPCIDeviceSetUnbindFromStub(virPCIDevicePtr dev, bool unbind)
1736
{
1737
    dev->unbind_from_stub = unbind;
1738 1739
}

1740
unsigned int
1741
virPCIDeviceGetRemoveSlot(virPCIDevicePtr dev)
1742 1743 1744 1745 1746
{
    return dev->remove_slot;
}

void
1747
virPCIDeviceSetRemoveSlot(virPCIDevicePtr dev, bool remove_slot)
1748
{
1749
    dev->remove_slot = remove_slot;
1750 1751
}

1752
unsigned int
1753
virPCIDeviceGetReprobe(virPCIDevicePtr dev)
1754 1755 1756 1757 1758
{
    return dev->reprobe;
}

void
1759
virPCIDeviceSetReprobe(virPCIDevicePtr dev, bool reprobe)
1760
{
1761
    dev->reprobe = reprobe;
1762 1763
}

C
Chunyan Liu 已提交
1764 1765 1766 1767
/* Replace the driver and domain names recorded in @dev as its user
 * with copies of @drv_name and @dom_name.  The previous values are
 * freed first.
 *
 * Returns 0 on success, -1 if duplicating either string fails.
 */
int
virPCIDeviceSetUsedBy(virPCIDevicePtr dev,
                      const char *drv_name,
                      const char *dom_name)
{
    VIR_FREE(dev->used_by_drvname);
    VIR_FREE(dev->used_by_domname);

    if (VIR_STRDUP(dev->used_by_drvname, drv_name) < 0 ||
        VIR_STRDUP(dev->used_by_domname, dom_name) < 0)
        return -1;

    return 0;
}

C
Chunyan Liu 已提交
1779 1780 1781 1782
/* Store in *@drv_name and *@dom_name the driver and domain names
 * recorded in @dev.  The pointers refer to the device's own strings,
 * no copies are made.
 */
void
virPCIDeviceGetUsedBy(virPCIDevicePtr dev,
                      const char **drv_name,
                      const char **dom_name)
{
    *dom_name = dev->used_by_domname;
    *drv_name = dev->used_by_drvname;
}

1788 1789
virPCIDeviceListPtr
virPCIDeviceListNew(void)
1790
{
1791
    virPCIDeviceListPtr list;
1792

1793 1794 1795 1796
    if (virPCIInitialize() < 0)
        return NULL;

    if (!(list = virObjectLockableNew(virPCIDeviceListClass)))
1797 1798 1799 1800 1801
        return NULL;

    return list;
}

1802 1803
static void
virPCIDeviceListDispose(void *obj)
1804
{
1805
    virPCIDeviceListPtr list = obj;
1806
    size_t i;
1807 1808

    for (i = 0; i < list->count; i++) {
1809
        virPCIDeviceFree(list->devs[i]);
1810 1811 1812 1813 1814 1815 1816 1817
        list->devs[i] = NULL;
    }

    list->count = 0;
    VIR_FREE(list->devs);
}

int
1818 1819
virPCIDeviceListAdd(virPCIDeviceListPtr list,
                    virPCIDevicePtr dev)
1820
{
1821
    if (virPCIDeviceListFind(list, dev)) {
1822
        virReportError(VIR_ERR_INTERNAL_ERROR,
1823 1824 1825
                       _("Device %s is already in use"), dev->name);
        return -1;
    }
1826
    return VIR_APPEND_ELEMENT(list->devs, list->count, dev);
1827 1828
}

L
Laine Stump 已提交
1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845

/*
 * virPCIDeviceListAddCopy:
 * @list: list to extend
 * @dev: device to duplicate
 *
 * Add a *copy* of @dev to @list; @dev itself is left untouched.  If
 * the copy cannot be added it is freed again.
 *
 * Returns 0 on success, -1 on failure.
 */
int
virPCIDeviceListAddCopy(virPCIDeviceListPtr list, virPCIDevicePtr dev)
{
    virPCIDevicePtr duplicate;

    if (!(duplicate = virPCIDeviceCopy(dev)))
        return -1;

    if (virPCIDeviceListAdd(list, duplicate) < 0) {
        virPCIDeviceFree(duplicate);
        return -1;
    }

    return 0;
}


1846 1847 1848
/*
 * virPCIDeviceListGet:
 * @list: list to read
 * @idx: position of the wanted entry
 *
 * Returns the device stored at @idx, or NULL when @idx is negative
 * or not below the list's element count.
 */
virPCIDevicePtr
virPCIDeviceListGet(virPCIDeviceListPtr list,
                    int idx)
{
    if (idx < 0 || idx >= list->count)
        return NULL;

    return list->devs[idx];
}

1858
size_t
1859
virPCIDeviceListCount(virPCIDeviceListPtr list)
1860
{
1861 1862 1863
    return list->count;
}

1864 1865 1866
/*
 * virPCIDeviceListStealIndex:
 * @list: list to modify
 * @idx: position of the entry to remove
 *
 * Remove the entry at @idx from @list without freeing it and hand it
 * to the caller.
 *
 * Returns the removed device, or NULL if @idx is out of range.
 */
virPCIDevicePtr
virPCIDeviceListStealIndex(virPCIDeviceListPtr list,
                           int idx)
{
    virPCIDevicePtr stolen = NULL;

    if (idx >= 0 && idx < list->count) {
        stolen = list->devs[idx];
        VIR_DELETE_ELEMENT(list->devs, idx, list->count);
    }

    return stolen;
}

1878 1879 1880
/*
 * virPCIDeviceListSteal:
 * @list: list to modify
 * @dev: device whose PCI address identifies the entry
 *
 * Look up the entry matching @dev and remove it from @list without
 * freeing it.
 *
 * Returns the removed entry, or NULL if no entry matches.
 */
virPCIDevicePtr
virPCIDeviceListSteal(virPCIDeviceListPtr list,
                      virPCIDevicePtr dev)
{
    int idx = virPCIDeviceListFindIndex(list, dev);

    return virPCIDeviceListStealIndex(list, idx);
}

1885
void
1886 1887
virPCIDeviceListDel(virPCIDeviceListPtr list,
                    virPCIDevicePtr dev)
1888
{
1889
    virPCIDevicePtr ret = virPCIDeviceListSteal(list, dev);
1890
    virPCIDeviceFree(ret);
1891 1892
}

1893
int
1894
virPCIDeviceListFindIndex(virPCIDeviceListPtr list, virPCIDevicePtr dev)
1895
{
1896
    size_t i;
1897

1898 1899 1900 1901 1902 1903
    for (i = 0; i < list->count; i++) {
        virPCIDevicePtr other = list->devs[i];
        if (other->address.domain   == dev->address.domain &&
            other->address.bus      == dev->address.bus    &&
            other->address.slot     == dev->address.slot   &&
            other->address.function == dev->address.function)
1904
            return i;
1905
    }
1906 1907 1908
    return -1;
}

L
Laine Stump 已提交
1909 1910 1911 1912 1913 1914 1915 1916

/*
 * virPCIDeviceListFindByIDs:
 * @list: list to search
 * @domain: PCI domain to match
 * @bus: PCI bus to match
 * @slot: PCI slot to match
 * @function: PCI function to match
 *
 * Returns the first entry of @list whose address equals the given
 * values (the entry stays on the list), or NULL if there is none.
 */
virPCIDevicePtr
virPCIDeviceListFindByIDs(virPCIDeviceListPtr list,
                          unsigned int domain,
                          unsigned int bus,
                          unsigned int slot,
                          unsigned int function)
{
    size_t pos;

    for (pos = 0; pos < list->count; pos++) {
        virPCIDevicePtr candidate = list->devs[pos];

        if (candidate->address.domain != domain)
            continue;
        if (candidate->address.bus != bus)
            continue;
        if (candidate->address.slot != slot)
            continue;
        if (candidate->address.function != function)
            continue;

        return candidate;
    }

    return NULL;
}


1931 1932
/*
 * virPCIDeviceListFind:
 * @list: list to search
 * @dev: device supplying the PCI address to look for
 *
 * Returns the entry of @list with the same PCI address as @dev (the
 * entry stays on the list), or NULL if there is none.
 */
virPCIDevicePtr
virPCIDeviceListFind(virPCIDeviceListPtr list, virPCIDevicePtr dev)
{
    int idx = virPCIDeviceListFindIndex(list, dev);

    if (idx < 0)
        return NULL;

    return list->devs[idx];
}
1941 1942


1943 1944 1945
/*
 * virPCIDeviceFileIterate:
 * @dev: device whose sysfs directory is scanned
 * @actor: callback run on each selected file
 * @opaque: passed through to @actor
 *
 * Walk /sys/bus/pci/devices/<address of @dev> and call @actor with
 * the full path of every entry named "config", "rom", "vendor",
 * "device" or "reset", or whose name starts with "resource".
 * Iteration stops at the first callback that returns a negative
 * value.
 *
 * Returns 0 on success, -1 on any failure (including a failing
 * callback).
 */
int virPCIDeviceFileIterate(virPCIDevicePtr dev,
                            virPCIDeviceFileActor actor,
                            void *opaque)
{
    struct dirent *ent;
    DIR *dir = NULL;
    char *pcidir = NULL;
    char *file = NULL;
    int direrr;
    int ret = -1;

    if (virAsprintf(&pcidir, "/sys/bus/pci/devices/%04x:%02x:%02x.%x",
                    dev->address.domain, dev->address.bus,
                    dev->address.slot, dev->address.function) < 0)
        goto cleanup;

    dir = opendir(pcidir);
    if (!dir) {
        virReportSystemError(errno,
                             _("cannot open %s"), pcidir);
        goto cleanup;
    }

    while ((direrr = virDirRead(dir, &ent, pcidir)) > 0) {
        const char *name = ent->d_name;
        /* Device assignment requires:
         *   $PCIDIR/config, $PCIDIR/resource, $PCIDIR/resourceNNN,
         *   $PCIDIR/rom, $PCIDIR/reset, $PCIDIR/vendor, $PCIDIR/device
         */
        bool wanted = STREQ(name, "config") ||
                      STRPREFIX(name, "resource") ||
                      STREQ(name, "rom") ||
                      STREQ(name, "vendor") ||
                      STREQ(name, "device") ||
                      STREQ(name, "reset");

        if (!wanted)
            continue;

        if (virAsprintf(&file, "%s/%s", pcidir, name) < 0)
            goto cleanup;
        if ((actor)(dev, file, opaque) < 0)
            goto cleanup;

        VIR_FREE(file);
    }
    if (direrr < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    if (dir)
        closedir(dir);
    VIR_FREE(file);
    VIR_FREE(pcidir);
    return ret;
}
J
Jiri Denemark 已提交
1996

L
Laine Stump 已提交
1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011

/* virPCIDeviceAddressIOMMUGroupIterate:
 * @orig: address of the device whose group is walked
 * @actor: callback run for every group member
 * @opaque: passed through to @actor
 *
 *   Call @actor for all devices in the same iommu_group as @orig
 *   (including @orig itself).  If the group's "devices" directory
 *   cannot be opened, @actor is called exactly once, for @orig, and
 *   its return value becomes the result.
 *
 * Returns 0 on success, -1 on failure (unparsable entry, read error
 * or a callback returning a negative value).
 */
int
virPCIDeviceAddressIOMMUGroupIterate(virPCIDeviceAddressPtr orig,
                                     virPCIDeviceAddressActor actor,
                                     void *opaque)
{
    struct dirent *ent;
    DIR *groupDir = NULL;
    char *groupPath = NULL;
    int direrr;
    int ret = -1;

    if (virAsprintf(&groupPath,
                    PCI_SYSFS "devices/%04x:%02x:%02x.%x/iommu_group/devices",
                    orig->domain, orig->bus, orig->slot, orig->function) < 0)
        goto cleanup;

    groupDir = opendir(groupPath);
    if (!groupDir) {
        /* just process the original device, nothing more */
        ret = (actor)(orig, opaque);
        goto cleanup;
    }

    while ((direrr = virDirRead(groupDir, &ent, groupPath)) > 0) {
        virPCIDeviceAddress member;

        /* skip every entry whose name begins with a dot */
        if (ent->d_name[0] == '.')
            continue;

        if (virPCIDeviceAddressParse(ent->d_name, &member) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Found invalid device link '%s' in '%s'"),
                           ent->d_name, groupPath);
            goto cleanup;
        }

        if ((actor)(&member, opaque) < 0)
            goto cleanup;
    }
    if (direrr < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    VIR_FREE(groupPath);
    if (groupDir)
        closedir(groupDir);
    return ret;
}


/*
 * Iterator callback: create a device object for @newDevAddr and put
 * it on the virPCIDeviceList passed in @opaque.  A device that could
 * not be added is freed again.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
virPCIDeviceGetIOMMUGroupAddOne(virPCIDeviceAddressPtr newDevAddr, void *opaque)
{
    virPCIDeviceListPtr groupList = opaque;
    virPCIDevicePtr member;

    member = virPCIDeviceNew(newDevAddr->domain, newDevAddr->bus,
                             newDevAddr->slot, newDevAddr->function);
    if (!member)
        return -1;

    if (virPCIDeviceListAdd(groupList, member) < 0) {
        virPCIDeviceFree(member);
        return -1;
    }

    /* the device now lives on the list and must not be freed here */
    return 0;
}


/*
 * virPCIDeviceGetIOMMUGroupList - return a virPCIDeviceList containing
 * all of the devices in the same iommu_group as @dev.
 *
 * Return the new list, or NULL on failure (a partially filled list is
 * unreferenced before returning).
 */
virPCIDeviceListPtr
virPCIDeviceGetIOMMUGroupList(virPCIDevicePtr dev)
{
    virPCIDeviceListPtr groupList = virPCIDeviceListNew();

    if (groupList &&
        virPCIDeviceAddressIOMMUGroupIterate(&(dev->address),
                                             virPCIDeviceGetIOMMUGroupAddOne,
                                             groupList) >= 0)
        return groupList;

    virObjectUnref(groupList);
    return NULL;
}


/* Bundles the caller's output array and its element count so both can
 * be handed to an iterator callback through a single opaque pointer.
 */
typedef struct {
    virPCIDeviceAddressPtr **iommuGroupDevices; /* points at the caller's array of addresses */
    size_t *nIommuGroupDevices;                 /* points at the caller's element count */
} virPCIDeviceAddressList;
typedef virPCIDeviceAddressList *virPCIDeviceAddressListPtr;

/*
 * Iterator callback: append a heap-allocated copy of @newDevAddr to
 * the address array referenced by the virPCIDeviceAddressList passed
 * in @opaque.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
virPCIGetIOMMUGroupAddressesAddOne(virPCIDeviceAddressPtr newDevAddr, void *opaque)
{
    virPCIDeviceAddressListPtr addrList = opaque;
    virPCIDeviceAddressPtr copyAddr;
    int rc;

    /* make a copy to insert onto the list */
    if (VIR_ALLOC(copyAddr) < 0)
        return -1;

    *copyAddr = *newDevAddr;

    if (VIR_APPEND_ELEMENT(*addrList->iommuGroupDevices,
                           *addrList->nIommuGroupDevices, copyAddr) < 0)
        rc = -1;
    else
        rc = 0;

    /* releases the copy if the append did not take it over */
    VIR_FREE(copyAddr);
    return rc;
}


/*
 * virPCIDeviceAddressGetIOMMUGroupAddresses - collect the addresses of
 * all of the devices in the same iommu_group as @devAddr.
 *
 * Each address found is appended as a newly allocated element to
 * *@iommuGroupDevices, and *@nIommuGroupDevices is updated to match.
 * Entries appended before a failure are left in the array.
 *
 * Return 0 on success, -1 on failure
 */
int
virPCIDeviceAddressGetIOMMUGroupAddresses(virPCIDeviceAddressPtr devAddr,
                                          virPCIDeviceAddressPtr **iommuGroupDevices,
                                          size_t *nIommuGroupDevices)
{
    virPCIDeviceAddressList addrList = { iommuGroupDevices,
                                         nIommuGroupDevices };

    if (virPCIDeviceAddressIOMMUGroupIterate(devAddr,
                                             virPCIGetIOMMUGroupAddressesAddOne,
                                             &addrList) < 0)
        return -1;

    return 0;
}


/* virPCIDeviceAddressGetIOMMUGroupNum - return the group number of
 * this PCI device's iommu_group, or -2 if there is no iommu_group for
 * the device (or -1 if there was any other error)
 *
 * The number is taken from the last path component of the target of
 * the device's "iommu_group" symlink.
 */
int
virPCIDeviceAddressGetIOMMUGroupNum(virPCIDeviceAddressPtr addr)
{
    const char *groupNumStr;
    char *groupPath = NULL;
    char *devPath = NULL;
    char *devName = NULL;
    unsigned int groupNum;
    int ret = -1;

    if (virAsprintf(&devName, "%.4x:%.2x:%.2x.%.1x", addr->domain,
                    addr->bus, addr->slot, addr->function) < 0)
        goto cleanup;

    devPath = virPCIFile(devName, "iommu_group");
    if (!devPath)
        goto cleanup;

    /* anything other than a symlink means "no group" for this device */
    if (virFileIsLink(devPath) != 1) {
        ret = -2;
        goto cleanup;
    }

    if (virFileResolveLink(devPath, &groupPath) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s iommu_group symlink %s"),
                       devName, devPath);
        goto cleanup;
    }

    groupNumStr = last_component(groupPath);
    if (virStrToLong_ui(groupNumStr, NULL, 10, &groupNum) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("device %s iommu_group symlink %s has "
                         "invalid group number %s"),
                       devName, groupPath, groupNumStr);
        goto cleanup;
    }

    ret = groupNum;

 cleanup:
    VIR_FREE(devName);
    VIR_FREE(devPath);
    VIR_FREE(groupPath);
    return ret;
}


2210 2211
/* virPCIDeviceGetIOMMUGroupDev - return the name of the device used
 * to control this PCI device's group (e.g. "/dev/vfio/15")
2212 2213
 */
char *
2214
virPCIDeviceGetIOMMUGroupDev(virPCIDevicePtr dev)
2215 2216 2217 2218 2219
{
    char *devPath = NULL;
    char *groupPath = NULL;
    char *groupDev = NULL;

2220
    if (!(devPath = virPCIFile(dev->name, "iommu_group")))
2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234
        goto cleanup;
    if (virFileIsLink(devPath) != 1) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid device %s iommu_group file %s is not a symlink"),
                       dev->name, devPath);
        goto cleanup;
    }
    if (virFileResolveLink(devPath, &groupPath) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s iommu_group symlink %s"),
                       dev->name, devPath);
        goto cleanup;
    }
    if (virAsprintf(&groupDev, "/dev/vfio/%s",
2235
                    last_component(groupPath)) < 0)
2236
        goto cleanup;
2237
 cleanup:
2238 2239 2240 2241 2242
    VIR_FREE(devPath);
    VIR_FREE(groupPath);
    return groupDev;
}

J
Jiri Denemark 已提交
2243
static int
2244
virPCIDeviceDownstreamLacksACS(virPCIDevicePtr dev)
J
Jiri Denemark 已提交
2245 2246 2247 2248
{
    uint16_t flags;
    uint16_t ctrl;
    unsigned int pos;
2249 2250
    int fd;
    int ret = 0;
2251
    uint16_t device_class;
J
Jiri Denemark 已提交
2252

2253
    if ((fd = virPCIDeviceConfigOpen(dev, true)) < 0)
J
Jiri Denemark 已提交
2254 2255
        return -1;

2256
    if (virPCIDeviceInit(dev, fd) < 0) {
2257 2258 2259 2260
        ret = -1;
        goto cleanup;
    }

2261 2262 2263
    if (virPCIDeviceReadClass(dev, &device_class) < 0)
        goto cleanup;

J
Jiri Denemark 已提交
2264
    pos = dev->pcie_cap_pos;
2265
    if (!pos || device_class != PCI_CLASS_BRIDGE_PCI)
2266
        goto cleanup;
J
Jiri Denemark 已提交
2267

2268
    flags = virPCIDeviceRead16(dev, fd, pos + PCI_EXP_FLAGS);
J
Jiri Denemark 已提交
2269
    if (((flags & PCI_EXP_FLAGS_TYPE) >> 4) != PCI_EXP_TYPE_DOWNSTREAM)
2270
        goto cleanup;
J
Jiri Denemark 已提交
2271

2272
    pos = virPCIDeviceFindExtendedCapabilityOffset(dev, fd, PCI_EXT_CAP_ID_ACS);
J
Jiri Denemark 已提交
2273 2274
    if (!pos) {
        VIR_DEBUG("%s %s: downstream port lacks ACS", dev->id, dev->name);
2275 2276
        ret = 1;
        goto cleanup;
J
Jiri Denemark 已提交
2277 2278
    }

2279
    ctrl = virPCIDeviceRead16(dev, fd, pos + PCI_EXT_ACS_CTRL);
J
Jiri Denemark 已提交
2280 2281 2282
    if ((ctrl & PCI_EXT_CAP_ACS_ENABLED) != PCI_EXT_CAP_ACS_ENABLED) {
        VIR_DEBUG("%s %s: downstream port has ACS disabled",
                  dev->id, dev->name);
2283 2284
        ret = 1;
        goto cleanup;
J
Jiri Denemark 已提交
2285 2286
    }

2287
 cleanup:
2288
    virPCIDeviceConfigClose(dev, fd);
2289
    return ret;
J
Jiri Denemark 已提交
2290 2291 2292
}

static int
2293
virPCIDeviceIsBehindSwitchLackingACS(virPCIDevicePtr dev)
J
Jiri Denemark 已提交
2294
{
2295
    virPCIDevicePtr parent;
J
Jiri Denemark 已提交
2296

2297
    if (virPCIDeviceGetParent(dev, &parent) < 0)
2298
        return -1;
2299 2300 2301 2302 2303
    if (!parent) {
        /* if we have no parent, and this is the root bus, ACS doesn't come
         * into play since devices on the root bus can't P2P without going
         * through the root IOMMU.
         */
2304
        if (dev->address.bus == 0) {
2305
            return 0;
2306
        } else {
2307
            virReportError(VIR_ERR_INTERNAL_ERROR,
2308 2309 2310 2311
                           _("Failed to find parent device for %s"),
                           dev->name);
            return -1;
        }
J
Jiri Denemark 已提交
2312 2313 2314 2315 2316 2317 2318
    }

    /* XXX we should rather fail when we can't find device's parent and
     * stop the loop when we get to root instead of just stopping when no
     * parent can be found
     */
    do {
2319
        virPCIDevicePtr tmp;
J
Jiri Denemark 已提交
2320
        int acs;
2321
        int ret;
J
Jiri Denemark 已提交
2322

2323
        acs = virPCIDeviceDownstreamLacksACS(parent);
J
Jiri Denemark 已提交
2324 2325

        if (acs) {
2326
            virPCIDeviceFree(parent);
J
Jiri Denemark 已提交
2327 2328 2329 2330 2331 2332 2333
            if (acs < 0)
                return -1;
            else
                return 1;
        }

        tmp = parent;
2334 2335
        ret = virPCIDeviceGetParent(parent, &parent);
        virPCIDeviceFree(tmp);
2336 2337
        if (ret < 0)
            return -1;
J
Jiri Denemark 已提交
2338 2339 2340 2341 2342
    } while (parent);

    return 0;
}

2343 2344
/*
 * virPCIDeviceIsAssignable:
 * @dev: device to check
 * @strict_acs_check: when non-zero, a device behind a switch lacking
 *                    ACS is rejected with an error
 *
 * Returns 1 if @dev may be assigned, 0 otherwise (including when the
 * ACS check itself failed).
 */
int virPCIDeviceIsAssignable(virPCIDevicePtr dev,
                             int strict_acs_check)
{
    int lacking;

    /* XXX This could be a great place to actually check that a non-managed
     * device isn't in use, e.g. by checking that device is either un-bound
     * or bound to a stub driver.
     */

    lacking = virPCIDeviceIsBehindSwitchLackingACS(dev);
    if (lacking < 0)
        return 0;

    if (lacking == 0)
        return 1;

    if (strict_acs_check) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Device %s is behind a switch lacking ACS and "
                         "cannot be assigned"),
                       dev->name);
        return 0;
    }

    VIR_DEBUG("%s %s: strict ACS check disabled; device assignment allowed",
              dev->id, dev->name);
    return 1;
}
2372 2373 2374 2375 2376 2377 2378 2379 2380 2381

/*
 * Wrapper around virStrToLong_ui() that logs an error naming the
 * offending string whenever the conversion returns non-zero.  The
 * conversion result is returned unchanged.
 */
static int
logStrToLong_ui(char const *s,
                char **end_ptr,
                int base,
                unsigned int *result)
{
    int rc = virStrToLong_ui(s, end_ptr, base, result);

    if (rc != 0)
        VIR_ERROR(_("Failed to convert '%s' to unsigned int"), s);

    return rc;
}

2387 2388
int
virPCIDeviceAddressParse(char *address,
2389
                         virPCIDeviceAddressPtr bdf)
2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415
{
    char *p = NULL;
    int ret = -1;

    if ((address == NULL) || (logStrToLong_ui(address, &p, 16,
                                              &bdf->domain) == -1)) {
        goto out;
    }

    if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
                                        &bdf->bus) == -1)) {
        goto out;
    }

    if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
                                        &bdf->slot) == -1)) {
        goto out;
    }

    if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
                                        &bdf->function) == -1)) {
        goto out;
    }

    ret = 0;

2416
 out:
2417 2418 2419
    return ret;
}

2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434
#ifdef __linux__

/*
 * returns true if all four address fields (domain, bus, slot,
 * function) of @bdf1 and @bdf2 are equal
 */
static bool
virPCIDeviceAddressIsEqual(virPCIDeviceAddressPtr bdf1,
                           virPCIDeviceAddressPtr bdf2)
{
    if (bdf1->domain != bdf2->domain)
        return false;
    if (bdf1->bus != bdf2->bus)
        return false;
    if (bdf1->slot != bdf2->slot)
        return false;

    return bdf1->function == bdf2->function;
}

2435
static int
2436 2437
virPCIGetDeviceAddressFromSysfsLink(const char *device_link,
                                    virPCIDeviceAddressPtr *bdf)
2438 2439 2440 2441 2442 2443 2444
{
    char *config_address = NULL;
    char *device_path = NULL;
    char errbuf[64];
    int ret = -1;

    if (!virFileExists(device_link)) {
2445
        VIR_DEBUG("'%s' does not exist", device_link);
2446 2447 2448
        return ret;
    }

2449
    device_path = canonicalize_file_name(device_link);
2450 2451
    if (device_path == NULL) {
        memset(errbuf, '\0', sizeof(errbuf));
2452 2453 2454
        virReportSystemError(errno,
                             _("Failed to resolve device link '%s'"),
                             device_link);
2455 2456 2457
        return ret;
    }

2458
    config_address = last_component(device_path);
2459
    if (VIR_ALLOC(*bdf) != 0)
2460 2461
        goto out;

2462
    if (virPCIDeviceAddressParse(config_address, *bdf) != 0) {
2463
        virReportError(VIR_ERR_INTERNAL_ERROR,
2464 2465 2466 2467 2468 2469 2470
                       _("Failed to parse PCI config address '%s'"),
                       config_address);
        VIR_FREE(*bdf);
        goto out;
    }

    ret = 0;
2471
 out:
2472 2473 2474 2475 2476 2477 2478 2479 2480
    VIR_FREE(device_path);

    return ret;
}

/*
 * Returns Physical function given a virtual function
 */
int
2481
virPCIGetPhysicalFunction(const char *vf_sysfs_path,
2482
                          virPCIDeviceAddressPtr *pf)
2483 2484 2485 2486 2487 2488 2489 2490 2491
{
    int ret = -1;
    char *device_link = NULL;

    if (virBuildPath(&device_link, vf_sysfs_path, "physfn") == -1) {
        virReportOOMError();
        return ret;
    }

2492 2493 2494 2495
    if ((ret = virPCIGetDeviceAddressFromSysfsLink(device_link, pf)) >= 0) {
        VIR_DEBUG("PF for VF device '%s': %.4x:%.2x:%.2x.%.1x", vf_sysfs_path,
                  (*pf)->domain, (*pf)->bus, (*pf)->slot, (*pf)->function);
    }
2496 2497 2498 2499 2500
    VIR_FREE(device_link);

    return ret;
}

2501

2502 2503 2504 2505
/*
 * Returns virtual functions of a physical function
 */
int
2506 2507
virPCIGetVirtualFunctions(const char *sysfs_path,
                          virPCIDeviceAddressPtr **virtual_functions,
2508 2509
                          size_t *num_virtual_functions,
                          unsigned int *max_virtual_functions)
2510 2511
{
    int ret = -1;
2512
    size_t i;
2513
    char *device_link = NULL;
2514
    virPCIDeviceAddress *config_addr = NULL;
2515
    char *totalvfs_file = NULL, *totalvfs_str = NULL;
2516

2517 2518
    *virtual_functions = NULL;
    *num_virtual_functions = 0;
2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534
    *max_virtual_functions = 0;

    if (virAsprintf(&totalvfs_file, "%s/sriov_totalvfs", sysfs_path) < 0)
       goto error;
    if (virFileExists(totalvfs_file)) {
        char *end = NULL; /* so that terminating \n doesn't create error */

        if (virFileReadAll(totalvfs_file, 16, &totalvfs_str) < 0)
            goto error;
        if (virStrToLong_ui(totalvfs_str, &end, 10, max_virtual_functions) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Unrecognized value in %s: %s"),
                           totalvfs_file, totalvfs_str);
            goto error;
        }
    }
2535

2536 2537 2538 2539
    do {
        /* look for virtfn%d links until one isn't found */
        if (virAsprintf(&device_link, "%s/virtfn%zu", sysfs_path, *num_virtual_functions) < 0)
            goto error;
2540

2541 2542
        if (!virFileExists(device_link))
            break;
2543

2544 2545 2546 2547 2548 2549
        if (virPCIGetDeviceAddressFromSysfsLink(device_link, &config_addr) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Failed to get SRIOV function from device link '%s'"),
                           device_link);
            goto error;
        }
2550

2551 2552 2553
        if (VIR_APPEND_ELEMENT(*virtual_functions, *num_virtual_functions, config_addr) < 0)
            goto error;
        VIR_FREE(device_link);
2554

2555
    } while (1);
2556

2557
    VIR_DEBUG("Found %zu virtual functions for %s", *num_virtual_functions, sysfs_path);
2558
    ret = 0;
2559
 cleanup:
2560
    VIR_FREE(device_link);
2561
    VIR_FREE(config_addr);
2562 2563
    VIR_FREE(totalvfs_file);
    VIR_FREE(totalvfs_str);
2564
    return ret;
2565

2566
 error:
2567 2568 2569
    for (i = 0; i < *num_virtual_functions; i++)
        VIR_FREE((*virtual_functions)[i]);
    VIR_FREE(*virtual_functions);
2570
    goto cleanup;
2571
}
2572

2573

2574 2575 2576 2577
/*
 * Returns 1 if vf device is a virtual function, 0 if not, -1 on error
 *
 * The decision is based solely on whether a "physfn" entry exists
 * below @vf_sysfs_device_link.
 */
int
virPCIIsVirtualFunction(const char *vf_sysfs_device_link)
{
    char *physfn_link = NULL;
    int ret;

    if (virAsprintf(&physfn_link, "%s/physfn",
                    vf_sysfs_device_link) < 0)
        return -1;

    ret = virFileExists(physfn_link);

    VIR_FREE(physfn_link);
    return ret;
}

/*
 * Returns the sriov virtual function index of vf given its pf
 */
int
2598 2599 2600
virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link,
                              const char *vf_sysfs_device_link,
                              int *vf_index)
2601
{
2602 2603
    int ret = -1;
    size_t i;
2604
    size_t num_virt_fns = 0;
2605
    unsigned int max_virt_fns = 0;
2606 2607
    virPCIDeviceAddressPtr vf_bdf = NULL;
    virPCIDeviceAddressPtr *virt_fns = NULL;
2608

2609 2610
    if (virPCIGetDeviceAddressFromSysfsLink(vf_sysfs_device_link,
                                            &vf_bdf) < 0)
2611 2612
        return ret;

2613
    if (virPCIGetVirtualFunctions(pf_sysfs_device_link, &virt_fns,
2614
                                  &num_virt_fns, &max_virt_fns) < 0) {
2615
        virReportError(VIR_ERR_INTERNAL_ERROR,
2616
                       _("Error getting physical function's '%s' "
2617
                         "virtual_functions"), pf_sysfs_device_link);
2618 2619 2620 2621
        goto out;
    }

    for (i = 0; i < num_virt_fns; i++) {
2622 2623 2624 2625 2626
        if (virPCIDeviceAddressIsEqual(vf_bdf, virt_fns[i])) {
            *vf_index = i;
            ret = 0;
            break;
        }
2627 2628
    }

2629
 out:
2630 2631 2632

    /* free virtual functions */
    for (i = 0; i < num_virt_fns; i++)
2633
        VIR_FREE(virt_fns[i]);
2634

A
ajia@redhat.com 已提交
2635
    VIR_FREE(virt_fns);
2636 2637 2638 2639 2640
    VIR_FREE(vf_bdf);

    return ret;
}

2641 2642 2643 2644 2645
/*
 * Returns a path to the PCI sysfs file given the BDF of the PCI function
 */

int
2646
virPCIGetSysfsFile(char *virPCIDeviceName, char **pci_sysfs_device_link)
2647
{
2648 2649 2650 2651
    if (virAsprintf(pci_sysfs_device_link, PCI_SYSFS "devices/%s",
                    virPCIDeviceName) < 0)
        return -1;
    return 0;
2652 2653
}

R
Roopa Prabhu 已提交
2654
int
2655
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddressPtr addr,
2656
                                char **pci_sysfs_device_link)
R
Roopa Prabhu 已提交
2657
{
2658
    if (virAsprintf(pci_sysfs_device_link,
2659 2660 2661
                    PCI_SYSFS "devices/%04x:%02x:%02x.%x",
                    addr->domain, addr->bus,
                    addr->slot, addr->function) < 0)
2662 2663
        return -1;
    return 0;
R
Roopa Prabhu 已提交
2664 2665
}

2666 2667 2668 2669
/*
 * Returns the network device name of a pci device
 *
 * The name is the first entry other than "." and ".." found in the
 * "net" subdirectory of @device_link_sysfs_path; a copy of it is
 * stored in *@netname.
 *
 * Returns 0 on success, -1 if the directory cannot be opened, holds
 * no such entry, or the copy fails.
 */
int
virPCIGetNetName(char *device_link_sysfs_path, char **netname)
{
    struct dirent *entry = NULL;
    DIR *dir = NULL;
    char *net_path = NULL;
    int ret = -1;

    if (virBuildPath(&net_path, device_link_sysfs_path,
                     "net") == -1) {
        virReportOOMError();
        return -1;
    }

    if (!(dir = opendir(net_path)))
        goto out;

    while (virDirRead(dir, &entry, net_path) > 0) {
        bool is_dot = STREQ(entry->d_name, ".") ||
                      STREQ(entry->d_name, "..");

        if (is_dot)
            continue;

        /* Assume a single directory entry */
        if (VIR_STRDUP(*netname, entry->d_name) > 0)
            ret = 0;
        break;
    }

    closedir(dir);

 out:
    VIR_FREE(net_path);

    return ret;
}
R
Roopa Prabhu 已提交
2705 2706

int
2707 2708
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path,
                             char **pfname, int *vf_index)
R
Roopa Prabhu 已提交
2709
{
2710
    virPCIDeviceAddressPtr pf_config_address = NULL;
R
Roopa Prabhu 已提交
2711 2712 2713
    char *pf_sysfs_device_path = NULL;
    int ret = -1;

2714
    if (virPCIGetPhysicalFunction(vf_sysfs_device_path, &pf_config_address) < 0)
R
Roopa Prabhu 已提交
2715 2716
        return ret;

2717 2718
    if (virPCIDeviceAddressGetSysfsFile(pf_config_address,
                                        &pf_sysfs_device_path) < 0) {
R
Roopa Prabhu 已提交
2719 2720 2721 2722 2723

        VIR_FREE(pf_config_address);
        return ret;
    }

2724 2725
    if (virPCIGetVirtualFunctionIndex(pf_sysfs_device_path, vf_sysfs_device_path,
                                      vf_index) < 0)
R
Roopa Prabhu 已提交
2726 2727
        goto cleanup;

2728
    ret = virPCIGetNetName(pf_sysfs_device_path, pfname);
R
Roopa Prabhu 已提交
2729

2730
 cleanup:
R
Roopa Prabhu 已提交
2731 2732 2733 2734 2735 2736
    VIR_FREE(pf_config_address);
    VIR_FREE(pf_sysfs_device_path);

    return ret;
}

2737
#else
2738 2739
static const char *unsupported = N_("not supported on non-linux platforms");

2740
int
2741
virPCIGetPhysicalFunction(const char *vf_sysfs_path ATTRIBUTE_UNUSED,
2742
                          virPCIDeviceAddressPtr *pf ATTRIBUTE_UNUSED)
2743
{
2744
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2745 2746 2747 2748
    return -1;
}

int
2749 2750
virPCIGetVirtualFunctions(const char *sysfs_path ATTRIBUTE_UNUSED,
                          virPCIDeviceAddressPtr **virtual_functions ATTRIBUTE_UNUSED,
2751 2752
                          size_t *num_virtual_functions ATTRIBUTE_UNUSED,
                          unsigned int *max_virtual_functions ATTRIBUTE_UNUSED)
2753
{
2754
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2755 2756
    return -1;
}
2757 2758

int
E
Eric Blake 已提交
2759
virPCIIsVirtualFunction(const char *vf_sysfs_device_link ATTRIBUTE_UNUSED)
2760
{
2761
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2762 2763 2764 2765
    return -1;
}

int
2766 2767 2768
virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link ATTRIBUTE_UNUSED,
                              const char *vf_sysfs_device_link ATTRIBUTE_UNUSED,
                              int *vf_index ATTRIBUTE_UNUSED)
2769
{
2770
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2771 2772 2773 2774
    return -1;

}

2775
int
2776 2777
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddressPtr dev ATTRIBUTE_UNUSED,
                                char **pci_sysfs_device_link ATTRIBUTE_UNUSED)
2778
{
2779
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2780 2781 2782
    return -1;
}

2783
int
2784
virPCIGetNetName(char *device_link_sysfs_path ATTRIBUTE_UNUSED,
2785
                 char **netname ATTRIBUTE_UNUSED)
2786
{
2787
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2788 2789
    return -1;
}
R
Roopa Prabhu 已提交
2790 2791

int
2792 2793 2794
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path ATTRIBUTE_UNUSED,
                             char **pfname ATTRIBUTE_UNUSED,
                             int *vf_index ATTRIBUTE_UNUSED)
R
Roopa Prabhu 已提交
2795
{
2796
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
R
Roopa Prabhu 已提交
2797 2798
    return -1;
}
2799
#endif /* __linux__ */

/**
 * virPCIDeviceIsPCIExpress:
 * @dev: PCI device to query
 *
 * Opens the device's config space, runs virPCIDeviceInit() on it and
 * then inspects @dev->pcie_cap_pos.  The config handle is closed again
 * on every path that managed to open it.
 *
 * Returns 1 if pcie_cap_pos is non-zero, 0 if it is zero, and -1 if the
 * config space could not be opened or virPCIDeviceInit() failed.
 */
int
virPCIDeviceIsPCIExpress(virPCIDevicePtr dev)
{
    int result = -1;
    int cfgfd = virPCIDeviceConfigOpen(dev, true);

    if (cfgfd < 0)
        return -1;

    /* Only look at the capability offset once init has succeeded. */
    if (virPCIDeviceInit(dev, cfgfd) >= 0)
        result = dev->pcie_cap_pos ? 1 : 0;

    virPCIDeviceConfigClose(dev, cfgfd);
    return result;
}

/**
 * virPCIDeviceHasPCIExpressLink:
 * @dev: PCI device to query
 *
 * Opens the device's config space, runs virPCIDeviceInit() on it and
 * decodes the device/port type from the 16-bit register found at
 * @dev->pcie_cap_pos + PCI_CAP_FLAGS.  Every type other than
 * PCI_EXP_TYPE_ROOT_INT_EP and PCI_EXP_TYPE_ROOT_EC is treated as
 * having a link.
 *
 * A device without a PCI Express capability (pcie_cap_pos == 0) is
 * reported as having no link instead of being decoded.
 *
 * Returns 1 if the device has a PCI Express link, 0 if it does not,
 * and -1 if the config space could not be opened or
 * virPCIDeviceInit() failed.
 */
int
virPCIDeviceHasPCIExpressLink(virPCIDevicePtr dev)
{
    int fd;
    int ret = -1;
    uint16_t cap, type;

    if ((fd = virPCIDeviceConfigOpen(dev, true)) < 0)
        return ret;

    if (virPCIDeviceInit(dev, fd) < 0)
        goto cleanup;

    /* With no PCIe capability the offset below would be relative to the
     * start of config space, so the read would pick up unrelated header
     * bytes and misinterpret them as the capability flags. */
    if (dev->pcie_cap_pos == 0) {
        ret = 0;
        goto cleanup;
    }

    cap = virPCIDeviceRead16(dev, fd, dev->pcie_cap_pos + PCI_CAP_FLAGS);
    /* Right-align the device/port type field selected by the mask. */
    type = (cap & PCI_EXP_FLAGS_TYPE) >> 4;

    ret = type != PCI_EXP_TYPE_ROOT_INT_EP && type != PCI_EXP_TYPE_ROOT_EC;

 cleanup:
    virPCIDeviceConfigClose(dev, fd);
    return ret;
}

/**
 * virPCIDeviceGetLinkCapSta:
 * @dev: PCI device to query
 * @cap_port: set to the link capabilities register shifted right by 24
 * @cap_speed: set to the PCI_EXP_LNKCAP_SPEED field of link capabilities
 * @cap_width: set to the PCI_EXP_LNKCAP_WIDTH field, shifted right by 4
 * @sta_speed: set to the PCI_EXP_LNKSTA_SPEED field of link status
 * @sta_width: set to the PCI_EXP_LNKSTA_WIDTH field, shifted right by 4
 *
 * Opens the device's config space, runs virPCIDeviceInit() on it and
 * reads the 32-bit register at pcie_cap_pos + PCI_EXP_LNKCAP followed
 * by the 16-bit register at pcie_cap_pos + PCI_EXP_LNKSTA.  The output
 * arguments are written only on success.
 *
 * Returns 0 on success, -1 if the config space could not be opened,
 * virPCIDeviceInit() failed, or the device has no PCI Express
 * capability (in which case VIR_ERR_INTERNAL_ERROR is reported).
 */
int
virPCIDeviceGetLinkCapSta(virPCIDevicePtr dev,
                          int *cap_port,
                          unsigned int *cap_speed,
                          unsigned int *cap_width,
                          unsigned int *sta_speed,
                          unsigned int *sta_width)
{
    uint32_t lnkcap;
    uint32_t lnksta;
    int rc = -1;
    int cfgfd = virPCIDeviceConfigOpen(dev, true);

    if (cfgfd < 0)
        return rc;

    if (virPCIDeviceInit(dev, cfgfd) < 0)
        goto cleanup;

    if (dev->pcie_cap_pos == 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("pci device %s is not a PCI-Express device"),
                       dev->name);
        goto cleanup;
    }

    /* Link capabilities: port number, supported speed and width. */
    lnkcap = virPCIDeviceRead32(dev, cfgfd,
                                dev->pcie_cap_pos + PCI_EXP_LNKCAP);
    *cap_port = lnkcap >> 24;
    *cap_speed = lnkcap & PCI_EXP_LNKCAP_SPEED;
    *cap_width = (lnkcap & PCI_EXP_LNKCAP_WIDTH) >> 4;

    /* Link status: speed and width currently in effect. */
    lnksta = virPCIDeviceRead16(dev, cfgfd,
                                dev->pcie_cap_pos + PCI_EXP_LNKSTA);
    *sta_speed = lnksta & PCI_EXP_LNKSTA_SPEED;
    *sta_width = (lnksta & PCI_EXP_LNKSTA_WIDTH) >> 4;

    rc = 0;

 cleanup:
    virPCIDeviceConfigClose(dev, cfgfd);
    return rc;
}


/**
 * virPCIEDeviceInfoFree:
 * @dev: PCIe device info to release; NULL is accepted and ignored
 *
 * Releases the link_cap and link_sta members of @dev and then @dev
 * itself.
 */
void
virPCIEDeviceInfoFree(virPCIEDeviceInfoPtr dev)
{
    if (dev != NULL) {
        /* Members first, container last. */
        VIR_FREE(dev->link_cap);
        VIR_FREE(dev->link_sta);
        VIR_FREE(dev);
    }
}