Commit 98d176f8 authored by Yuval Shaia, committed by Marcel Apfelbaum

hw/rdma: PVRDMA commands and data-path ops

First PVRDMA sub-module - implementation of the PVRDMA device.
- PVRDMA commands such as create CQ and create MR.
- Data path QP operations - post_send and post_recv.
- Completion handler.
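
In short, the data path works like this: the guest posts WQEs on per-QP
send/receive rings, pvrdma_qp_send()/pvrdma_qp_recv() drain those rings and
hand each work request to the RDMA backend, and the completion handler writes
a CQE to the CQ ring, adds a notification entry to the DSR completion ring and
raises an MSI-X interrupt.
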
Reviewed-by: Dotan Barak <dotanb@mellanox.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
Parent ef6d4ccd
ifeq ($(CONFIG_RDMA),y)
obj-$(CONFIG_PCI) += rdma_utils.o rdma_backend.o rdma_rm.o
obj-$(CONFIG_PCI) += vmw/pvrdma_dev_ring.o vmw/pvrdma_cmd.o \
vmw/pvrdma_qp_ops.o
endif
/*
* QEMU VMWARE paravirtual RDMA device definitions
*
* Copyright (C) 2018 Oracle
* Copyright (C) 2018 Red Hat Inc
*
* Authors:
* Yuval Shaia <yuval.shaia@oracle.com>
* Marcel Apfelbaum <marcel@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#ifndef PVRDMA_PVRDMA_H
#define PVRDMA_PVRDMA_H
#include <hw/pci/pci.h>
#include <hw/pci/msix.h>
#include "../rdma_backend_defs.h"
#include "../rdma_rm_defs.h"
#include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h>
#include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h>
#include "pvrdma_dev_ring.h"
/* BARs */
#define RDMA_MSIX_BAR_IDX 0
#define RDMA_REG_BAR_IDX 1
#define RDMA_UAR_BAR_IDX 2
#define RDMA_BAR0_MSIX_SIZE (16 * 1024)
#define RDMA_BAR1_REGS_SIZE 256
#define RDMA_BAR2_UAR_SIZE (0x1000 * MAX_UCS) /* each user context gets one page */
/* MSIX */
#define RDMA_MAX_INTRS 3
#define RDMA_MSIX_TABLE 0x0000
#define RDMA_MSIX_PBA 0x2000
/* Interrupts Vectors */
#define INTR_VEC_CMD_RING 0
#define INTR_VEC_CMD_ASYNC_EVENTS 1
#define INTR_VEC_CMD_COMPLETION_Q 2
/* HW attributes */
#define PVRDMA_HW_NAME "pvrdma"
#define PVRDMA_HW_VERSION 17
#define PVRDMA_FW_VERSION 14
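/*
 * State of the Device Shared Region (DSR): a guest page, whose address the
 * guest driver hands to the device, holding the command request/response
 * slots and the ring state for the async-event and completion-notification
 * rings.
 */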
typedef struct DSRInfo {
dma_addr_t dma;
struct pvrdma_device_shared_region *dsr;
union pvrdma_cmd_req *req;
union pvrdma_cmd_resp *rsp;
struct pvrdma_ring *async_ring_state;
PvrdmaRing async;
struct pvrdma_ring *cq_ring_state;
PvrdmaRing cq;
} DSRInfo;
typedef struct PVRDMADev {
PCIDevice parent_obj;
MemoryRegion msix;
MemoryRegion regs;
uint32_t regs_data[RDMA_BAR1_REGS_SIZE];
MemoryRegion uar;
uint32_t uar_data[RDMA_BAR2_UAR_SIZE];
DSRInfo dsr_info;
int interrupt_mask;
struct ibv_device_attr dev_attr;
uint64_t node_guid;
char *backend_device_name;
uint8_t backend_gid_idx;
uint8_t backend_port_num;
RdmaBackendDev backend_dev;
RdmaDeviceResources rdma_dev_res;
} PVRDMADev;
#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
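/*
 * BAR 1 registers are kept in regs_data[] as 32-bit words; 'addr' is the
 * byte offset into the BAR, hence the shift by 2 below.
 */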
static inline int get_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t *val)
{
int idx = addr >> 2;
if (idx >= RDMA_BAR1_REGS_SIZE) {
return -EINVAL;
}
*val = dev->regs_data[idx];
return 0;
}
static inline int set_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t val)
{
int idx = addr >> 2;
if (idx >= RDMA_BAR1_REGS_SIZE) {
return -EINVAL;
}
dev->regs_data[idx] = val;
return 0;
}
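/* Raise the given MSI-X vector unless the guest has masked device interrupts. */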
static inline void post_interrupt(PVRDMADev *dev, unsigned vector)
{
PCIDevice *pci_dev = PCI_DEVICE(dev);
if (likely(!dev->interrupt_mask)) {
msix_notify(pci_dev, vector);
}
}
int execute_command(PVRDMADev *dev);
#endif
/*
* QEMU paravirtual RDMA - Device rings
*
* Copyright (C) 2018 Oracle
* Copyright (C) 2018 Red Hat Inc
*
* Authors:
* Yuval Shaia <yuval.shaia@oracle.com>
* Marcel Apfelbaum <marcel@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include <qemu/osdep.h>
#include <hw/pci/pci.h>
#include <cpu.h>
#include "../rdma_utils.h"
#include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h>
#include "pvrdma_dev_ring.h"
int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev,
struct pvrdma_ring *ring_state, uint32_t max_elems,
size_t elem_sz, dma_addr_t *tbl, dma_addr_t npages)
{
int i;
int rc = 0;
strncpy(ring->name, name, MAX_RING_NAME_SZ);
ring->name[MAX_RING_NAME_SZ - 1] = 0;
pr_dbg("Initializing %s ring\n", ring->name);
ring->dev = dev;
ring->ring_state = ring_state;
ring->max_elems = max_elems;
ring->elem_sz = elem_sz;
pr_dbg("ring->elem_sz=%ld\n", ring->elem_sz);
pr_dbg("npages=%ld\n", npages);
/* TODO: Give a moment to think if we want to redo driver settings
atomic_set(&ring->ring_state->prod_tail, 0);
atomic_set(&ring->ring_state->cons_head, 0);
*/
ring->npages = npages;
ring->pages = g_malloc(npages * sizeof(void *));
for (i = 0; i < npages; i++) {
if (!tbl[i]) {
pr_err("npages=%ld but tbl[%d] is NULL\n", (long)npages, i);
continue;
}
ring->pages[i] = rdma_pci_dma_map(dev, tbl[i], TARGET_PAGE_SIZE);
if (!ring->pages[i]) {
rc = -ENOMEM;
pr_dbg("Failed to map to page %d\n", i);
goto out_free;
}
memset(ring->pages[i], 0, TARGET_PAGE_SIZE);
}
goto out;
out_free:
while (i--) {
rdma_pci_dma_unmap(dev, ring->pages[i], TARGET_PAGE_SIZE);
}
g_free(ring->pages);
out:
return rc;
}
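/*
 * Return a pointer to the element at the consumer index, or NULL if the
 * ring is empty; the caller advances the index with pvrdma_ring_read_inc().
 */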
void *pvrdma_ring_next_elem_read(PvrdmaRing *ring)
{
unsigned int idx = 0, offset;
/*
pr_dbg("%s: t=%d, h=%d\n", ring->name, ring->ring_state->prod_tail,
ring->ring_state->cons_head);
*/
if (!pvrdma_idx_ring_has_data(ring->ring_state, ring->max_elems, &idx)) {
pr_dbg("No more data in ring\n");
return NULL;
}
offset = idx * ring->elem_sz;
/*
pr_dbg("idx=%d\n", idx);
pr_dbg("offset=%d\n", offset);
*/
return ring->pages[offset / TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE);
}
void pvrdma_ring_read_inc(PvrdmaRing *ring)
{
pvrdma_idx_ring_inc(&ring->ring_state->cons_head, ring->max_elems);
/*
pr_dbg("%s: t=%d, h=%d, m=%ld\n", ring->name,
ring->ring_state->prod_tail, ring->ring_state->cons_head,
ring->max_elems);
*/
}
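/*
 * Return a pointer to the free slot at the producer index, or NULL if the
 * ring is full; the caller publishes the element with pvrdma_ring_write_inc().
 */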
void *pvrdma_ring_next_elem_write(PvrdmaRing *ring)
{
unsigned int idx, offset, tail;
/*
pr_dbg("%s: t=%d, h=%d\n", ring->name, ring->ring_state->prod_tail,
ring->ring_state->cons_head);
*/
if (!pvrdma_idx_ring_has_space(ring->ring_state, ring->max_elems, &tail)) {
pr_dbg("CQ is full\n");
return NULL;
}
idx = pvrdma_idx(&ring->ring_state->prod_tail, ring->max_elems);
/* TODO: tail == idx */
offset = idx * ring->elem_sz;
return ring->pages[offset / TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE);
}
void pvrdma_ring_write_inc(PvrdmaRing *ring)
{
pvrdma_idx_ring_inc(&ring->ring_state->prod_tail, ring->max_elems);
/*
pr_dbg("%s: t=%d, h=%d, m=%ld\n", ring->name,
ring->ring_state->prod_tail, ring->ring_state->cons_head,
ring->max_elems);
*/
}
void pvrdma_ring_free(PvrdmaRing *ring)
{
if (!ring) {
return;
}
if (!ring->pages) {
return;
}
pr_dbg("ring->npages=%d\n", ring->npages);
while (ring->npages--) {
rdma_pci_dma_unmap(ring->dev, ring->pages[ring->npages],
TARGET_PAGE_SIZE);
}
g_free(ring->pages);
ring->pages = NULL;
}
/*
* QEMU VMWARE paravirtual RDMA ring utilities
*
* Copyright (C) 2018 Oracle
* Copyright (C) 2018 Red Hat Inc
*
* Authors:
* Yuval Shaia <yuval.shaia@oracle.com>
* Marcel Apfelbaum <marcel@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#ifndef PVRDMA_DEV_RING_H
#define PVRDMA_DEV_RING_H
#include <qemu/typedefs.h>
#define MAX_RING_NAME_SZ 32
typedef struct PvrdmaRing {
char name[MAX_RING_NAME_SZ];
PCIDevice *dev;
uint32_t max_elems;
size_t elem_sz;
struct pvrdma_ring *ring_state; /* used only for unmap */
int npages;
void **pages;
} PvrdmaRing;
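/*
 * Typical consumer loop (a sketch, mirroring the use in pvrdma_qp_send):
 *
 *     void *elem = pvrdma_ring_next_elem_read(ring);
 *     while (elem) {
 *         ... process element ...
 *         pvrdma_ring_read_inc(ring);
 *         elem = pvrdma_ring_next_elem_read(ring);
 *     }
 */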
int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev,
struct pvrdma_ring *ring_state, uint32_t max_elems,
size_t elem_sz, dma_addr_t *tbl, dma_addr_t npages);
void *pvrdma_ring_next_elem_read(PvrdmaRing *ring);
void pvrdma_ring_read_inc(PvrdmaRing *ring);
void *pvrdma_ring_next_elem_write(PvrdmaRing *ring);
void pvrdma_ring_write_inc(PvrdmaRing *ring);
void pvrdma_ring_free(PvrdmaRing *ring);
#endif
/*
* QEMU paravirtual RDMA - QP implementation
*
* Copyright (C) 2018 Oracle
* Copyright (C) 2018 Red Hat Inc
*
* Authors:
* Yuval Shaia <yuval.shaia@oracle.com>
* Marcel Apfelbaum <marcel@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include <qemu/osdep.h>
#include "../rdma_utils.h"
#include "../rdma_rm.h"
#include "../rdma_backend.h"
#include "pvrdma.h"
#include <standard-headers/rdma/vmw_pvrdma-abi.h>
#include "pvrdma_qp_ops.h"
typedef struct CompHandlerCtx {
PVRDMADev *dev;
uint32_t cq_handle;
struct pvrdma_cqe cqe;
} CompHandlerCtx;
/* Send Queue WQE */
typedef struct PvrdmaSqWqe {
struct pvrdma_sq_wqe_hdr hdr;
struct pvrdma_sge sge[];
} PvrdmaSqWqe;
/* Recv Queue WQE */
typedef struct PvrdmaRqWqe {
struct pvrdma_rq_wqe_hdr hdr;
struct pvrdma_sge sge[];
} PvrdmaRqWqe;
/*
* 1. Put CQE on send CQ ring
* 2. Put CQ number on dsr completion ring
* 3. Interrupt host
*/
static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
struct pvrdma_cqe *cqe)
{
struct pvrdma_cqe *cqe1;
struct pvrdma_cqne *cqne;
PvrdmaRing *ring;
RdmaRmCQ *cq = rdma_rm_get_cq(&dev->rdma_dev_res, cq_handle);
if (unlikely(!cq)) {
pr_dbg("Invalid cqn %d\n", cq_handle);
return -EINVAL;
}
ring = (PvrdmaRing *)cq->opaque;
pr_dbg("ring=%p\n", ring);
/* Step #1: Put CQE on CQ ring */
pr_dbg("Writing CQE\n");
cqe1 = pvrdma_ring_next_elem_write(ring);
if (unlikely(!cqe1)) {
return -EINVAL;
}
cqe1->wr_id = cqe->wr_id;
cqe1->qp = cqe->qp;
cqe1->opcode = cqe->opcode;
cqe1->status = cqe->status;
cqe1->vendor_err = cqe->vendor_err;
pvrdma_ring_write_inc(ring);
/* Step #2: Put CQ number on dsr completion ring */
pr_dbg("Writing CQNE\n");
cqne = pvrdma_ring_next_elem_write(&dev->dsr_info.cq);
if (unlikely(!cqne)) {
return -EINVAL;
}
cqne->info = cq_handle;
pvrdma_ring_write_inc(&dev->dsr_info.cq);
pr_dbg("cq->notify=%d\n", cq->notify);
if (cq->notify) {
cq->notify = false;
post_interrupt(dev, INTR_VEC_CMD_COMPLETION_Q);
}
return 0;
}
static void pvrdma_qp_ops_comp_handler(int status, unsigned int vendor_err,
void *ctx)
{
CompHandlerCtx *comp_ctx = (CompHandlerCtx *)ctx;
pr_dbg("cq_handle=%d\n", comp_ctx->cq_handle);
pr_dbg("wr_id=%ld\n", comp_ctx->cqe.wr_id);
pr_dbg("status=%d\n", status);
pr_dbg("vendor_err=0x%x\n", vendor_err);
comp_ctx->cqe.status = status;
comp_ctx->cqe.vendor_err = vendor_err;
pvrdma_post_cqe(comp_ctx->dev, comp_ctx->cq_handle, &comp_ctx->cqe);
g_free(ctx);
}
void pvrdma_qp_ops_fini(void)
{
rdma_backend_unregister_comp_handler();
}
int pvrdma_qp_ops_init(void)
{
rdma_backend_register_comp_handler(pvrdma_qp_ops_comp_handler);
return 0;
}
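/*
 * Drain the QP's send ring: for every WQE, prepare a completion context
 * and pass the work request (SGEs plus UD address info) to the backend.
 */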
int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
{
RdmaRmQP *qp;
PvrdmaSqWqe *wqe;
PvrdmaRing *ring;
pr_dbg("qp_handle=%d\n", qp_handle);
qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
if (unlikely(!qp)) {
return -EINVAL;
}
ring = (PvrdmaRing *)qp->opaque;
pr_dbg("sring=%p\n", ring);
wqe = (struct PvrdmaSqWqe *)pvrdma_ring_next_elem_read(ring);
while (wqe) {
CompHandlerCtx *comp_ctx;
pr_dbg("wr_id=%ld\n", wqe->hdr.wr_id);
/* Prepare CQE */
comp_ctx = g_malloc(sizeof(CompHandlerCtx));
comp_ctx->dev = dev;
comp_ctx->cq_handle = qp->send_cq_handle;
comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
comp_ctx->cqe.qp = qp_handle;
comp_ctx->cqe.opcode = wqe->hdr.opcode;
rdma_backend_post_send(&dev->backend_dev, &qp->backend_qp, qp->qp_type,
(struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge,
(union ibv_gid *)wqe->hdr.wr.ud.av.dgid,
wqe->hdr.wr.ud.remote_qpn,
wqe->hdr.wr.ud.remote_qkey, comp_ctx);
pvrdma_ring_read_inc(ring);
wqe = pvrdma_ring_next_elem_read(ring);
}
return 0;
}
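/*
 * Drain the QP's receive ring (the second of the two rings behind
 * qp->opaque) and post each receive buffer to the backend.
 */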
int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
{
RdmaRmQP *qp;
PvrdmaRqWqe *wqe;
PvrdmaRing *ring;
pr_dbg("qp_handle=%d\n", qp_handle);
qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
if (unlikely(!qp)) {
return -EINVAL;
}
ring = &((PvrdmaRing *)qp->opaque)[1];
pr_dbg("rring=%p\n", ring);
wqe = (struct PvrdmaRqWqe *)pvrdma_ring_next_elem_read(ring);
while (wqe) {
CompHandlerCtx *comp_ctx;
pr_dbg("wr_id=%ld\n", wqe->hdr.wr_id);
/* Prepare CQE */
comp_ctx = g_malloc(sizeof(CompHandlerCtx));
comp_ctx->dev = dev;
comp_ctx->cq_handle = qp->recv_cq_handle;
comp_ctx->cqe.qp = qp_handle;
comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
rdma_backend_post_recv(&dev->backend_dev, &dev->rdma_dev_res,
&qp->backend_qp, qp->qp_type,
(struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge,
comp_ctx);
pvrdma_ring_read_inc(ring);
wqe = pvrdma_ring_next_elem_read(ring);
}
return 0;
}
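/* Let the backend poll the completion queue that backs this CQ handle. */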
void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle)
{
RdmaRmCQ *cq;
cq = rdma_rm_get_cq(dev_res, cq_handle);
if (!cq) {
pr_dbg("Invalid CQ# %d\n", cq_handle);
return;
}
rdma_backend_poll_cq(dev_res, &cq->backend_cq);
}
/*
* QEMU VMWARE paravirtual RDMA QP Operations
*
* Copyright (C) 2018 Oracle
* Copyright (C) 2018 Red Hat Inc
*
* Authors:
* Yuval Shaia <yuval.shaia@oracle.com>
* Marcel Apfelbaum <marcel@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#ifndef PVRDMA_QP_H
#define PVRDMA_QP_H
#include "pvrdma.h"
int pvrdma_qp_ops_init(void);
void pvrdma_qp_ops_fini(void);
int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle);
int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle);
void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle);
#endif