提交 e0f30bac 编写于 作者: R Ramkrishna Vepa 提交者: Roland Dreier

IB/qib: Add optional NUMA affinity

This patch adds context relative numa affinity conditioned on the
module parameter numa_aware. The qib_ctxtdata has an additional
node_id member and qib_create_ctxtdata() has an addition node_id
parameter.

The allocations within the hdr queue and eager queue setup routines
now take this additional member and adjust allocations as necesary.
PSM will pass the either current numa node or the node closest to the
HCA depending on numa_aware. Verbs will always use the node closest to
the HCA.
Reviewed-by: NDean Luick <dean.luick@intel.com>
Signed-off-by: NRamkrishna Vepa <ramkrishna.vepa@intel.com>
Signed-off-by: NVinit Agnihotri <vinit.abhay.agnihotri@intel.com>
Signed-off-by: NMike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: NRoland Dreier <roland@purestorage.com>
上级 ab4a13d6
......@@ -154,6 +154,8 @@ struct qib_ctxtdata {
*/
/* instead of calculating it */
unsigned ctxt;
/* local node of context */
int node_id;
/* non-zero if ctxt is being shared. */
u16 subctxt_cnt;
/* non-zero if ctxt is being shared. */
......@@ -1088,6 +1090,8 @@ struct qib_devdata {
u16 psxmitwait_check_rate;
/* high volume overflow errors defered to tasklet */
struct tasklet_struct error_tasklet;
int assigned_node_id; /* NUMA node closest to HCA */
};
/* hol_state values */
......@@ -1167,7 +1171,7 @@ int qib_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *);
int qib_setup_eagerbufs(struct qib_ctxtdata *);
void qib_set_ctxtcnt(struct qib_devdata *);
int qib_create_ctxts(struct qib_devdata *dd);
struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32);
struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int);
void qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8);
void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *);
......@@ -1458,6 +1462,7 @@ extern unsigned qib_n_krcv_queues;
extern unsigned qib_sdma_fetch_arb;
extern unsigned qib_compat_ddr_negotiate;
extern int qib_special_trigger;
extern unsigned qib_numa_aware;
extern struct mutex qib_mutex;
......
......@@ -1263,8 +1263,12 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
struct qib_ctxtdata *rcd;
void *ptmp = NULL;
int ret;
int numa_id;
rcd = qib_create_ctxtdata(ppd, ctxt);
numa_id = qib_numa_aware ? numa_node_id() :
dd->assigned_node_id;
rcd = qib_create_ctxtdata(ppd, ctxt, numa_id);
/*
* Allocate memory for use in qib_tid_update() at open to
......
......@@ -67,6 +67,11 @@ ushort qib_cfgctxts;
module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO);
MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use");
unsigned qib_numa_aware;
module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO);
MODULE_PARM_DESC(numa_aware,
"0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process");
/*
* If set, do not write to any regs if avoidable, hack to allow
* check for deranged default register values.
......@@ -124,6 +129,11 @@ int qib_create_ctxts(struct qib_devdata *dd)
{
unsigned i;
int ret;
int local_node_id = pcibus_to_node(dd->pcidev->bus);
if (local_node_id < 0)
local_node_id = numa_node_id();
dd->assigned_node_id = local_node_id;
/*
* Allocate full ctxtcnt array, rather than just cfgctxts, because
......@@ -146,7 +156,8 @@ int qib_create_ctxts(struct qib_devdata *dd)
continue;
ppd = dd->pport + (i % dd->num_pports);
rcd = qib_create_ctxtdata(ppd, i);
rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id);
if (!rcd) {
qib_dev_err(dd,
"Unable to allocate ctxtdata for Kernel ctxt, failing\n");
......@@ -164,14 +175,16 @@ int qib_create_ctxts(struct qib_devdata *dd)
/*
* Common code for user and kernel context setup.
*/
struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt)
struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt,
int node_id)
{
struct qib_devdata *dd = ppd->dd;
struct qib_ctxtdata *rcd;
rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id);
if (rcd) {
INIT_LIST_HEAD(&rcd->qp_wait_list);
rcd->node_id = node_id;
rcd->ppd = ppd;
rcd->dd = dd;
rcd->cnt = 1;
......@@ -1524,6 +1537,7 @@ static void qib_remove_one(struct pci_dev *pdev)
int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
{
unsigned amt;
int old_node_id;
if (!rcd->rcvhdrq) {
dma_addr_t phys_hdrqtail;
......@@ -1533,9 +1547,13 @@ int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
sizeof(u32), PAGE_SIZE);
gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
GFP_USER : GFP_KERNEL;
old_node_id = dev_to_node(&dd->pcidev->dev);
set_dev_node(&dd->pcidev->dev, rcd->node_id);
rcd->rcvhdrq = dma_alloc_coherent(
&dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
gfp_flags | __GFP_COMP);
set_dev_node(&dd->pcidev->dev, old_node_id);
if (!rcd->rcvhdrq) {
qib_dev_err(dd,
......@@ -1551,9 +1569,11 @@ int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
}
if (!(dd->flags & QIB_NODMA_RTAIL)) {
set_dev_node(&dd->pcidev->dev, rcd->node_id);
rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(
&dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
gfp_flags);
set_dev_node(&dd->pcidev->dev, old_node_id);
if (!rcd->rcvhdrtail_kvaddr)
goto bail_free;
rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
......@@ -1597,6 +1617,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
size_t size;
gfp_t gfp_flags;
int old_node_id;
/*
* GFP_USER, but without GFP_FS, so buffer cache can be
......@@ -1615,25 +1636,29 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
size = rcd->rcvegrbuf_size;
if (!rcd->rcvegrbuf) {
rcd->rcvegrbuf =
kzalloc(chunk * sizeof(rcd->rcvegrbuf[0]),
GFP_KERNEL);
kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]),
GFP_KERNEL, rcd->node_id);
if (!rcd->rcvegrbuf)
goto bail;
}
if (!rcd->rcvegrbuf_phys) {
rcd->rcvegrbuf_phys =
kmalloc(chunk * sizeof(rcd->rcvegrbuf_phys[0]),
GFP_KERNEL);
kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]),
GFP_KERNEL, rcd->node_id);
if (!rcd->rcvegrbuf_phys)
goto bail_rcvegrbuf;
}
for (e = 0; e < rcd->rcvegrbuf_chunks; e++) {
if (rcd->rcvegrbuf[e])
continue;
old_node_id = dev_to_node(&dd->pcidev->dev);
set_dev_node(&dd->pcidev->dev, rcd->node_id);
rcd->rcvegrbuf[e] =
dma_alloc_coherent(&dd->pcidev->dev, size,
&rcd->rcvegrbuf_phys[e],
gfp_flags);
set_dev_node(&dd->pcidev->dev, old_node_id);
if (!rcd->rcvegrbuf[e])
goto bail_rcvegrbuf_phys;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册