IB/qib: Add optional NUMA affinity
authorRamkrishna Vepa <ramkrishna.vepa@intel.com>
Tue, 28 May 2013 16:57:33 +0000 (12:57 -0400)
committerRoland Dreier <roland@purestorage.com>
Sat, 22 Jun 2013 00:19:48 +0000 (17:19 -0700)
This patch adds context relative numa affinity conditioned on the
module parameter numa_aware. The qib_ctxtdata has an additional
node_id member and qib_create_ctxtdata() has an additional node_id
parameter.

The allocations within the hdr queue and eager queue setup routines
now take this additional member and adjust allocations as necessary.
PSM will pass either the current numa node or the node closest to the
HCA depending on numa_aware. Verbs will always use the node closest to
the HCA.

Reviewed-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Ramkrishna Vepa <ramkrishna.vepa@intel.com>
Signed-off-by: Vinit Agnihotri <vinit.abhay.agnihotri@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
drivers/infiniband/hw/qib/qib.h
drivers/infiniband/hw/qib/qib_file_ops.c
drivers/infiniband/hw/qib/qib_init.c

index cecbd43f9212522a204f4b5e4ccfc73b2092f0d4..2ee82e6550c77e0fbed9154fb1c35be811dbfa56 100644 (file)
@@ -154,6 +154,8 @@ struct qib_ctxtdata {
         */
        /* instead of calculating it */
        unsigned ctxt;
+       /* local node of context */
+       int node_id;
        /* non-zero if ctxt is being shared. */
        u16 subctxt_cnt;
        /* non-zero if ctxt is being shared. */
@@ -1088,6 +1090,8 @@ struct qib_devdata {
        u16 psxmitwait_check_rate;
        /* high volume overflow errors defered to tasklet */
        struct tasklet_struct error_tasklet;
+
+       int assigned_node_id; /* NUMA node closest to HCA */
 };
 
 /* hol_state values */
@@ -1167,7 +1171,7 @@ int qib_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *);
 int qib_setup_eagerbufs(struct qib_ctxtdata *);
 void qib_set_ctxtcnt(struct qib_devdata *);
 int qib_create_ctxts(struct qib_devdata *dd);
-struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32);
+struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int);
 void qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8);
 void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *);
 
@@ -1458,6 +1462,7 @@ extern unsigned qib_n_krcv_queues;
 extern unsigned qib_sdma_fetch_arb;
 extern unsigned qib_compat_ddr_negotiate;
 extern int qib_special_trigger;
+extern unsigned qib_numa_aware;
 
 extern struct mutex qib_mutex;
 
index b56c9428f3c5f5fbf574b0b9570ec2caa02cbee3..65b2fc3f957c8b50b3ba4a3991be669bae60813b 100644 (file)
@@ -1263,8 +1263,12 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
        struct qib_ctxtdata *rcd;
        void *ptmp = NULL;
        int ret;
+       int numa_id;
 
-       rcd = qib_create_ctxtdata(ppd, ctxt);
+       numa_id = qib_numa_aware ? numa_node_id() :
+               dd->assigned_node_id;
+
+       rcd = qib_create_ctxtdata(ppd, ctxt, numa_id);
 
        /*
         * Allocate memory for use in qib_tid_update() at open to
index 4b64c885fa0de912926da8cc96d8dc16629bf73e..e02217b5c46db134e62039e0f837241dfff796a9 100644 (file)
@@ -67,6 +67,11 @@ ushort qib_cfgctxts;
 module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO);
 MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use");
 
+unsigned qib_numa_aware;
+module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO);
+MODULE_PARM_DESC(numa_aware,
+       "0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process");
+
 /*
  * If set, do not write to any regs if avoidable, hack to allow
  * check for deranged default register values.
@@ -124,6 +129,11 @@ int qib_create_ctxts(struct qib_devdata *dd)
 {
        unsigned i;
        int ret;
+       int local_node_id = pcibus_to_node(dd->pcidev->bus);
+
+       if (local_node_id < 0)
+               local_node_id = numa_node_id();
+       dd->assigned_node_id = local_node_id;
 
        /*
         * Allocate full ctxtcnt array, rather than just cfgctxts, because
@@ -146,7 +156,8 @@ int qib_create_ctxts(struct qib_devdata *dd)
                        continue;
 
                ppd = dd->pport + (i % dd->num_pports);
-               rcd = qib_create_ctxtdata(ppd, i);
+
+               rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id);
                if (!rcd) {
                        qib_dev_err(dd,
                                "Unable to allocate ctxtdata for Kernel ctxt, failing\n");
@@ -164,14 +175,16 @@ done:
 /*
  * Common code for user and kernel context setup.
  */
-struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt)
+struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt,
+       int node_id)
 {
        struct qib_devdata *dd = ppd->dd;
        struct qib_ctxtdata *rcd;
 
-       rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
+       rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id);
        if (rcd) {
                INIT_LIST_HEAD(&rcd->qp_wait_list);
+               rcd->node_id = node_id;
                rcd->ppd = ppd;
                rcd->dd = dd;
                rcd->cnt = 1;
@@ -1524,6 +1537,7 @@ static void qib_remove_one(struct pci_dev *pdev)
 int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
 {
        unsigned amt;
+       int old_node_id;
 
        if (!rcd->rcvhdrq) {
                dma_addr_t phys_hdrqtail;
@@ -1533,9 +1547,13 @@ int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
                            sizeof(u32), PAGE_SIZE);
                gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
                        GFP_USER : GFP_KERNEL;
+
+               old_node_id = dev_to_node(&dd->pcidev->dev);
+               set_dev_node(&dd->pcidev->dev, rcd->node_id);
                rcd->rcvhdrq = dma_alloc_coherent(
                        &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
                        gfp_flags | __GFP_COMP);
+               set_dev_node(&dd->pcidev->dev, old_node_id);
 
                if (!rcd->rcvhdrq) {
                        qib_dev_err(dd,
@@ -1551,9 +1569,11 @@ int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
                }
 
                if (!(dd->flags & QIB_NODMA_RTAIL)) {
+                       set_dev_node(&dd->pcidev->dev, rcd->node_id);
                        rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(
                                &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
                                gfp_flags);
+                       set_dev_node(&dd->pcidev->dev, old_node_id);
                        if (!rcd->rcvhdrtail_kvaddr)
                                goto bail_free;
                        rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
@@ -1597,6 +1617,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
        unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
        size_t size;
        gfp_t gfp_flags;
+       int old_node_id;
 
        /*
         * GFP_USER, but without GFP_FS, so buffer cache can be
@@ -1615,25 +1636,29 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
        size = rcd->rcvegrbuf_size;
        if (!rcd->rcvegrbuf) {
                rcd->rcvegrbuf =
-                       kzalloc(chunk * sizeof(rcd->rcvegrbuf[0]),
-                               GFP_KERNEL);
+                       kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]),
+                               GFP_KERNEL, rcd->node_id);
                if (!rcd->rcvegrbuf)
                        goto bail;
        }
        if (!rcd->rcvegrbuf_phys) {
                rcd->rcvegrbuf_phys =
-                       kmalloc(chunk * sizeof(rcd->rcvegrbuf_phys[0]),
-                               GFP_KERNEL);
+                       kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]),
+                               GFP_KERNEL, rcd->node_id);
                if (!rcd->rcvegrbuf_phys)
                        goto bail_rcvegrbuf;
        }
        for (e = 0; e < rcd->rcvegrbuf_chunks; e++) {
                if (rcd->rcvegrbuf[e])
                        continue;
+
+               old_node_id = dev_to_node(&dd->pcidev->dev);
+               set_dev_node(&dd->pcidev->dev, rcd->node_id);
                rcd->rcvegrbuf[e] =
                        dma_alloc_coherent(&dd->pcidev->dev, size,
                                           &rcd->rcvegrbuf_phys[e],
                                           gfp_flags);
+               set_dev_node(&dd->pcidev->dev, old_node_id);
                if (!rcd->rcvegrbuf[e])
                        goto bail_rcvegrbuf_phys;
        }