nvmet-rdma: support max(16KB, PAGE_SIZE) inline data
author Steve Wise <swise@opengridcomputing.com>
Wed, 20 Jun 2018 14:15:10 +0000 (07:15 -0700)
committer Christoph Hellwig <hch@lst.de>
Mon, 23 Jul 2018 07:35:16 +0000 (09:35 +0200)
The patch enables inline data sizes using up to 4 recv SGEs, capping the
total size at max(16KB, PAGE_SIZE).  So on a 4K page system up to 16KB of
inline data is supported, and on a 64K page system a single page of 64KB
is supported.

We avoid > 0 order page allocations for the inline buffers by using
multiple recv sges, one for each page.  If the device cannot support
the configured inline data size due to lack of enough recv sges, then
log a warning and reduce the inline size.

Add a new configfs port attribute, called param_inline_data_size,
to allow configuring the size of inline data for a given nvmf port.
The maximum size allowed is still enforced by nvmet-rdma with
NVMET_RDMA_MAX_INLINE_DATA_SIZE, which is now max(16KB, PAGE_SIZE).
And the default size, if not specified via configfs, is still PAGE_SIZE.
This preserves the existing behavior, but allows larger inline sizes
for small page systems.  If the configured inline data size exceeds
NVMET_RDMA_MAX_INLINE_DATA_SIZE, a warning is logged and the size is
reduced.  If param_inline_data_size is set to 0, then inline data is
disabled for that nvmf port.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/configfs.c
drivers/nvme/target/core.c
drivers/nvme/target/discovery.c
drivers/nvme/target/nvmet.h
drivers/nvme/target/rdma.c

index e2c6f8b39388865f4dba18102d86542341721477..837bbdbfaa4bb288205178f658ea9bb46eaef29e 100644 (file)
@@ -268,14 +268,14 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
        id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
        if (ctrl->ops->has_keyed_sgls)
                id->sgls |= cpu_to_le32(1 << 2);
-       if (ctrl->ops->sqe_inline_size)
+       if (req->port->inline_data_size)
                id->sgls |= cpu_to_le32(1 << 20);
 
        strcpy(id->subnqn, ctrl->subsys->subsysnqn);
 
        /* Max command capsule size is sqe + single page of in-capsule data */
        id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
-                                 ctrl->ops->sqe_inline_size) / 16);
+                                 req->port->inline_data_size) / 16);
        /* Max response capsule size is cqe */
        id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
 
index fee56b3a23bc7d57cb186ab54fb9b920c6f2e4bd..3ba5ea5c4376a1c5db25777b2be7757a91c62b62 100644 (file)
@@ -218,6 +218,35 @@ static ssize_t nvmet_addr_trsvcid_store(struct config_item *item,
 
 CONFIGFS_ATTR(nvmet_, addr_trsvcid);
 
+static ssize_t nvmet_param_inline_data_size_show(struct config_item *item,
+               char *page)
+{
+       struct nvmet_port *port = to_nvmet_port(item);
+
+       return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size);
+}
+
+static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
+               const char *page, size_t count)
+{
+       struct nvmet_port *port = to_nvmet_port(item);
+       int ret;
+
+       if (port->enabled) {
+               pr_err("Cannot modify inline_data_size while port enabled\n");
+               pr_err("Disable the port before modifying\n");
+               return -EACCES;
+       }
+       ret = kstrtoint(page, 0, &port->inline_data_size);
+       if (ret) {
+               pr_err("Invalid value '%s' for inline_data_size\n", page);
+               return -EINVAL;
+       }
+       return count;
+}
+
+CONFIGFS_ATTR(nvmet_, param_inline_data_size);
+
 static ssize_t nvmet_addr_trtype_show(struct config_item *item,
                char *page)
 {
@@ -903,6 +932,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
        &nvmet_attr_addr_traddr,
        &nvmet_attr_addr_trsvcid,
        &nvmet_attr_addr_trtype,
+       &nvmet_attr_param_inline_data_size,
        NULL,
 };
 
@@ -932,6 +962,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
        INIT_LIST_HEAD(&port->entry);
        INIT_LIST_HEAD(&port->subsystems);
        INIT_LIST_HEAD(&port->referrals);
+       port->inline_data_size = -1;    /* < 0 == let the transport choose */
 
        port->disc_addr.portid = cpu_to_le16(portid);
        config_group_init_type_name(&port->group, name, &nvmet_port_type);
index 96eafbd419e7364c4d00a509de360ba418613a1d..ddd85715a00ab4e5ec5af1f04521e10e47f9432a 100644 (file)
@@ -242,6 +242,10 @@ int nvmet_enable_port(struct nvmet_port *port)
                return ret;
        }
 
+       /* If the transport didn't set inline_data_size, then disable it. */
+       if (port->inline_data_size < 0)
+               port->inline_data_size = 0;
+
        port->enabled = true;
        return 0;
 }
index 08656b849bd6efdd4762a930b2a2bfa895da3292..eae29f493a0748d5d4daf5e2941a9be485e57f9c 100644 (file)
@@ -171,7 +171,7 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req)
        id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
        if (ctrl->ops->has_keyed_sgls)
                id->sgls |= cpu_to_le32(1 << 2);
-       if (ctrl->ops->sqe_inline_size)
+       if (req->port->inline_data_size)
                id->sgls |= cpu_to_le32(1 << 20);
 
        strcpy(id->subnqn, ctrl->subsys->subsysnqn);
index 5efb98ec95df8da77dd41599af624c9a51124500..68899385540260b68b6a659975fc2fb53c239f1a 100644 (file)
@@ -117,6 +117,7 @@ struct nvmet_port {
        struct list_head                referrals;
        void                            *priv;
        bool                            enabled;
+       int                             inline_data_size;
 };
 
 static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
@@ -226,7 +227,6 @@ struct nvmet_req;
 struct nvmet_fabrics_ops {
        struct module *owner;
        unsigned int type;
-       unsigned int sqe_inline_size;
        unsigned int msdbd;
        bool has_keyed_sgls : 1;
        void (*queue_response)(struct nvmet_req *req);
index 52e0c5d579a7aad180472faa0d53cd106e35c615..2106ae2ec17738da07ebf6a78855525ec7dc3046 100644 (file)
 #include "nvmet.h"
 
 /*
- * We allow up to a page of inline data to go with the SQE
+ * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
  */
-#define NVMET_RDMA_INLINE_DATA_SIZE    PAGE_SIZE
+#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE    PAGE_SIZE
+#define NVMET_RDMA_MAX_INLINE_SGE              4
+#define NVMET_RDMA_MAX_INLINE_DATA_SIZE                max_t(int, SZ_16K, PAGE_SIZE)
 
 struct nvmet_rdma_cmd {
-       struct ib_sge           sge[2];
+       struct ib_sge           sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
        struct ib_cqe           cqe;
        struct ib_recv_wr       wr;
-       struct scatterlist      inline_sg;
-       struct page             *inline_page;
+       struct scatterlist      inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
        struct nvme_command     *nvme_cmd;
        struct nvmet_rdma_queue *queue;
 };
@@ -116,6 +117,8 @@ struct nvmet_rdma_device {
        size_t                  srq_size;
        struct kref             ref;
        struct list_head        entry;
+       int                     inline_data_size;
+       int                     inline_page_count;
 };
 
 static bool nvmet_rdma_use_srq;
@@ -138,6 +141,11 @@ static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
 
 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
 
+static int num_pages(int len)
+{
+       return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
+}
+
 /* XXX: really should move to a generic header sooner or later.. */
 static inline u32 get_unaligned_le24(const u8 *p)
 {
@@ -184,6 +192,71 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
        spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
 }
 
+static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
+                               struct nvmet_rdma_cmd *c)
+{
+       struct scatterlist *sg;
+       struct ib_sge *sge;
+       int i;
+
+       if (!ndev->inline_data_size)
+               return;
+
+       sg = c->inline_sg;
+       sge = &c->sge[1];
+
+       for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
+               if (sge->length)
+                       ib_dma_unmap_page(ndev->device, sge->addr,
+                                       sge->length, DMA_FROM_DEVICE);
+               if (sg_page(sg))
+                       __free_page(sg_page(sg));
+       }
+}
+
+static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
+                               struct nvmet_rdma_cmd *c)
+{
+       struct scatterlist *sg;
+       struct ib_sge *sge;
+       struct page *pg;
+       int len;
+       int i;
+
+       if (!ndev->inline_data_size)
+               return 0;
+
+       sg = c->inline_sg;
+       sg_init_table(sg, ndev->inline_page_count);
+       sge = &c->sge[1];
+       len = ndev->inline_data_size;
+
+       for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
+               pg = alloc_page(GFP_KERNEL);
+               if (!pg)
+                       goto out_err;
+               sg_assign_page(sg, pg);
+               sge->addr = ib_dma_map_page(ndev->device,
+                       pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+               if (ib_dma_mapping_error(ndev->device, sge->addr))
+                       goto out_err;
+               sge->length = min_t(int, len, PAGE_SIZE);
+               sge->lkey = ndev->pd->local_dma_lkey;
+               len -= sge->length;
+       }
+
+       return 0;
+out_err:
+       for (; i >= 0; i--, sg--, sge--) {
+               if (sge->length)
+                       ib_dma_unmap_page(ndev->device, sge->addr,
+                                       sge->length, DMA_FROM_DEVICE);
+               if (sg_page(sg))
+                       __free_page(sg_page(sg));
+       }
+       return -ENOMEM;
+}
+
 static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
                        struct nvmet_rdma_cmd *c, bool admin)
 {
@@ -200,33 +273,17 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
        c->sge[0].length = sizeof(*c->nvme_cmd);
        c->sge[0].lkey = ndev->pd->local_dma_lkey;
 
-       if (!admin) {
-               c->inline_page = alloc_pages(GFP_KERNEL,
-                               get_order(NVMET_RDMA_INLINE_DATA_SIZE));
-               if (!c->inline_page)
-                       goto out_unmap_cmd;
-               c->sge[1].addr = ib_dma_map_page(ndev->device,
-                               c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE,
-                               DMA_FROM_DEVICE);
-               if (ib_dma_mapping_error(ndev->device, c->sge[1].addr))
-                       goto out_free_inline_page;
-               c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE;
-               c->sge[1].lkey = ndev->pd->local_dma_lkey;
-       }
+       if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
+               goto out_unmap_cmd;
 
        c->cqe.done = nvmet_rdma_recv_done;
 
        c->wr.wr_cqe = &c->cqe;
        c->wr.sg_list = c->sge;
-       c->wr.num_sge = admin ? 1 : 2;
+       c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
 
        return 0;
 
-out_free_inline_page:
-       if (!admin) {
-               __free_pages(c->inline_page,
-                               get_order(NVMET_RDMA_INLINE_DATA_SIZE));
-       }
 out_unmap_cmd:
        ib_dma_unmap_single(ndev->device, c->sge[0].addr,
                        sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
@@ -240,12 +297,8 @@ out:
 static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *c, bool admin)
 {
-       if (!admin) {
-               ib_dma_unmap_page(ndev->device, c->sge[1].addr,
-                               NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE);
-               __free_pages(c->inline_page,
-                               get_order(NVMET_RDMA_INLINE_DATA_SIZE));
-       }
+       if (!admin)
+               nvmet_rdma_free_inline_pages(ndev, c);
        ib_dma_unmap_single(ndev->device, c->sge[0].addr,
                                sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
        kfree(c->nvme_cmd);
@@ -429,7 +482,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
                                rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        }
 
-       if (rsp->req.sg != &rsp->cmd->inline_sg)
+       if (rsp->req.sg != rsp->cmd->inline_sg)
                sgl_free(rsp->req.sg);
 
        if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
@@ -529,10 +582,25 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
 static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
                u64 off)
 {
-       sg_init_table(&rsp->cmd->inline_sg, 1);
-       sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off);
-       rsp->req.sg = &rsp->cmd->inline_sg;
-       rsp->req.sg_cnt = 1;
+       int sg_count = num_pages(len);
+       struct scatterlist *sg;
+       int i;
+
+       sg = rsp->cmd->inline_sg;
+       for (i = 0; i < sg_count; i++, sg++) {
+               if (i < sg_count - 1)
+                       sg_unmark_end(sg);
+               else
+                       sg_mark_end(sg);
+               sg->offset = off;
+               sg->length = min_t(int, len, PAGE_SIZE - off);
+               len -= sg->length;
+               if (!i)
+                       off = 0;
+       }
+
+       rsp->req.sg = rsp->cmd->inline_sg;
+       rsp->req.sg_cnt = sg_count;
 }
 
 static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
@@ -544,7 +612,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
        if (!nvme_is_write(rsp->req.cmd))
                return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 
-       if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) {
+       if (off + len > rsp->queue->dev->inline_data_size) {
                pr_err("invalid inline data offset!\n");
                return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
        }
@@ -743,7 +811,7 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
        srq_size = 4095;        /* XXX: tune */
 
        srq_attr.attr.max_wr = srq_size;
-       srq_attr.attr.max_sge = 2;
+       srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
        srq_attr.attr.srq_limit = 0;
        srq_attr.srq_type = IB_SRQT_BASIC;
        srq = ib_create_srq(ndev->pd, &srq_attr);
@@ -793,7 +861,10 @@ static void nvmet_rdma_free_dev(struct kref *ref)
 static struct nvmet_rdma_device *
 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
 {
+       struct nvmet_port *port = cm_id->context;
        struct nvmet_rdma_device *ndev;
+       int inline_page_count;
+       int inline_sge_count;
        int ret;
 
        mutex_lock(&device_list_mutex);
@@ -807,6 +878,18 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
        if (!ndev)
                goto out_err;
 
+       inline_page_count = num_pages(port->inline_data_size);
+       inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
+                               cm_id->device->attrs.max_sge) - 1;
+       if (inline_page_count > inline_sge_count) {
+               pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
+                       port->inline_data_size, cm_id->device->name,
+                       inline_sge_count * PAGE_SIZE);
+               port->inline_data_size = inline_sge_count * PAGE_SIZE;
+               inline_page_count = inline_sge_count;
+       }
+       ndev->inline_data_size = port->inline_data_size;
+       ndev->inline_page_count = inline_page_count;
        ndev->device = cm_id->device;
        kref_init(&ndev->ref);
 
@@ -881,7 +964,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
        } else {
                /* +1 for drain */
                qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
-               qp_attr.cap.max_recv_sge = 2;
+               qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
        }
 
        ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
@@ -1379,6 +1462,15 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
                return -EINVAL;
        }
 
+       if (port->inline_data_size < 0) {
+               port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
+       } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
+               pr_warn("inline_data_size %u is too large, reducing to %u\n",
+                       port->inline_data_size,
+                       NVMET_RDMA_MAX_INLINE_DATA_SIZE);
+               port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
+       }
+
        ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
                        port->disc_addr.trsvcid, &addr);
        if (ret) {
@@ -1456,7 +1548,6 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .owner                  = THIS_MODULE,
        .type                   = NVMF_TRTYPE_RDMA,
-       .sqe_inline_size        = NVMET_RDMA_INLINE_DATA_SIZE,
        .msdbd                  = 1,
        .has_keyed_sgls         = 1,
        .add_port               = nvmet_rdma_add_port,