From d10bcf947a3ea240351a8182d71e4aa9c8ddba56 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 2 Apr 2019 14:52:52 -0500 Subject: [PATCH] RDMA/umem: Combine contiguous PAGE_SIZE regions in SGEs Combine contiguous regions of PAGE_SIZE pages into single scatter list entry while building the scatter table for a umem. This minimizes the number of the entries in the scatter list and reduces the DMA mapping overhead, particularly with the IOMMU. Set default max_seg_size in core for IB devices to 2G and do not combine if we exceed this limit. Also, purge npages in struct ib_umem as we now DMA map the umem SGL with sg_nents and npage computation is not needed. Drivers should now be using ib_umem_num_pages(), so fix the last stragglers. Move npages tracking to ib_umem_odp as ODP drivers still need it. Suggested-by: Jason Gunthorpe Reviewed-by: Michael J. Ruhl Reviewed-by: Ira Weiny Acked-by: Adit Ranadive Signed-off-by: Shiraz Saleem Tested-by: Gal Pressman Tested-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 3 + drivers/infiniband/core/umem.c | 101 +++++++++++++++---- drivers/infiniband/core/umem_odp.c | 4 +- drivers/infiniband/hw/mlx5/odp.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 11 +- include/rdma/ib_umem.h | 2 +- include/rdma/ib_umem_odp.h | 1 + 7 files changed, 95 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 2dbd04739ac6..0f98da17af23 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1089,6 +1089,9 @@ static void setup_dma_device(struct ib_device *device) WARN_ON_ONCE(!parent); device->dma_device = parent; } + /* Setup default max segment size for all IB devices */ + dma_set_max_seg_size(device->dma_device, SZ_2G); + } /* diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 89a7d57f9fa5..d31f5e386c7d 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -39,25 +39,22 @@ #include #include #include +#include #include #include "uverbs.h" - static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { - struct scatterlist *sg; + struct sg_page_iter sg_iter; struct page *page; - int i; if (umem->nmap > 0) - ib_dma_unmap_sg(dev, umem->sg_head.sgl, - umem->npages, + ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents, DMA_BIDIRECTIONAL); - for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { - - page = sg_page(sg); + for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { + page = sg_page_iter_page(&sg_iter); if (!PageDirty(page) && umem->writable && dirty) set_page_dirty_lock(page); put_page(page); @@ -66,6 +63,69 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d sg_free_table(&umem->sg_head); } +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table + * + * sg: current scatterlist entry + * page_list: array of npage struct page pointers + * npages: number of pages in page_list + * max_seg_sz: maximum segment size in bytes + * nents: [out] number of entries in the scatterlist + * + * Return new end of scatterlist + */ +static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg, + struct page **page_list, + unsigned long npages, + unsigned int max_seg_sz, + int *nents) +{ + unsigned long first_pfn; + unsigned long i = 0; + bool update_cur_sg = false; + bool first = !sg_page(sg); + + /* Check if new page_list is contiguous with end of previous page_list. + * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0. + */ + if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) == + page_to_pfn(page_list[0]))) + update_cur_sg = true; + + while (i != npages) { + unsigned long len; + struct page *first_page = page_list[i]; + + first_pfn = page_to_pfn(first_page); + + /* Compute the number of contiguous pages we have starting + * at i + */ + for (len = 0; i != npages && + first_pfn + len == page_to_pfn(page_list[i]); + len++) + i++; + + /* Squash N contiguous pages from page_list into current sge */ + if (update_cur_sg && + ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT))) { + sg_set_page(sg, sg_page(sg), + sg->length + (len << PAGE_SHIFT), 0); + update_cur_sg = false; + continue; + } + + /* Squash N contiguous pages into next sge or first sge */ + if (!first) + sg = sg_next(sg); + + (*nents)++; + sg_set_page(sg, first_page, len << PAGE_SHIFT, 0); + first = false; + } + + return sg; +} + /** * ib_umem_get - Pin and DMA map userspace memory. * @@ -93,7 +153,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, int ret; int i; unsigned long dma_attrs = 0; - struct scatterlist *sg, *sg_list_start; + struct scatterlist *sg; unsigned int gup_flags = FOLL_WRITE; if (!udata) @@ -190,7 +250,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, if (!umem->writable) gup_flags |= FOLL_FORCE; - sg_list_start = umem->sg_head.sgl; + sg = umem->sg_head.sgl; while (npages) { down_read(&mm->mmap_sem); @@ -203,28 +263,29 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, goto umem_release; } - umem->npages += ret; cur_base += ret * PAGE_SIZE; npages -= ret; + sg = ib_umem_add_sg_table(sg, page_list, ret, + dma_get_max_seg_size(context->device->dma_device), + &umem->sg_nents); + /* Continue to hold the mmap_sem as vma_list access * needs to be protected. */ - for_each_sg(sg_list_start, sg, ret, i) { + for (i = 0; i < ret && umem->hugetlb; i++) { if (vma_list && !is_vm_hugetlb_page(vma_list[i])) umem->hugetlb = 0; - - sg_set_page(sg, page_list[i], PAGE_SIZE, 0); } - up_read(&mm->mmap_sem); - /* preparing for next loop */ - sg_list_start = sg; + up_read(&mm->mmap_sem); } + sg_mark_end(sg); + umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, - umem->npages, + umem->sg_nents, DMA_BIDIRECTIONAL, dma_attrs); @@ -320,8 +381,8 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, return -EINVAL; } - ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length, - offset + ib_umem_offset(umem)); + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, ib_umem_num_pages(umem), + dst, length, offset + ib_umem_offset(umem)); if (ret < 0) return ret; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6f8c36fcda78..97219143f16f 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -526,7 +526,7 @@ static int ib_umem_odp_map_dma_single_page( } umem_odp->dma_list[page_index] = dma_addr | access_mask; umem_odp->page_list[page_index] = page; - umem->npages++; + umem_odp->npages++; } else if (umem_odp->page_list[page_index] == page) { umem_odp->dma_list[page_index] |= access_mask; } else { @@ -752,7 +752,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, } umem_odp->page_list[idx] = NULL; umem_odp->dma_list[idx] = 0; - umem->npages--; + umem_odp->npages--; } } mutex_unlock(&umem_odp->umem_mutex); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index cdb0d63fa4b1..91507a2e9290 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -288,7 +288,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, ib_umem_odp_unmap_dma_pages(umem_odp, start, end); - if (unlikely(!umem->npages && mr->parent && + if (unlikely(!umem_odp->npages && mr->parent && !umem_odp->dying)) { WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&mr->parent->num_leaf_free); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index 9e6c44ebaf54..65dc47ffb8f3 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -119,7 +119,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, union pvrdma_cmd_resp rsp; struct pvrdma_cmd_create_mr *cmd = &req.create_mr; struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp; - int ret; + int ret, npages; if (length == 0 || length > dev->dsr->caps.max_mr_size) { dev_warn(&dev->pdev->dev, "invalid mem region length\n"); @@ -133,9 +133,10 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_CAST(umem); } - if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) { + npages = ib_umem_num_pages(umem); + if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) { dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n", - umem->npages); + npages); ret = -EINVAL; goto err_umem; } @@ -150,7 +151,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->mmr.size = length; mr->umem = umem; - ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false); + ret = pvrdma_page_dir_init(dev, &mr->pdir, npages, false); if (ret) { dev_warn(&dev->pdev->dev, "could not allocate page directory\n"); @@ -167,7 +168,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, cmd->length = length; cmd->pd_handle = to_vpd(pd)->pd_handle; cmd->access_flags = access_flags; - cmd->nchunks = umem->npages; + cmd->nchunks = npages; cmd->pdir_dma = mr->pdir.dir_dma; ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 73af05db04c7..b13a2e9a50d4 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -53,7 +53,7 @@ struct ib_umem { struct work_struct work; struct sg_table sg_head; int nmap; - int npages; + unsigned int sg_nents; }; /* Returns the offset of the umem start relative to the first page. */ diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index dadc96dea39c..eeec4e53c448 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -69,6 +69,7 @@ struct ib_umem_odp { int notifiers_seq; int notifiers_count; + int npages; /* Tree tracking */ struct umem_odp_node interval_tree; -- 2.30.2