rbd: introduce OWN_BVECS data type
authorIlya Dryomov <idryomov@gmail.com>
Tue, 6 Feb 2018 18:26:35 +0000 (19:26 +0100)
committerIlya Dryomov <idryomov@gmail.com>
Mon, 2 Apr 2018 08:12:44 +0000 (10:12 +0200)
If the layout is "fancy", we need to be able to rearrange the provided
bio_vecs in stripe unit chunks to make it possible for the messenger to
read/write directly from/to the provided data buffer, without employing
a temporary data buffer for assembling the result.

Higher level bio_vec arrays are generally immutable, so this requires
copying into a private array.  Only the bio_vecs themselves are shuffled
around, not the actual data.  OWN_BVECS doesn't own any pages.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
drivers/block/rbd.c

index 5fa4e1aced042c584e460362f53db10006e248af..056865cfc596539746fdeec8839fb63813beb262 100644 (file)
@@ -215,6 +215,7 @@ enum obj_request_type {
        OBJ_REQUEST_NODATA = 1,
        OBJ_REQUEST_BIO,        /* pointer into provided bio (list) */
        OBJ_REQUEST_BVECS,      /* pointer into provided bio_vec array */
+       OBJ_REQUEST_OWN_BVECS,  /* private bio_vec array, doesn't own pages */
 };
 
 enum obj_operation_type {
@@ -261,6 +262,7 @@ struct rbd_obj_request {
                struct {
                        struct ceph_bvec_iter   bvec_pos;
                        u32                     bvec_count;
+                       u32                     bvec_idx;
                };
        };
        struct bio_vec          *copyup_bvecs;
@@ -1238,7 +1240,7 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
 
 /*
  * Zero a range in @obj_req data buffer defined by a bio (list) or
- * bio_vec array.
+ * (private) bio_vec array.
  *
  * @off is relative to the start of the data buffer.
  */
@@ -1250,6 +1252,7 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
                zero_bios(&obj_req->bio_pos, off, bytes);
                break;
        case OBJ_REQUEST_BVECS:
+       case OBJ_REQUEST_OWN_BVECS:
                zero_bvecs(&obj_req->bvec_pos, off, bytes);
                break;
        default:
@@ -1485,6 +1488,9 @@ static void rbd_obj_request_destroy(struct kref *kref)
        case OBJ_REQUEST_BIO:
        case OBJ_REQUEST_BVECS:
                break;          /* Nothing to do */
+       case OBJ_REQUEST_OWN_BVECS:
+               kfree(obj_request->bvec_pos.bvecs);
+               break;
        default:
                rbd_assert(0);
        }
@@ -1679,8 +1685,10 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
                                               obj_req->ex.oe_len);
                break;
        case OBJ_REQUEST_BVECS:
+       case OBJ_REQUEST_OWN_BVECS:
                rbd_assert(obj_req->bvec_pos.iter.bi_size ==
                                                        obj_req->ex.oe_len);
+               rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
                osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
                                                    &obj_req->bvec_pos);
                break;
@@ -1893,6 +1901,8 @@ struct rbd_img_fill_ctx {
        union rbd_img_fill_iter *pos;
        union rbd_img_fill_iter iter;
        ceph_object_extent_fn_t set_pos_fn;
+       ceph_object_extent_fn_t count_fn;
+       ceph_object_extent_fn_t copy_fn;
 };
 
 static struct ceph_object_extent *alloc_object_extent(void *arg)
@@ -1908,13 +1918,58 @@ static struct ceph_object_extent *alloc_object_extent(void *arg)
        return &obj_req->ex;
 }
 
+/*
+ * While su != os && sc == 1 is technically not fancy (it's the same
+ * layout as su == os && sc == 1), we can't use the nocopy path for it
+ * because ->set_pos_fn() should be called only once per object.
+ * ceph_file_to_extents() invokes action_fn once per stripe unit, so
+ * treat su != os && sc == 1 as fancy.
+ */
+static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
+{
+       return l->stripe_unit != l->object_size;
+}
+
+static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
+                                      struct ceph_file_extent *img_extents,
+                                      u32 num_img_extents,
+                                      struct rbd_img_fill_ctx *fctx)
+{
+       u32 i;
+       int ret;
+
+       img_req->data_type = fctx->pos_type;
+
+       /*
+        * Create object requests and set each object request's starting
+        * position in the provided bio (list) or bio_vec array.
+        */
+       fctx->iter = *fctx->pos;
+       for (i = 0; i < num_img_extents; i++) {
+               ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
+                                          img_extents[i].fe_off,
+                                          img_extents[i].fe_len,
+                                          &img_req->object_extents,
+                                          alloc_object_extent, img_req,
+                                          fctx->set_pos_fn, &fctx->iter);
+               if (ret)
+                       return ret;
+       }
+
+       return __rbd_img_fill_request(img_req);
+}
+
 /*
  * Map a list of image extents to a list of object extents, create the
  * corresponding object requests (normally each to a different object,
  * but not always) and add them to @img_req.  For each object request,
- * set up its data descriptor to point to the corresponding chunk of
+ * set up its data descriptor to point to the corresponding chunk(s) of
  * @fctx->pos data buffer.
  *
+ * Because ceph_file_to_extents() will merge adjacent object extents
+ * together, each object request's data descriptor may point to multiple
+ * different chunks of @fctx->pos data buffer.
+ *
  * @fctx->pos data buffer is assumed to be large enough.
  */
 static int rbd_img_fill_request(struct rbd_img_request *img_req,
@@ -1922,23 +1977,56 @@ static int rbd_img_fill_request(struct rbd_img_request *img_req,
                                u32 num_img_extents,
                                struct rbd_img_fill_ctx *fctx)
 {
+       struct rbd_device *rbd_dev = img_req->rbd_dev;
+       struct rbd_obj_request *obj_req;
        u32 i;
        int ret;
 
-       img_req->data_type = fctx->pos_type;
+       if (fctx->pos_type == OBJ_REQUEST_NODATA ||
+           !rbd_layout_is_fancy(&rbd_dev->layout))
+               return rbd_img_fill_request_nocopy(img_req, img_extents,
+                                                  num_img_extents, fctx);
+
+       img_req->data_type = OBJ_REQUEST_OWN_BVECS;
 
        /*
-        * Create object requests and set each object request's starting
-        * position in the provided bio (list) or bio_vec array.
+        * Create object requests and determine ->bvec_count for each object
+        * request.  Note that ->bvec_count sum over all object requests may
+        * be greater than the number of bio_vecs in the provided bio (list)
+        * or bio_vec array because when mapped, those bio_vecs can straddle
+        * stripe unit boundaries.
         */
        fctx->iter = *fctx->pos;
        for (i = 0; i < num_img_extents; i++) {
-               ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
+               ret = ceph_file_to_extents(&rbd_dev->layout,
                                           img_extents[i].fe_off,
                                           img_extents[i].fe_len,
                                           &img_req->object_extents,
                                           alloc_object_extent, img_req,
-                                          fctx->set_pos_fn, &fctx->iter);
+                                          fctx->count_fn, &fctx->iter);
+               if (ret)
+                       return ret;
+       }
+
+       for_each_obj_request(img_req, obj_req) {
+               obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
+                                             sizeof(*obj_req->bvec_pos.bvecs),
+                                             GFP_NOIO);
+               if (!obj_req->bvec_pos.bvecs)
+                       return -ENOMEM;
+       }
+
+       /*
+        * Fill in each object request's private bio_vec array, splitting and
+        * rearranging the provided bio_vecs in stripe unit chunks as needed.
+        */
+       fctx->iter = *fctx->pos;
+       for (i = 0; i < num_img_extents; i++) {
+               ret = ceph_iterate_extents(&rbd_dev->layout,
+                                          img_extents[i].fe_off,
+                                          img_extents[i].fe_len,
+                                          &img_req->object_extents,
+                                          fctx->copy_fn, &fctx->iter);
                if (ret)
                        return ret;
        }
@@ -1970,6 +2058,32 @@ static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
        ceph_bio_iter_advance(it, bytes);
 }
 
+static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+       struct rbd_obj_request *obj_req =
+           container_of(ex, struct rbd_obj_request, ex);
+       struct ceph_bio_iter *it = arg;
+
+       dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
+       ceph_bio_iter_advance_step(it, bytes, ({
+               obj_req->bvec_count++;
+       }));
+
+}
+
+static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+       struct rbd_obj_request *obj_req =
+           container_of(ex, struct rbd_obj_request, ex);
+       struct ceph_bio_iter *it = arg;
+
+       dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
+       ceph_bio_iter_advance_step(it, bytes, ({
+               obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
+               obj_req->bvec_pos.iter.bi_size += bv.bv_len;
+       }));
+}
+
 static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
                                   struct ceph_file_extent *img_extents,
                                   u32 num_img_extents,
@@ -1979,6 +2093,8 @@ static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
                .pos_type = OBJ_REQUEST_BIO,
                .pos = (union rbd_img_fill_iter *)bio_pos,
                .set_pos_fn = set_bio_pos,
+               .count_fn = count_bio_bvecs,
+               .copy_fn = copy_bio_bvecs,
        };
 
        return rbd_img_fill_request(img_req, img_extents, num_img_extents,
@@ -2005,6 +2121,29 @@ static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
        ceph_bvec_iter_advance(it, bytes);
 }
 
+static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+       struct rbd_obj_request *obj_req =
+           container_of(ex, struct rbd_obj_request, ex);
+       struct ceph_bvec_iter *it = arg;
+
+       ceph_bvec_iter_advance_step(it, bytes, ({
+               obj_req->bvec_count++;
+       }));
+}
+
+static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+       struct rbd_obj_request *obj_req =
+           container_of(ex, struct rbd_obj_request, ex);
+       struct ceph_bvec_iter *it = arg;
+
+       ceph_bvec_iter_advance_step(it, bytes, ({
+               obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
+               obj_req->bvec_pos.iter.bi_size += bv.bv_len;
+       }));
+}
+
 static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
                                     struct ceph_file_extent *img_extents,
                                     u32 num_img_extents,
@@ -2014,6 +2153,8 @@ static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
                .pos_type = OBJ_REQUEST_BVECS,
                .pos = (union rbd_img_fill_iter *)bvec_pos,
                .set_pos_fn = set_bvec_pos,
+               .count_fn = count_bvecs,
+               .copy_fn = copy_bvecs,
        };
 
        return rbd_img_fill_request(img_req, img_extents, num_img_extents,
@@ -2071,6 +2212,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
                                                      &obj_req->bio_pos);
                        break;
                case OBJ_REQUEST_BVECS:
+               case OBJ_REQUEST_OWN_BVECS:
                        ret = __rbd_img_fill_from_bvecs(child_img_req,
                                                      obj_req->img_extents,
                                                      obj_req->num_img_extents,