dm thin: support discards
authorJoe Thornber <ejt@redhat.com>
Wed, 28 Mar 2012 17:41:28 +0000 (18:41 +0100)
committerAlasdair G Kergon <agk@redhat.com>
Wed, 28 Mar 2012 17:41:28 +0000 (18:41 +0100)
Support discards in the thin target.

On discard the corresponding mapping(s) are removed from the thin
device.  If the associated block(s) are no longer shared the discard
is passed to the underlying device.

All bios other than discards now have an associated deferred_entry
that is saved to the 'all_io_entry' in endio_hook.  When non-discard
IO completes and associated mappings are quiesced any discards that
were deferred, via ds_add_work() in process_discard(), will be queued
for processing by the worker thread.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
drivers/md/dm-thin.c |  173 ++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/dm-thin.c |  172 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 158 insertions(+), 14 deletions(-)

drivers/md/dm-thin.c

index 188121ca00aab7a4bcd65343ab116d14b362a579..703bbbc4f16f52923932cb7e90451df882402cd3 100644 (file)
@@ -520,10 +520,12 @@ struct pool {
        struct bio_list deferred_bios;
        struct bio_list deferred_flush_bios;
        struct list_head prepared_mappings;
+       struct list_head prepared_discards;
 
        struct bio_list retry_on_resume_list;
 
        struct deferred_set shared_read_ds;
+       struct deferred_set all_io_ds;
 
        struct new_mapping *next_mapping;
        mempool_t *mapping_pool;
@@ -621,6 +623,7 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
 struct endio_hook {
        struct thin_c *tc;
        struct deferred_entry *shared_read_entry;
+       struct deferred_entry *all_io_entry;
        struct new_mapping *overwrite_mapping;
 };
 
@@ -728,11 +731,12 @@ struct new_mapping {
 
        unsigned quiesced:1;
        unsigned prepared:1;
+       unsigned pass_discard:1;
 
        struct thin_c *tc;
        dm_block_t virt_block;
        dm_block_t data_block;
-       struct cell *cell;
+       struct cell *cell, *cell2;
        int err;
 
        /*
@@ -872,7 +876,30 @@ static void process_prepared_mapping(struct new_mapping *m)
        mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_mappings(struct pool *pool)
+static void process_prepared_discard(struct new_mapping *m)
+{
+       int r;
+       struct thin_c *tc = m->tc;
+
+       r = dm_thin_remove_block(tc->td, m->virt_block);
+       if (r)
+               DMERR("dm_thin_remove_block() failed");
+
+       /*
+        * Pass the discard down to the underlying device?
+        */
+       if (m->pass_discard)
+               remap_and_issue(tc, m->bio, m->data_block);
+       else
+               bio_endio(m->bio, 0);
+
+       cell_defer_except(tc, m->cell);
+       cell_defer_except(tc, m->cell2);
+       mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared(struct pool *pool, struct list_head *head,
+                            void (*fn)(struct new_mapping *))
 {
        unsigned long flags;
        struct list_head maps;
@@ -880,21 +907,27 @@ static void process_prepared_mappings(struct pool *pool)
 
        INIT_LIST_HEAD(&maps);
        spin_lock_irqsave(&pool->lock, flags);
-       list_splice_init(&pool->prepared_mappings, &maps);
+       list_splice_init(head, &maps);
        spin_unlock_irqrestore(&pool->lock, flags);
 
        list_for_each_entry_safe(m, tmp, &maps, list)
-               process_prepared_mapping(m);
+               fn(m);
 }
 
 /*
  * Deferred bio jobs.
  */
-static int io_overwrites_block(struct pool *pool, struct bio *bio)
+static int io_overlaps_block(struct pool *pool, struct bio *bio)
 {
-       return ((bio_data_dir(bio) == WRITE) &&
-               !(bio->bi_sector & pool->offset_mask)) &&
+       return !(bio->bi_sector & pool->offset_mask) &&
                (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
+
+}
+
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+       return (bio_data_dir(bio) == WRITE) &&
+               io_overlaps_block(pool, bio);
 }
 
 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -1134,6 +1167,86 @@ static void no_space(struct cell *cell)
                retry_on_resume(bio);
 }
 
+static void process_discard(struct thin_c *tc, struct bio *bio)
+{
+       int r;
+       struct pool *pool = tc->pool;
+       struct cell *cell, *cell2;
+       struct cell_key key, key2;
+       dm_block_t block = get_bio_block(tc, bio);
+       struct dm_thin_lookup_result lookup_result;
+       struct new_mapping *m;
+
+       build_virtual_key(tc->td, block, &key);
+       if (bio_detain(tc->pool->prison, &key, bio, &cell))
+               return;
+
+       r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+       switch (r) {
+       case 0:
+               /*
+                * Check nobody is fiddling with this pool block.  This can
+                * happen if someone's in the process of breaking sharing
+                * on this block.
+                */
+               build_data_key(tc->td, lookup_result.block, &key2);
+               if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+                       cell_release_singleton(cell, bio);
+                       break;
+               }
+
+               if (io_overlaps_block(pool, bio)) {
+                       /*
+                        * IO may still be going to the destination block.  We must
+                        * quiesce before we can do the removal.
+                        */
+                       m = get_next_mapping(pool);
+                       m->tc = tc;
+                       m->pass_discard = !lookup_result.shared;
+                       m->virt_block = block;
+                       m->data_block = lookup_result.block;
+                       m->cell = cell;
+                       m->cell2 = cell2;
+                       m->err = 0;
+                       m->bio = bio;
+
+                       if (!ds_add_work(&pool->all_io_ds, &m->list)) {
+                               list_add(&m->list, &pool->prepared_discards);
+                               wake_worker(pool);
+                       }
+               } else {
+                       /*
+                        * This path is hit if people are ignoring
+                        * limits->discard_granularity.  It ignores any
+                        * part of the discard that is in a subsequent
+                        * block.
+                        */
+                       sector_t offset = bio->bi_sector - (block << pool->block_shift);
+                       unsigned remaining = (pool->sectors_per_block - offset) << 9;
+                       bio->bi_size = min(bio->bi_size, remaining);
+
+                       cell_release_singleton(cell, bio);
+                       cell_release_singleton(cell2, bio);
+                       remap_and_issue(tc, bio, lookup_result.block);
+               }
+               break;
+
+       case -ENODATA:
+               /*
+                * It isn't provisioned, just forget it.
+                */
+               cell_release_singleton(cell, bio);
+               bio_endio(bio, 0);
+               break;
+
+       default:
+               DMERR("discard: find block unexpectedly returned %d", r);
+               cell_release_singleton(cell, bio);
+               bio_io_error(bio);
+               break;
+       }
+}
+
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
                          struct cell_key *key,
                          struct dm_thin_lookup_result *lookup_result,
@@ -1279,6 +1392,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 
        default:
                DMERR("dm_thin_find_block() failed, error = %d", r);
+               cell_release_singleton(cell, bio);
                bio_io_error(bio);
                break;
        }
@@ -1320,7 +1434,11 @@ static void process_deferred_bios(struct pool *pool)
 
                        break;
                }
-               process_bio(tc, bio);
+
+               if (bio->bi_rw & REQ_DISCARD)
+                       process_discard(tc, bio);
+               else
+                       process_bio(tc, bio);
        }
 
        /*
@@ -1354,7 +1472,8 @@ static void do_worker(struct work_struct *ws)
 {
        struct pool *pool = container_of(ws, struct pool, worker);
 
-       process_prepared_mappings(pool);
+       process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
+       process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
        process_deferred_bios(pool);
 }
 
@@ -1397,6 +1516,7 @@ static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
 
        h->tc = tc;
        h->shared_read_entry = NULL;
+       h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
        h->overwrite_mapping = NULL;
 
        return h;
@@ -1415,7 +1535,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
        struct dm_thin_lookup_result result;
 
        map_context->ptr = thin_hook_bio(tc, bio);
-       if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+       if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
                thin_defer_bio(tc, bio);
                return DM_MAPIO_SUBMITTED;
        }
@@ -1591,10 +1711,12 @@ static struct pool *pool_create(struct mapped_device *pool_md,
        bio_list_init(&pool->deferred_bios);
        bio_list_init(&pool->deferred_flush_bios);
        INIT_LIST_HEAD(&pool->prepared_mappings);
+       INIT_LIST_HEAD(&pool->prepared_discards);
        pool->low_water_triggered = 0;
        pool->no_free_space = 0;
        bio_list_init(&pool->retry_on_resume_list);
        ds_init(&pool->shared_read_ds);
+       ds_init(&pool->all_io_ds);
 
        pool->next_mapping = NULL;
        pool->mapping_pool =
@@ -1834,7 +1956,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
        pt->low_water_blocks = low_water_blocks;
        pt->zero_new_blocks = pf.zero_new_blocks;
        ti->num_flush_requests = 1;
-       ti->num_discard_requests = 0;
+       ti->num_discard_requests = 1;
+       ti->discards_supported = 1;
        ti->private = pt;
 
        pt->callbacks.congested_fn = pool_is_congested;
@@ -2227,6 +2350,17 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
+static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+{
+       limits->max_discard_sectors = pool->sectors_per_block;
+
+       /*
+        * This is just a hint, and not enforced.  We have to cope with
+        * bios that overlap 2 blocks.
+        */
+       limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+}
+
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
        struct pool_c *pt = ti->private;
@@ -2234,6 +2368,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
        blk_limits_io_min(limits, 0);
        blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+       set_discard_limits(pool, limits);
 }
 
 static struct target_type pool_target = {
@@ -2350,8 +2485,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
        ti->split_io = tc->pool->sectors_per_block;
        ti->num_flush_requests = 1;
-       ti->num_discard_requests = 0;
-       ti->discards_supported = 0;
+       ti->num_discard_requests = 1;
+       ti->discards_supported = 1;
 
        dm_put(pool_md);
 
@@ -2407,6 +2542,13 @@ static int thin_endio(struct dm_target *ti,
                spin_unlock_irqrestore(&pool->lock, flags);
        }
 
+       if (h->all_io_entry) {
+               INIT_LIST_HEAD(&work);
+               ds_dec(h->all_io_entry, &work);
+               list_for_each_entry_safe(m, tmp, &work, list)
+                       list_add(&m->list, &pool->prepared_discards);
+       }
+
        mempool_free(h, pool->endio_hook_pool);
 
        return 0;
@@ -2487,9 +2629,11 @@ static int thin_iterate_devices(struct dm_target *ti,
 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
        struct thin_c *tc = ti->private;
+       struct pool *pool = tc->pool;
 
        blk_limits_io_min(limits, 0);
-       blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
+       blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+       set_discard_limits(pool, limits);
 }
 
 static struct target_type thin_target = {