From e2e74d617eadc15f601983270c4f4a6935c5a943 Mon Sep 17 00:00:00 2001
From: Joe Thornber
Date: Wed, 20 Mar 2013 17:21:27 +0000
Subject: [PATCH] dm cache: fix race in writethrough implementation

We have found a race in the optimisation used in the dm cache
writethrough implementation.  Currently, dm core sends the cache target
two bios, one for the origin device and one for the cache device, and
these are processed in parallel.  This patch avoids the race by
changing the code back to a simpler (slower) implementation which
processes the two writes in series, one after the other, until we can
develop a complete fix for the problem.

When the cache is in writethrough mode it needs to send WRITE bios to
both the origin and cache devices.

Previously we've been implementing this by having dm core query the
cache target on every write to find out how many copies of the bio it
wants.  The cache will ask for two bios if the block is in the cache,
and one otherwise.

The main problem with this is that it's racy.  At the time this check
is made the bio hasn't yet been submitted, and so isn't being taken
into account when quiescing a block for migration (promotion or
demotion).  This means a single bio may be submitted when two were
needed because the block has since been promoted to the cache
(catastrophic), or two bios when only one is needed (harmless).

I really don't want to start entering bios into the quiescing system
(deferred_set) in the get_num_write_bios callback.  Instead this patch
simplifies things: only one bio is submitted by the core, and it is
written first to the origin and then to the cache device, in series.
Obviously this will have a latency impact.

deferred_writethrough_bios is introduced to record bios that must later
be issued to the cache device from the worker thread.  This deferred
submission, after the origin bio completes, is required because
writethrough_endio runs in interrupt context.

Signed-off-by: Joe Thornber
Signed-off-by: Mike Snitzer
Signed-off-by: Alasdair G Kergon
---
 drivers/md/dm-cache-target.c | 138 ++++++++++++++++++++++-------------
 1 file changed, 88 insertions(+), 50 deletions(-)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 79ac8603644d..ff267db60025 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -142,6 +142,7 @@ struct cache {
 	spinlock_t lock;
 	struct bio_list deferred_bios;
 	struct bio_list deferred_flush_bios;
+	struct bio_list deferred_writethrough_bios;
 	struct list_head quiesced_migrations;
 	struct list_head completed_migrations;
 	struct list_head need_commit_migrations;
@@ -199,6 +200,11 @@ struct per_bio_data {
 	bool tick:1;
 	unsigned req_nr:2;
 	struct dm_deferred_entry *all_io_entry;
+
+	/* writethrough fields */
+	struct cache *cache;
+	dm_cblock_t cblock;
+	bio_end_io_t *saved_bi_end_io;
 };
 
 struct dm_cache_migration {
@@ -616,6 +622,56 @@ static void issue(struct cache *cache, struct bio *bio)
 	spin_unlock_irqrestore(&cache->lock, flags);
 }
 
+static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_add(&cache->deferred_writethrough_bios, bio);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void writethrough_endio(struct bio *bio, int err)
+{
+	struct per_bio_data *pb = get_per_bio_data(bio);
+	bio->bi_end_io = pb->saved_bi_end_io;
+
+	if (err) {
+		bio_endio(bio, err);
+		return;
+	}
+
+	remap_to_cache(pb->cache, bio, pb->cblock);
+
+	/*
+	 * We can't issue this bio directly, since we're in interrupt
+	 * context.  So it get's put on a bio list for processing by the
+	 * worker thread.
+	 */
+	defer_writethrough_bio(pb->cache, bio);
+}
+
+/*
+ * When running in writethrough mode we need to send writes to clean blocks
+ * to both the cache and origin devices.  In future we'd like to clone the
+ * bio and send them in parallel, but for now we're doing them in
+ * series as this is easier.
+ */
+static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
+				       dm_oblock_t oblock, dm_cblock_t cblock)
+{
+	struct per_bio_data *pb = get_per_bio_data(bio);
+
+	pb->cache = cache;
+	pb->cblock = cblock;
+	pb->saved_bi_end_io = bio->bi_end_io;
+	bio->bi_end_io = writethrough_endio;
+
+	remap_to_origin_clear_discard(pb->cache, bio, oblock);
+}
+
 /*----------------------------------------------------------------
  * Migration processing
  *
@@ -1077,14 +1133,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 		inc_hit_counter(cache, bio);
 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 
-		if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
-			/*
-			 * No need to mark anything dirty in write through mode.
-			 */
-			pb->req_nr == 0 ?
-				remap_to_cache(cache, bio, lookup_result.cblock) :
-				remap_to_origin_clear_discard(cache, bio, block);
-		} else
+		if (is_writethrough_io(cache, bio, lookup_result.cblock))
+			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+		else
 			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
 
 		issue(cache, bio);
@@ -1093,17 +1144,8 @@
 	case POLICY_MISS:
 		inc_miss_counter(cache, bio);
 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
-		if (pb->req_nr != 0) {
-			/*
-			 * This is a duplicate writethrough io that is no
-			 * longer needed because the block has been demoted.
-			 */
-			bio_endio(bio, 0);
-		} else {
-			remap_to_origin_clear_discard(cache, bio, block);
-			issue(cache, bio);
-		}
+		remap_to_origin_clear_discard(cache, bio, block);
+		issue(cache, bio);
 		break;
 
 	case POLICY_NEW:
@@ -1224,6 +1266,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
 		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
 }
 
+static void process_deferred_writethrough_bios(struct cache *cache)
+{
+	unsigned long flags;
+	struct bio_list bios;
+	struct bio *bio;
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
+	bio_list_init(&cache->deferred_writethrough_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		generic_make_request(bio);
+}
+
 static void writeback_some_dirty_blocks(struct cache *cache)
 {
 	int r = 0;
@@ -1320,6 +1379,7 @@ static int more_work(struct cache *cache)
 	else
 		return !bio_list_empty(&cache->deferred_bios) ||
 			!bio_list_empty(&cache->deferred_flush_bios) ||
+			!bio_list_empty(&cache->deferred_writethrough_bios) ||
 			!list_empty(&cache->quiesced_migrations) ||
 			!list_empty(&cache->completed_migrations) ||
 			!list_empty(&cache->need_commit_migrations);
@@ -1338,6 +1398,8 @@ static void do_worker(struct work_struct *ws)
 
 		writeback_some_dirty_blocks(cache);
 
+		process_deferred_writethrough_bios(cache);
+
 		if (commit_if_needed(cache)) {
 			process_deferred_flush_bios(cache, false);
 
@@ -1803,8 +1865,6 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size,
 
 #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
 
-static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
-
 static int cache_create(struct cache_args *ca, struct cache **result)
 {
 	int r = 0;
@@ -1831,9 +1891,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
 	memcpy(&cache->features, &ca->features, sizeof(cache->features));
 
-	if (cache->features.write_through)
-		ti->num_write_bios = cache_num_write_bios;
-
 	cache->callbacks.congested_fn = cache_is_congested;
 	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
 
@@ -1883,6 +1940,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	spin_lock_init(&cache->lock);
 	bio_list_init(&cache->deferred_bios);
 	bio_list_init(&cache->deferred_flush_bios);
+	bio_list_init(&cache->deferred_writethrough_bios);
 	INIT_LIST_HEAD(&cache->quiesced_migrations);
 	INIT_LIST_HEAD(&cache->completed_migrations);
 	INIT_LIST_HEAD(&cache->need_commit_migrations);
@@ -2028,20 +2086,6 @@ out:
 	return r;
 }
 
-static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
-{
-	int r;
-	struct cache *cache = ti->private;
-	dm_oblock_t block = get_bio_block(cache, bio);
-	dm_cblock_t cblock;
-
-	r = policy_lookup(cache->policy, block, &cblock);
-	if (r < 0)
-		return 2; /* assume the worst */
-
-	return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
-}
-
 static int cache_map(struct dm_target *ti, struct bio *bio)
 {
 	struct cache *cache = ti->private;
@@ -2109,18 +2153,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 		inc_hit_counter(cache, bio);
 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 
-		if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
-			/*
-			 * No need to mark anything dirty in write through mode.
-			 */
-			pb->req_nr == 0 ?
-				remap_to_cache(cache, bio, lookup_result.cblock) :
-				remap_to_origin_clear_discard(cache, bio, block);
-			cell_defer(cache, cell, false);
-		} else {
+		if (is_writethrough_io(cache, bio, lookup_result.cblock))
+			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+		else
 			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-			cell_defer(cache, cell, false);
-		}
+
+		cell_defer(cache, cell, false);
 		break;
 
 	case POLICY_MISS:
@@ -2547,7 +2585,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
 	.name = "cache",
-	.version = {1, 0, 0},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr = cache_ctr,
 	.dtr = cache_dtr,
-- 
2.30.2