* This file is released under the GPL.
*/
-#include "dm-cache-policy.h"
+#include "dm-cache-background-tracker.h"
#include "dm-cache-policy-internal.h"
+#include "dm-cache-policy.h"
#include "dm.h"
#include <linux/hash.h>
unsigned hash_next:28;
unsigned prev:28;
unsigned next:28;
- unsigned level:7;
+ unsigned level:6;
bool dirty:1;
bool allocated:1;
bool sentinel:1;
+ bool pending_work:1;
dm_oblock_t oblock;
};
*/
static void q_push(struct queue *q, struct entry *e)
{
+	/* An entry with background work pending must never be queued. */
+ BUG_ON(e->pending_work);
+
+	/* Sentinels are linked into the lists but not counted in nr_elts. */
if (!e->sentinel)
q->nr_elts++;
l_add_tail(q->es, q->qs + e->level, e);
}
+/*
+ * Links an entry at the head of the list for its level. Sentinels are
+ * linked in as well but do not contribute to the element count.
+ */
+static void q_push_front(struct queue *q, struct entry *e)
+{
+	/* An entry with background work pending must never be queued. */
+	BUG_ON(e->pending_work);
+
+	q->nr_elts += e->sentinel ? 0u : 1u;
+	l_add_head(q->es, q->qs + e->level, e);
+}
+
static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
{
+ BUG_ON(e->pending_work);
+
if (!e->sentinel)
q->nr_elts++;
return e;
}
-/*
- * Pops an entry from a level that is not past a sentinel.
- */
-static struct entry *q_pop_old(struct queue *q, unsigned max_level)
-{
- struct entry *e = q_peek(q, max_level, false);
-
- if (e)
- q_del(q, e);
-
- return e;
-}
-
/*
* This function assumes there is a non-sentinel entry to pop. It's only
* used by redistribute, so we know this is true. It also doesn't adjust
break;
e->level = level + 1u;
- l_add_head(q->es, l_above, e);
+ l_add_tail(q->es, l_above, e);
}
}
}
-static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
+/*
+ * Moves e up by extra_levels (clamped to the top level) and re-adds it at
+ * the tail of the resulting level.
+ *
+ * When e really moves up (extra_levels is non-zero and e is not already on
+ * the top level) the target level is scanned from its head, skipping and
+ * counting sentinels. The first non-sentinel entry found, if any, is
+ * swapped down to e's old level. With s1 non-NULL the swapped entry is
+ * placed before s1 when no sentinel preceded it, before s2 when exactly one
+ * did, and at the tail otherwise. With s1 NULL it always goes to the tail.
+ */
+static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
+ struct entry *s1, struct entry *s2)
{
struct entry *de;
- unsigned new_level;
-
- q_del(q, e);
+ unsigned sentinels_passed = 0;
+ unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
+ /* try and find an entry to swap with */
if (extra_levels && (e->level < q->nr_levels - 1u)) {
- new_level = min(q->nr_levels - 1u, e->level + extra_levels);
- for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
- if (de->sentinel)
- continue;
+ for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
+ sentinels_passed++;
+ if (de) {
q_del(q, de);
de->level = e->level;
+ if (s1) {
+ switch (sentinels_passed) {
+ case 0:
+ q_push_before(q, s1, de);
+ break;
+
+ case 1:
+ q_push_before(q, s2, de);
+ break;
- if (dest)
- q_push_before(q, dest, de);
- else
+ default:
+ q_push(q, de);
+ }
+ } else
q_push(q, de);
- break;
}
-
- e->level = new_level;
}
+	/* e itself is always unlinked and re-added, even if its level is unchanged. */
+ q_del(q, e);
+ e->level = new_level;
q_push(q, e);
}
-static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
-{
- q_requeue_before(q, NULL, e, extra_levels);
-}
-
/*----------------------------------------------------------------*/
#define FP_SHIFT 8
/*----------------------------------------------------------------*/
-struct hash_table {
+struct smq_hash_table {
struct entry_space *es;
unsigned long long hash_bits;
unsigned *buckets;
* All cache entries are stored in a chained hash table. To save space we
* use indexing again, and only store indexes to the next entry.
*/
-static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
+static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
{
unsigned i, nr_buckets;
return 0;
}
-static void h_exit(struct hash_table *ht)
+/* Releases the bucket array with vfree(); nothing else is touched here. */
+static void h_exit(struct smq_hash_table *ht)
{
vfree(ht->buckets);
}
-static struct entry *h_head(struct hash_table *ht, unsigned bucket)
+/* Converts the index stored in the given bucket into an entry via to_entry(). */
+static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
{
return to_entry(ht->es, ht->buckets[bucket]);
}
-static struct entry *h_next(struct hash_table *ht, struct entry *e)
+/* Follows e's hash_next index to the next entry in the same chain. */
+static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
{
return to_entry(ht->es, e->hash_next);
}
-static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
+/* Makes e the new head of the chain for the given bucket. */
+static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
{
e->hash_next = ht->buckets[bucket];
ht->buckets[bucket] = to_index(ht->es, e);
}
-static void h_insert(struct hash_table *ht, struct entry *e)
+/* Hashes e's origin block to pick a bucket and inserts e at that bucket's head. */
+static void h_insert(struct smq_hash_table *ht, struct entry *e)
{
unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
__h_insert(ht, h, e);
}
-static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
+static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
struct entry **prev)
{
struct entry *e;
return NULL;
}
-static void __h_unlink(struct hash_table *ht, unsigned h,
+static void __h_unlink(struct smq_hash_table *ht, unsigned h,
struct entry *e, struct entry *prev)
{
if (prev)
/*
* Also moves each entry to the front of the bucket.
*/
-static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
+static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
{
struct entry *e, *prev;
unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
return e;
}
-static void h_remove(struct hash_table *ht, struct entry *e)
+static void h_remove(struct smq_hash_table *ht, struct entry *e)
{
unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
struct entry *prev;
e->next = INDEXER_NULL;
e->prev = INDEXER_NULL;
e->level = 0u;
+ e->dirty = true; /* FIXME: audit */
e->allocated = true;
+ e->sentinel = false;
+ e->pending_work = false;
}
static struct entry *alloc_entry(struct entry_alloc *ea)
#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u
-#define WRITEBACK_PERIOD (10 * HZ)
-#define DEMOTE_PERIOD (60 * HZ)
+#define WRITEBACK_PERIOD (10ul * HZ)
+#define DEMOTE_PERIOD (60ul * HZ)
#define HOTSPOT_UPDATE_PERIOD (HZ)
-#define CACHE_UPDATE_PERIOD (10u * HZ)
+#define CACHE_UPDATE_PERIOD (60ul * HZ)
struct smq_policy {
struct dm_cache_policy policy;
* The hash tables allows us to quickly find an entry by origin
* block.
*/
- struct hash_table table;
- struct hash_table hotspot_table;
+ struct smq_hash_table table;
+ struct smq_hash_table hotspot_table;
bool current_writeback_sentinels;
unsigned long next_writeback_period;
unsigned long next_hotspot_period;
unsigned long next_cache_period;
+
+ struct background_tracker *bg_work;
+
+ bool migrations_allowed;
};
/*----------------------------------------------------------------*/
static void update_sentinels(struct smq_policy *mq)
{
+	/*
+	 * Once a period has elapsed: schedule the next one, flip which
+	 * sentinel set is current, and only then call the update helper,
+	 * so the helper runs against the newly selected set.
+	 */
if (time_after(jiffies, mq->next_writeback_period)) {
- __update_writeback_sentinels(mq);
mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+ __update_writeback_sentinels(mq);
}
+	/* Same sequence for the demote sentinels, on their own period. */
if (time_after(jiffies, mq->next_demote_period)) {
- __update_demote_sentinels(mq);
mq->next_demote_period = jiffies + DEMOTE_PERIOD;
mq->current_demote_sentinels = !mq->current_demote_sentinels;
+ __update_demote_sentinels(mq);
}
}
/*----------------------------------------------------------------*/
-/*
- * These methods tie together the dirty queue, clean queue and hash table.
- */
-static void push_new(struct smq_policy *mq, struct entry *e)
+/*
+ * Unlinks e from the dirty or the clean queue, selected by e->dirty.
+ * The hash table is not touched.
+ */
+static void del_queue(struct smq_policy *mq, struct entry *e)
{
- struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
- h_insert(&mq->table, e);
- q_push(q, e);
+ q_del(e->dirty ? &mq->dirty : &mq->clean, e);
}
-static void push(struct smq_policy *mq, struct entry *e)
+/*
+ * Appends e to the tail of its level in the dirty or the clean queue,
+ * selected by e->dirty. The hash table is not touched.
+ */
+static void push_queue(struct smq_policy *mq, struct entry *e)
{
- struct entry *sentinel;
-
- h_insert(&mq->table, e);
-
- /*
- * Punch this into the queue just in front of the sentinel, to
- * ensure it's cleaned straight away.
- */
- if (e->dirty) {
- sentinel = writeback_sentinel(mq, e->level);
- q_push_before(&mq->dirty, sentinel, e);
- } else {
- sentinel = demote_sentinel(mq, e->level);
- q_push_before(&mq->clean, sentinel, e);
- }
+ if (e->dirty)
+ q_push(&mq->dirty, e);
+ else
+ q_push(&mq->clean, e);
}
-/*
- * Removes an entry from cache. Removes from the hash table.
- */
-static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
+/*
+ * Inserts e into the hash table and, unless it has background work
+ * pending, onto the tail of the matching dirty/clean queue.
+ *
+ * State notation below, presumably: h = in hash table, q = on a queue,
+ * a = allocated; '!' negates.
+ */
+// !h, !q, a -> h, q, a
+static void push(struct smq_policy *mq, struct entry *e)
{
- q_del(q, e);
- h_remove(&mq->table, e);
+ h_insert(&mq->table, e);
+ if (!e->pending_work)
+ push_queue(mq, e);
}
-static void del(struct smq_policy *mq, struct entry *e)
+/*
+ * Like push_queue(), but links e at the head of its level instead of the
+ * tail. The queue is chosen by e->dirty.
+ */
+static void push_queue_front(struct smq_policy *mq, struct entry *e)
{
- __del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
+ if (e->dirty)
+ q_push_front(&mq->dirty, e);
+ else
+ q_push_front(&mq->clean, e);
}
-static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
+/*
+ * Inserts e into the hash table and, unless it has background work
+ * pending, at the head of its level in the matching dirty/clean queue.
+ */
+static void push_front(struct smq_policy *mq, struct entry *e)
{
- struct entry *e = q_pop_old(q, max_level);
- if (e)
- h_remove(&mq->table, e);
- return e;
+ h_insert(&mq->table, e);
+ if (!e->pending_work)
+ push_queue_front(mq, e);
}
static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
static void requeue(struct smq_policy *mq, struct entry *e)
{
- struct entry *sentinel;
+ /*
+ * Pending work has temporarily been taken out of the queues.
+ */
+ if (e->pending_work)
+ return;
+	/*
+	 * Only act when the hit bit for this cblock was previously clear;
+	 * the bit is set as a side effect, so later hits are ignored until
+	 * something clears it again.
+	 */
if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
- if (e->dirty) {
- sentinel = writeback_sentinel(mq, e->level);
- q_requeue_before(&mq->dirty, sentinel, e, 1u);
- } else {
- sentinel = demote_sentinel(mq, e->level);
- q_requeue_before(&mq->clean, sentinel, e, 1u);
+ if (!e->dirty) {
+ q_requeue(&mq->clean, e, 1u, NULL, NULL);
+ return;
}
+
+	/*
+	 * Dirty entries: pass the non-current writeback sentinel for this
+	 * level as s1 and the current one as s2.
+	 */
+ q_requeue(&mq->dirty, e, 1u,
+ get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
+ get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
}
}
unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
+ threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
+
/*
* If the hotspot queue is performing badly then we have little
* confidence that we know which blocks to promote. So we cut down
}
mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
- mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
+ mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
}
/*
}
}
-static int demote_cblock(struct smq_policy *mq,
- struct policy_locker *locker,
- dm_oblock_t *oblock)
+/*----------------------------------------------------------------*/
+
+/*
+ * Targets are given as a percentage.
+ */
+#define CLEAN_TARGET 25u
+#define FREE_TARGET 25u
+
+/*
+ * Converts a percentage p of the cache size into a number of cache blocks,
+ * rounding down.
+ *
+ * The quotient and remainder of size / 100 are scaled separately so the
+ * intermediate product cannot wrap in unsigned arithmetic for large caches
+ * (for p <= 100); the result equals floor(size * p / 100).
+ */
+static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
{
- struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
- if (!demoted)
- /*
- * We could get a block from mq->dirty, but that
- * would add extra latency to the triggering bio as it
- * waits for the writeback. Better to not promote this
- * time and hope there's a clean block next time this block
- * is hit.
- */
- return -ENOSPC;
+	unsigned nr_cblocks = from_cblock(mq->cache_size);
+
+	return (nr_cblocks / 100u) * p + ((nr_cblocks % 100u) * p) / 100u;
+}
+
+/*
+ * Reports whether enough of the cache is clean.
+ *
+ * idle:  the target is "no entries left on the dirty queue".
+ * !idle: the target is CLEAN_TARGET percent of the cache size, counting
+ *        writebacks already queued with the background tracker as clean.
+ */
+static bool clean_target_met(struct smq_policy *mq, bool idle)
+{
+ /*
+ * Cache entries may not be populated. So we cannot rely on the
+ * size of the clean queue.
+ */
+ unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
- if (locker->fn(locker, demoted->oblock))
+ if (idle)
/*
- * We couldn't lock this block.
+ * We'd like to clean everything.
*/
- return -EBUSY;
+ return q_size(&mq->dirty) == 0u;
+ else
+ return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >=
+ percent_to_target(mq, CLEAN_TARGET);
+}
- del(mq, demoted);
- *oblock = demoted->oblock;
- free_entry(&mq->cache_alloc, demoted);
+/*
+ * Reports whether enough cache blocks are free.
+ *
+ * idle:  free blocks plus demotions already queued must reach FREE_TARGET
+ *        percent of the cache size.
+ * !idle: always true.
+ *
+ * NOTE(review): because the non-idle case is unconditionally true, a
+ * caller passing idle == false can never observe "target not met".
+ */
+static bool free_target_met(struct smq_policy *mq, bool idle)
+{
+ unsigned nr_free = from_cblock(mq->cache_size) -
+ mq->cache_alloc.nr_allocated;
- return 0;
+ if (idle)
+ return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
+ percent_to_target(mq, FREE_TARGET);
+ else
+ return true;
}
+/*----------------------------------------------------------------*/
+
+/*
+ * Flags e as having background work pending. The entry must be a real
+ * (non-sentinel), allocated entry that is not already pending; each
+ * precondition has its own BUG_ON. mq is not used.
+ */
+static void mark_pending(struct smq_policy *mq, struct entry *e)
+{
+ BUG_ON(e->sentinel);
+ BUG_ON(!e->allocated);
+ BUG_ON(e->pending_work);
+ e->pending_work = true;
+}
+
+/*
+ * Clears the pending-work flag on e. Calling this on an entry that is
+ * not pending is a bug. mq is not used.
+ */
+static void clear_pending(struct smq_policy *mq, struct entry *e)
+{
+ BUG_ON(!e->pending_work);
+ e->pending_work = false;
+}
+
+/*
+ * Takes the entry q_peek() selects from the dirty queue (all levels),
+ * flags it pending, unlinks it from the queue and hands a POLICY_WRITEBACK
+ * job for it to the background tracker.
+ *
+ * If the tracker rejects the job the entry is restored: its pending flag
+ * is cleared and it is put back at the front of its dirty level.
+ * Without that it would stay outside the queues with no completion ever
+ * arriving to re-queue it. The one-shot warning is kept as a diagnostic.
+ */
+static void queue_writeback(struct smq_policy *mq)
+{
+	int r;
+	struct policy_work work;
+	struct entry *e;
+
+	e = q_peek(&mq->dirty, mq->dirty.nr_levels, false);
+	if (!e)
+		return;
+
+	mark_pending(mq, e);
+	q_del(&mq->dirty, e);
+
+	work.op = POLICY_WRITEBACK;
+	work.oblock = e->oblock;
+	work.cblock = infer_cblock(mq, e);
+
+	r = btracker_queue(mq->bg_work, &work, NULL);
+	if (unlikely(WARN_ON_ONCE(r))) {
+		/* Undo the dequeue so the entry is not stranded. */
+		clear_pending(mq, e);
+		q_push_front(&mq->dirty, e);
+	}
+}
+
+/*
+ * Queues a POLICY_DEMOTE job for the entry q_peek() selects from the clean
+ * queue. Must only be called while migrations are allowed (warns once and
+ * returns otherwise).
+ *
+ * If there is no clean candidate, a writeback is queued instead when the
+ * non-idle clean target is not met, so that clean entries become
+ * available later.
+ *
+ * If the background tracker rejects the job the entry is restored to the
+ * front of its clean level with the pending flag cleared; previously the
+ * return value was ignored and the entry was left out of the queues.
+ */
+static void queue_demotion(struct smq_policy *mq)
+{
+	int r;
+	struct policy_work work;
+	struct entry *e;
+
+	if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
+		return;
+
+	e = q_peek(&mq->clean, mq->clean.nr_levels, true);
+	if (!e) {
+		if (!clean_target_met(mq, false))
+			queue_writeback(mq);
+		return;
+	}
+
+	mark_pending(mq, e);
+	q_del(&mq->clean, e);
+
+	work.op = POLICY_DEMOTE;
+	work.oblock = e->oblock;
+	work.cblock = infer_cblock(mq, e);
+
+	r = btracker_queue(mq->bg_work, &work, NULL);
+	if (r) {
+		/* Undo the dequeue so the entry is not stranded. */
+		clear_pending(mq, e);
+		q_push_front(&mq->clean, e);
+	}
+}
+
+/*
+ * Queues a POLICY_PROMOTE job for oblock, reserving a cache block for it.
+ * Does nothing when migrations are disallowed or a promotion for this
+ * oblock is already tracked.
+ *
+ * When no cache block is free, no promotion is queued; instead a demotion
+ * is queued if the free target is not met. The target is evaluated with
+ * idle == true on purpose: free_target_met() returns true unconditionally
+ * for idle == false, which would make this demotion unreachable and leave
+ * a full cache unable to make room under continuous load.
+ *
+ * workp is passed through to btracker_queue(); it may be NULL. It is only
+ * meaningful to the caller if the job was actually queued.
+ */
+static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
+			    struct policy_work **workp)
+{
+	int r;
+	struct entry *e;
+	struct policy_work work;
+
+	if (!mq->migrations_allowed)
+		return;
+
+	if (allocator_empty(&mq->cache_alloc)) {
+		if (!free_target_met(mq, true))
+			queue_demotion(mq);
+		return;
+	}
+
+	if (btracker_promotion_already_present(mq->bg_work, oblock))
+		return;
+
+	/*
+	 * We allocate the entry now to reserve the cblock. If the
+	 * background work is aborted we must remember to free it.
+	 */
+	e = alloc_entry(&mq->cache_alloc);
+	BUG_ON(!e);
+	e->pending_work = true;
+	work.op = POLICY_PROMOTE;
+	work.oblock = oblock;
+	work.cblock = infer_cblock(mq, e);
+
+	r = btracker_queue(mq->bg_work, &work, workp);
+	if (r) {
+		/*
+		 * The job was never queued, so no completion will release
+		 * the reservation; give the cblock back now.
+		 */
+		clear_pending(mq, e);
+		free_entry(&mq->cache_alloc, e);
+	}
+}
+
+/*----------------------------------------------------------------*/
+
enum promote_result {
PROMOTE_NOT,
PROMOTE_TEMPORARY,
return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
}
-static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
- bool fast_promote)
+/*
+ * Decides whether a missed block should be promoted.
+ *
+ * Writes: PROMOTE_TEMPORARY when a cache block is free and fast_promote is
+ * set; otherwise the hotspot entry's level is compared against
+ * write_promote_level. Reads compare against read_promote_level.
+ */
+static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
+ int data_dir, bool fast_promote)
{
- if (bio_data_dir(bio) == WRITE) {
+ if (data_dir == WRITE) {
if (!allocator_empty(&mq->cache_alloc) && fast_promote)
return PROMOTE_TEMPORARY;
- else
- return maybe_promote(hs_e->level >= mq->write_promote_level);
+ return maybe_promote(hs_e->level >= mq->write_promote_level);
} else
return maybe_promote(hs_e->level >= mq->read_promote_level);
}
-static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
- struct policy_locker *locker,
- struct policy_result *result, enum promote_result pr)
-{
- int r;
- struct entry *e;
-
- if (allocator_empty(&mq->cache_alloc)) {
- result->op = POLICY_REPLACE;
- r = demote_cblock(mq, locker, &result->old_oblock);
- if (r) {
- result->op = POLICY_MISS;
- return;
- }
-
- } else
- result->op = POLICY_NEW;
-
- e = alloc_entry(&mq->cache_alloc);
- BUG_ON(!e);
- e->oblock = oblock;
-
- if (pr == PROMOTE_TEMPORARY)
- push(mq, e);
- else
- push_new(mq, e);
-
- result->cblock = infer_cblock(mq, e);
-}
-
static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
sector_t r = from_oblock(b);
return to_oblock(r);
}
-static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
+static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
{
unsigned hi;
dm_oblock_t hb = to_hblock(mq, b);
hi = get_index(&mq->hotspot_alloc, e);
q_requeue(&mq->hotspot, e,
test_and_set_bit(hi, mq->hotspot_hit_bits) ?
- 0u : mq->hotspot_level_jump);
+ 0u : mq->hotspot_level_jump,
+ NULL, NULL);
} else {
stats_miss(&mq->hotspot_stats);
return e;
}
-/*
- * Looks the oblock up in the hash table, then decides whether to put in
- * pre_cache, or cache etc.
- */
-static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
- bool can_migrate, bool fast_promote,
- struct policy_locker *locker, struct policy_result *result)
-{
- struct entry *e, *hs_e;
- enum promote_result pr;
-
- hs_e = update_hotspot_queue(mq, oblock, bio);
-
- e = h_lookup(&mq->table, oblock);
- if (e) {
- stats_level_accessed(&mq->cache_stats, e->level);
-
- requeue(mq, e);
- result->op = POLICY_HIT;
- result->cblock = infer_cblock(mq, e);
-
- } else {
- stats_miss(&mq->cache_stats);
-
- pr = should_promote(mq, hs_e, bio, fast_promote);
- if (pr == PROMOTE_NOT)
- result->op = POLICY_MISS;
-
- else {
- if (!can_migrate) {
- result->op = POLICY_MISS;
- return -EWOULDBLOCK;
- }
-
- insert_in_cache(mq, oblock, locker, result, pr);
- }
- }
-
- return 0;
-}
-
/*----------------------------------------------------------------*/
/*
{
struct smq_policy *mq = to_smq_policy(p);
+ btracker_destroy(mq->bg_work);
h_exit(&mq->hotspot_table);
h_exit(&mq->table);
free_bitset(mq->hotspot_hit_bits);
kfree(mq);
}
-static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
- bool can_block, bool can_migrate, bool fast_promote,
- struct bio *bio, struct policy_locker *locker,
- struct policy_result *result)
-{
- int r;
- unsigned long flags;
- struct smq_policy *mq = to_smq_policy(p);
-
- result->op = POLICY_MISS;
-
- spin_lock_irqsave(&mq->lock, flags);
- r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
- spin_unlock_irqrestore(&mq->lock, flags);
-
- return r;
-}
+/*----------------------------------------------------------------*/
-static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+/*
+ * Looks oblock up in the cache hash table. No locking is done here; the
+ * visible callers hold mq->lock around the call.
+ *
+ * Hit:  records the access, requeues the entry, stores its cblock in
+ *       *cblock and returns 0.
+ * Miss: records the miss, updates the hotspot queue and, if
+ *       should_promote() says so, calls queue_promotion(); returns -ENOENT.
+ *
+ * *background_work is set to true whenever queue_promotion() was called.
+ * NOTE(review): queue_promotion() can return without queuing anything, so
+ * true here means "promotion attempted", not necessarily "work queued".
+ */
+static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy,
+ struct policy_work **work, bool *background_work)
{
- int r;
- unsigned long flags;
- struct smq_policy *mq = to_smq_policy(p);
- struct entry *e;
+ struct entry *e, *hs_e;
+ enum promote_result pr;
+
+ *background_work = false;
- spin_lock_irqsave(&mq->lock, flags);
e = h_lookup(&mq->table, oblock);
if (e) {
+ stats_level_accessed(&mq->cache_stats, e->level);
+
+ requeue(mq, e);
*cblock = infer_cblock(mq, e);
- r = 0;
- } else
- r = -ENOENT;
- spin_unlock_irqrestore(&mq->lock, flags);
+ return 0;
- return r;
-}
+ } else {
+ stats_miss(&mq->cache_stats);
-static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
-{
- struct entry *e;
+ /*
+ * The hotspot queue only gets updated with misses.
+ */
+ hs_e = update_hotspot_queue(mq, oblock);
- e = h_lookup(&mq->table, oblock);
- BUG_ON(!e);
+ pr = should_promote(mq, hs_e, data_dir, fast_copy);
+ if (pr != PROMOTE_NOT) {
+ queue_promotion(mq, oblock, work);
+ *background_work = true;
+ }
- del(mq, e);
- e->dirty = set;
- push(mq, e);
+ return -ENOENT;
+ }
}
-static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+/*
+ * Policy 'lookup' method: runs __lookup() under mq->lock with interrupts
+ * saved/disabled. No policy_work pointer is requested (NULL is passed), so
+ * the caller only learns via *background_work that a promotion was
+ * attempted. Returns __lookup()'s result (0 on hit, -ENOENT on miss).
+ */
+static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy,
+ bool *background_work)
{
+ int r;
unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
spin_lock_irqsave(&mq->lock, flags);
- __smq_set_clear_dirty(mq, oblock, true);
+ r = __lookup(mq, oblock, cblock,
+ data_dir, fast_copy,
+ NULL, background_work);
spin_unlock_irqrestore(&mq->lock, flags);
+
+ return r;
}
-static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+/*
+ * Policy 'lookup_with_work' method: like smq_lookup(), but forwards the
+ * caller's work pointer to __lookup() so a queued promotion can be handed
+ * back. The background_queued flag filled in by __lookup() is not used
+ * here. Returns __lookup()'s result.
+ */
+static int smq_lookup_with_work(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy,
+ struct policy_work **work)
{
- struct smq_policy *mq = to_smq_policy(p);
+ int r;
+ bool background_queued;
unsigned long flags;
+ struct smq_policy *mq = to_smq_policy(p);
spin_lock_irqsave(&mq->lock, flags);
- __smq_set_clear_dirty(mq, oblock, false);
+ r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
spin_unlock_irqrestore(&mq->lock, flags);
-}
-static unsigned random_level(dm_cblock_t cblock)
-{
- return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
+ return r;
}
-static int smq_load_mapping(struct dm_cache_policy *p,
- dm_oblock_t oblock, dm_cblock_t cblock,
- uint32_t hint, bool hint_valid)
+/*
+ * Policy 'get_background_work' method. Under mq->lock, asks the background
+ * tracker for a job. If it reports -ENODATA, tries to generate one (a
+ * demotion when migrations are allowed and the free target is not met,
+ * otherwise a writeback when the clean target is not met) and asks
+ * again. Returns the result of the last btracker_issue() call.
+ */
+static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
+ struct policy_work **result)
{
+ int r;
+ unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
- struct entry *e;
- e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
- e->oblock = oblock;
- e->dirty = false; /* this gets corrected in a minute */
- e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
- push(mq, e);
-
- return 0;
-}
+ spin_lock_irqsave(&mq->lock, flags);
+ r = btracker_issue(mq->bg_work, result);
+ if (r == -ENODATA) {
+ /* nothing queued yet: try to generate demotion or writeback work */
+ if (mq->migrations_allowed && !free_target_met(mq, idle))
+ queue_demotion(mq);
-static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
-{
- struct smq_policy *mq = to_smq_policy(p);
- struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
+ else if (!clean_target_met(mq, idle))
+ queue_writeback(mq);
- if (!e->allocated)
- return 0;
+ r = btracker_issue(mq->bg_work, result);
+ }
+ spin_unlock_irqrestore(&mq->lock, flags);
- return e->level;
+ return r;
}
-static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
-{
- struct entry *e;
+/*
+ * We need to clear any pending work flags that have been set, and in the
+ * case of promotion free the entry for the destination cblock.
+ */
+static void __complete_background_work(struct smq_policy *mq,
+ struct policy_work *work,
+ bool success)
+{
+	/* The entry is located purely from the job's cblock. */
+ struct entry *e = get_entry(&mq->cache_alloc,
+ from_cblock(work->cblock));
+
+	/*
+	 * State annotations below, presumably: h = in hash table,
+	 * q = on a queue, a = allocated; '!' negates.
+	 */
+ switch (work->op) {
+ case POLICY_PROMOTE:
+ // !h, !q, a
+ clear_pending(mq, e);
+ if (success) {
+ e->oblock = work->oblock;
+ push(mq, e);
+ // h, q, a
+ } else {
+ free_entry(&mq->cache_alloc, e);
+ // !h, !q, !a
+ }
+ break;
- e = h_lookup(&mq->table, oblock);
- BUG_ON(!e);
+ case POLICY_DEMOTE:
+ // h, !q, a
+ if (success) {
+	/*
+	 * The entry is freed with pending_work still set; the init code
+	 * visible in this file resets that flag for a fresh entry.
+	 */
+ h_remove(&mq->table, e);
+ free_entry(&mq->cache_alloc, e);
+ // !h, !q, !a
+ } else {
+ clear_pending(mq, e);
+ push_queue(mq, e);
+ // h, q, a
+ }
+ break;
- del(mq, e);
- free_entry(&mq->cache_alloc, e);
+ case POLICY_WRITEBACK:
+ // h, !q, a
+	/* Re-queued regardless of success; e->dirty selects the queue. */
+ clear_pending(mq, e);
+ push_queue(mq, e);
+ // h, q, a
+ break;
+ }
+
+	/* Finally tell the tracker the job is finished, whatever the op. */
+ btracker_complete(mq->bg_work, work);
}
-static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+/*
+ * Policy 'complete_background_work' method: runs
+ * __complete_background_work() under mq->lock.
+ */
+static void smq_complete_background_work(struct dm_cache_policy *p,
+ struct policy_work *work,
+ bool success)
{
- struct smq_policy *mq = to_smq_policy(p);
unsigned long flags;
+ struct smq_policy *mq = to_smq_policy(p);
spin_lock_irqsave(&mq->lock, flags);
- __remove_mapping(mq, oblock);
+ __complete_background_work(mq, work, success);
spin_unlock_irqrestore(&mq->lock, flags);
}
-static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
+/*
+ * Sets or clears the dirty flag of the entry for cblock. An entry that is
+ * on a queue is moved between the clean and dirty queues to match; an
+ * entry with pending work is on no queue, so only the flag changes.
+ * The hash table is not modified.
+ */
+// in_hash(oblock) -> in_hash(oblock)
+static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
{
struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
- if (!e || !e->allocated)
- return -ENODATA;
-
- del(mq, e);
- free_entry(&mq->cache_alloc, e);
-
- return 0;
+ if (e->pending_work)
+ e->dirty = set;
+ else {
+	/* Unlink using the old flag, then relink using the new one. */
+ del_queue(mq, e);
+ e->dirty = set;
+ push_queue(mq, e);
+ }
}
-static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+/* Policy 'set_dirty' method: marks the entry for cblock dirty, under mq->lock. */
+static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- int r;
unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
spin_lock_irqsave(&mq->lock, flags);
- r = __remove_cblock(mq, cblock);
+ __smq_set_clear_dirty(mq, cblock, true);
spin_unlock_irqrestore(&mq->lock, flags);
-
- return r;
}
-
-#define CLEAN_TARGET_CRITICAL 5u /* percent */
-
-static bool clean_target_met(struct smq_policy *mq, bool critical)
+/* Policy 'clear_dirty' method: marks the entry for cblock clean, under mq->lock. */
+static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- if (critical) {
- /*
- * Cache entries may not be populated. So we're cannot rely on the
- * size of the clean queue.
- */
- unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
- unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
+ struct smq_policy *mq = to_smq_policy(p);
+ unsigned long flags;
- return nr_clean >= target;
- } else
- return !q_size(&mq->dirty);
+ spin_lock_irqsave(&mq->lock, flags);
+ __smq_set_clear_dirty(mq, cblock, false);
+ spin_unlock_irqrestore(&mq->lock, flags);
}
-static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
- dm_cblock_t *cblock, bool critical_only)
+/*
+ * Derives a level from a hash of the cblock, masked with
+ * NR_CACHE_LEVELS - 1. The result depends only on cblock, so it is
+ * repeatable rather than truly random.
+ */
+static unsigned random_level(dm_cblock_t cblock)
{
- struct entry *e = NULL;
- bool target_met = clean_target_met(mq, critical_only);
-
- if (critical_only)
- /*
- * Always try and keep the bottom level clean.
- */
- e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
+ return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
+}
- else
- e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
+/*
+ * Policy 'load_mapping' method: installs an existing oblock -> cblock
+ * mapping. The level comes from the hint when valid (clamped to the top
+ * cache level), otherwise from random_level(). Always returns 0.
+ *
+ * NOTE(review): mq->lock is not taken here and the result of
+ * alloc_particular_entry() is used unchecked — presumably this only runs
+ * during single-threaded metadata load; confirm against the caller.
+ */
+static int smq_load_mapping(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ bool dirty, uint32_t hint, bool hint_valid)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e;
- if (!e)
- return -ENODATA;
+ e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
+ e->oblock = oblock;
+ e->dirty = dirty;
+ e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
+ e->pending_work = false;
- *oblock = e->oblock;
- *cblock = infer_cblock(mq, e);
- e->dirty = false;
- push_new(mq, e);
+ /*
+ * When we load mappings we push ahead of both sentinels in order to
+ * allow demotions and cleaning to occur immediately.
+ */
+ push_front(mq, e);
return 0;
}
-static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
- dm_cblock_t *cblock, bool critical_only)
+/*
+ * Policy 'invalidate_mapping' method: drops the mapping held in cblock.
+ * Returns -ENODATA if the entry is not allocated, 0 once the entry has been
+ * removed from its queue and the hash table and freed.
+ *
+ * NOTE(review): mq->lock is not taken here. Also, an entry with pending
+ * background work is on no queue (see the FIXME below), so del_queue()
+ * would be applied to an unlinked entry in that case — confirm callers
+ * cannot reach this while work is outstanding.
+ */
+static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- int r;
- unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
- spin_lock_irqsave(&mq->lock, flags);
- r = __smq_writeback_work(mq, oblock, cblock, critical_only);
- spin_unlock_irqrestore(&mq->lock, flags);
-
- return r;
-}
-
-static void __force_mapping(struct smq_policy *mq,
- dm_oblock_t current_oblock, dm_oblock_t new_oblock)
-{
- struct entry *e = h_lookup(&mq->table, current_oblock);
+ if (!e->allocated)
+ return -ENODATA;
- if (e) {
- del(mq, e);
- e->oblock = new_oblock;
- e->dirty = true;
- push(mq, e);
- }
+ // FIXME: what if this block has pending background work?
+ del_queue(mq, e);
+ h_remove(&mq->table, e);
+ free_entry(&mq->cache_alloc, e);
+ return 0;
}
-static void smq_force_mapping(struct dm_cache_policy *p,
- dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+/*
+ * Policy 'get_hint' method: the hint for a cblock is the entry's current
+ * level, or 0 when the entry is not allocated. No lock is taken.
+ */
+static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
- spin_lock_irqsave(&mq->lock, flags);
- __force_mapping(mq, current_oblock, new_oblock);
- spin_unlock_irqrestore(&mq->lock, flags);
+ if (!e->allocated)
+ return 0;
+
+ return e->level;
}
static dm_cblock_t smq_residency(struct dm_cache_policy *p)
spin_unlock_irqrestore(&mq->lock, flags);
}
+/*
+ * Policy 'allow_migrations' method: records whether promotions and
+ * demotions may be queued. The flag is stored without taking mq->lock.
+ */
+static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
+{
+	to_smq_policy(p)->migrations_allowed = allow;
+}
+
/*
* smq has no config values, but the old mq policy did. To avoid breaking
* software we continue to accept these configurables for the mq policy,
static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
{
mq->policy.destroy = smq_destroy;
- mq->policy.map = smq_map;
mq->policy.lookup = smq_lookup;
+ mq->policy.lookup_with_work = smq_lookup_with_work;
+ mq->policy.get_background_work = smq_get_background_work;
+ mq->policy.complete_background_work = smq_complete_background_work;
mq->policy.set_dirty = smq_set_dirty;
mq->policy.clear_dirty = smq_clear_dirty;
mq->policy.load_mapping = smq_load_mapping;
+ mq->policy.invalidate_mapping = smq_invalidate_mapping;
mq->policy.get_hint = smq_get_hint;
- mq->policy.remove_mapping = smq_remove_mapping;
- mq->policy.remove_cblock = smq_remove_cblock;
- mq->policy.writeback_work = smq_writeback_work;
- mq->policy.force_mapping = smq_force_mapping;
mq->policy.residency = smq_residency;
mq->policy.tick = smq_tick;
+ mq->policy.allow_migrations = smq_allow_migrations;
if (mimic_mq) {
mq->policy.set_config_value = mq_set_config_value;
static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size,
- bool mimic_mq)
+ bool mimic_mq,
+ bool migrations_allowed)
{
unsigned i;
unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
}
init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
- for (i = 0; i < nr_sentinels_per_queue; i++)
+ for (i = 0; i < nr_sentinels_per_queue; i++)
get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
- for (i = 0; i < nr_sentinels_per_queue; i++)
+ for (i = 0; i < nr_sentinels_per_queue; i++)
get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
mq->next_hotspot_period = jiffies;
mq->next_cache_period = jiffies;
+ mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
+ if (!mq->bg_work)
+ goto bad_btracker;
+
+ mq->migrations_allowed = migrations_allowed;
+
return &mq->policy;
+bad_btracker:
+ h_exit(&mq->hotspot_table);
bad_alloc_hotspot_table:
h_exit(&mq->table);
bad_alloc_table:
sector_t origin_size,
sector_t cache_block_size)
{
- return __smq_create(cache_size, origin_size, cache_block_size, false);
+ return __smq_create(cache_size, origin_size, cache_block_size, false, true);
}
static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size)
{
- return __smq_create(cache_size, origin_size, cache_block_size, true);
+	/* mimic_mq = true, migrations_allowed = true */
+ return __smq_create(cache_size, origin_size, cache_block_size, true, true);
+}
+
+/*
+ * Constructor for the "cleaner" policy: an smq instance that does not
+ * mimic mq and is created with migrations disallowed.
+ */
+static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
+					      sector_t origin_size,
+					      sector_t cache_block_size)
+{
+	const bool mimic_mq = false;
+	const bool migrations_allowed = false;
+
+	return __smq_create(cache_size, origin_size, cache_block_size,
+			    mimic_mq, migrations_allowed);
}
/*----------------------------------------------------------------*/
static struct dm_cache_policy_type smq_policy_type = {
.name = "smq",
- .version = {1, 5, 0},
+ .version = {2, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = smq_create
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
- .version = {1, 5, 0},
+ .version = {2, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create,
};
+/*
+ * Registration record for the "cleaner" policy. Same version and hint
+ * size as the other policy types in this file; instances are built by
+ * cleaner_create().
+ */
+static struct dm_cache_policy_type cleaner_policy_type = {
+ .name = "cleaner",
+ .version = {2, 0, 0},
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = cleaner_create,
+};
+
static struct dm_cache_policy_type default_policy_type = {
.name = "default",
- .version = {1, 5, 0},
+ .version = {2, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = smq_create,
r = dm_cache_policy_register(&mq_policy_type);
if (r) {
DMERR("register failed (as mq) %d", r);
- dm_cache_policy_unregister(&smq_policy_type);
- return -ENOMEM;
+ goto out_mq;
+ }
+
+ r = dm_cache_policy_register(&cleaner_policy_type);
+ if (r) {
+ DMERR("register failed (as cleaner) %d", r);
+ goto out_cleaner;
}
r = dm_cache_policy_register(&default_policy_type);
if (r) {
DMERR("register failed (as default) %d", r);
- dm_cache_policy_unregister(&mq_policy_type);
- dm_cache_policy_unregister(&smq_policy_type);
- return -ENOMEM;
+ goto out_default;
}
return 0;
+
+out_default:
+ dm_cache_policy_unregister(&cleaner_policy_type);
+out_cleaner:
+ dm_cache_policy_unregister(&mq_policy_type);
+out_mq:
+ dm_cache_policy_unregister(&smq_policy_type);
+
+ return -ENOMEM;
}
static void __exit smq_exit(void)
{
+ dm_cache_policy_unregister(&cleaner_policy_type);
dm_cache_policy_unregister(&smq_policy_type);
dm_cache_policy_unregister(&mq_policy_type);
dm_cache_policy_unregister(&default_policy_type);
MODULE_ALIAS("dm-cache-default");
MODULE_ALIAS("dm-cache-mq");
+MODULE_ALIAS("dm-cache-cleaner");
*/
#include "dm.h"
-#include "dm-bio-prison-v1.h"
+#include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
+#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
/*----------------------------------------------------------------*/
-#define IOT_RESOLUTION 4
+/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ * either direction
+ */
+
+/*----------------------------------------------------------------*/
struct io_tracker {
spinlock_t lock;
/*----------------------------------------------------------------*/
/*
- * Glossary:
- *
- * oblock: index of an origin block
- * cblock: index of a cache block
- * promotion: movement of a block from origin to cache
- * demotion: movement of a block from cache to origin
- * migration: movement of a block between the origin and cache device,
- * either direction
+ * Represents a chunk of future work. 'input' allows continuations to pass
+ * values between themselves, typically error values.
*/
+struct continuation {
+ struct work_struct ws; /* work item that is queued to run this continuation */
+ int input; /* value passed in by whoever queues it, e.g. __commit() stores the commit result here */
+};
+
+/*
+ * Prepare a continuation: 'fn' becomes the function run when the work item
+ * is queued, and the input value starts out as zero.
+ */
+static inline void init_continuation(struct continuation *k,
+				     void (*fn)(struct work_struct *))
+{
+	k->input = 0;
+	INIT_WORK(&k->ws, fn);
+}
+
+/* Hand the continuation's work item to the workqueue 'wq'. */
+static inline void queue_continuation(struct workqueue_struct *wq,
+				      struct continuation *k)
+{
+	struct work_struct *item = &k->ws;
+
+	queue_work(wq, item);
+}
/*----------------------------------------------------------------*/
+/*
+ * The batcher collects together pieces of work that need a particular
+ * operation to occur before they can proceed (typically a commit).
+ * Continuations and bios accumulate on the lists below; __commit() takes
+ * them off, runs commit_op once and then releases them.
+ */
+struct batcher {
+ /*
+ * The operation that everyone is waiting for. It is called with
+ * commit_context, and its return value is handed to every waiting
+ * continuation and decides whether waiting bios are issued or errored.
+ */
+ int (*commit_op)(void *context);
+ void *commit_context;
+
+ /*
+ * This is how bios should be issued once the commit op is complete
+ * (accounted_request). It is called with issue_context, and only
+ * when commit_op returned zero.
+ */
+ void (*issue_op)(struct bio *bio, void *context);
+ void *issue_context;
+
+ /*
+ * Queued work gets put on here after commit. commit_work itself is
+ * also queued on this workqueue.
+ */
+ struct workqueue_struct *wq;
+
+ spinlock_t lock; /* taken around accesses to work_items, bios and commit_scheduled */
+ struct list_head work_items; /* continuations waiting for the next commit */
+ struct bio_list bios; /* bios waiting for the next commit */
+ struct work_struct commit_work; /* runs __commit() */
+
+ bool commit_scheduled; /* set by schedule_commit(), cleared by __commit() */
+};
+
+/*
+ * Work function behind batcher->commit_work.  It takes everything that is
+ * currently waiting off the batcher, runs the commit operation once, and
+ * then releases the waiters: each continuation is queued with the commit
+ * result as its input, and each bio is either issued or, if the commit
+ * failed, completed with that error.
+ */
+static void __commit(struct work_struct *_ws)
+{
+	struct batcher *b = container_of(_ws, struct batcher, commit_work);
+	struct work_struct *item, *next;
+	struct continuation *k;
+	struct list_head pending_work;
+	struct bio_list pending_bios;
+	struct bio *bio;
+	unsigned long flags;
+	int err;
+
+	INIT_LIST_HEAD(&pending_work);
+	bio_list_init(&pending_bios);
+
+	/*
+	 * The waiters must be taken off the batcher before commit_op runs,
+	 * otherwise there is a race condition.
+	 */
+	spin_lock_irqsave(&b->lock, flags);
+	list_splice_init(&b->work_items, &pending_work);
+	bio_list_merge(&pending_bios, &b->bios);
+	bio_list_init(&b->bios);
+	b->commit_scheduled = false;
+	spin_unlock_irqrestore(&b->lock, flags);
+
+	err = b->commit_op(b->commit_context);
+
+	list_for_each_entry_safe(item, next, &pending_work, entry) {
+		k = container_of(item, struct continuation, ws);
+		k->input = err;
+		INIT_LIST_HEAD(&item->entry); /* to avoid a WARN_ON */
+		queue_work(b->wq, item);
+	}
+
+	for (bio = bio_list_pop(&pending_bios); bio;
+	     bio = bio_list_pop(&pending_bios)) {
+		if (!err) {
+			b->issue_op(bio, b->issue_context);
+			continue;
+		}
+
+		/* The commit failed, so fail the bio with the same error. */
+		bio->bi_error = err;
+		bio_endio(bio);
+	}
+}
+
+/*
+ * Set up an empty batcher.  The commit and issue callbacks (with their
+ * contexts) and the workqueue are recorded as given; nothing is waiting
+ * and no commit has been scheduled.
+ */
+static void batcher_init(struct batcher *b,
+			 int (*commit_op)(void *),
+			 void *commit_context,
+			 void (*issue_op)(struct bio *bio, void *),
+			 void *issue_context,
+			 struct workqueue_struct *wq)
+{
+	/* Waiter lists start empty, with no commit requested. */
+	spin_lock_init(&b->lock);
+	INIT_LIST_HEAD(&b->work_items);
+	bio_list_init(&b->bios);
+	b->commit_scheduled = false;
+	INIT_WORK(&b->commit_work, __commit);
+
+	/* Callbacks and where to run them. */
+	b->wq = wq;
+	b->commit_op = commit_op;
+	b->commit_context = commit_context;
+	b->issue_op = issue_op;
+	b->issue_context = issue_context;
+}
+
+/* Queue the batcher's commit work item on the batcher's workqueue. */
+static void async_commit(struct batcher *b)
+{
+	struct work_struct *commit = &b->commit_work;
+
+	queue_work(b->wq, commit);
+}
+
+/*
+ * Park continuation 'k' on the batcher until the next commit has run.  If
+ * a commit had already been scheduled when 'k' was added, the commit work
+ * is queued straight away.
+ */
+static void continue_after_commit(struct batcher *b, struct continuation *k)
+{
+	unsigned long flags;
+	bool kick;
+
+	spin_lock_irqsave(&b->lock, flags);
+	list_add_tail(&k->ws.entry, &b->work_items);
+	kick = b->commit_scheduled;
+	spin_unlock_irqrestore(&b->lock, flags);
+
+	if (!kick)
+		return;
+
+	async_commit(b);
+}
+
+/*
+ * Park 'bio' on the batcher until the next commit has run.  If a commit
+ * had already been scheduled when the bio was added, the commit work is
+ * queued straight away.
+ *
+ * Bios are errored if commit failed.
+ */
+static void issue_after_commit(struct batcher *b, struct bio *bio)
+{
+	unsigned long flags;
+	bool kick;
+
+	spin_lock_irqsave(&b->lock, flags);
+	bio_list_add(&b->bios, bio);
+	kick = b->commit_scheduled;
+	spin_unlock_irqrestore(&b->lock, flags);
+
+	if (!kick)
+		return;
+
+	async_commit(b);
+}
+
+/*
+ * Call this if some urgent work is waiting for the commit to complete.
+ *
+ * The batcher is marked as having a commit scheduled.  The commit work is
+ * only queued here if something is already waiting; otherwise it is queued
+ * when the first continuation or bio is added.
+ */
+static void schedule_commit(struct batcher *b)
+{
+	unsigned long flags;
+	bool have_waiters;
+
+	spin_lock_irqsave(&b->lock, flags);
+	b->commit_scheduled = true;
+	have_waiters = !(list_empty(&b->work_items) && bio_list_empty(&b->bios));
+	spin_unlock_irqrestore(&b->lock, flags);
+
+	if (have_waiters)
+		async_commit(b);
+}
+
/*
* There are a couple of places where we let a bio run, but want to do some
* work before calling its endio function. We do this by temporarily
atomic_t write_miss;
atomic_t demotion;
atomic_t promotion;
+ atomic_t writeback;
atomic_t copies_avoided;
atomic_t cache_cell_clash;
atomic_t commit_count;
atomic_t discard_count;
};
-/*
- * Defines a range of cblocks, begin to (end - 1) are in the range. end is
- * the one-past-the-end value.
- */
-struct cblock_range {
- dm_cblock_t begin;
- dm_cblock_t end;
-};
-
-struct invalidation_request {
- struct list_head list;
- struct cblock_range *cblocks;
-
- atomic_t complete;
- int err;
-
- wait_queue_head_t result_wait;
-};
-
struct cache {
struct dm_target *ti;
struct dm_target_callbacks callbacks;
spinlock_t lock;
struct list_head deferred_cells;
struct bio_list deferred_bios;
- struct bio_list deferred_flush_bios;
struct bio_list deferred_writethrough_bios;
- struct list_head quiesced_migrations;
- struct list_head completed_migrations;
- struct list_head need_commit_migrations;
sector_t migration_threshold;
wait_queue_head_t migration_wait;
atomic_t nr_allocated_migrations;
*/
atomic_t nr_io_migrations;
- wait_queue_head_t quiescing_wait;
- atomic_t quiescing;
- atomic_t quiescing_ack;
+ struct rw_semaphore quiesce_lock;
/*
* cache_size entries, dirty if set
struct dm_kcopyd_client *copier;
struct workqueue_struct *wq;
- struct work_struct worker;
-
+ struct work_struct deferred_bio_worker;
+ struct work_struct deferred_writethrough_worker;
+ struct work_struct migration_worker;
struct delayed_work waker;
- unsigned long last_commit_jiffies;
-
- struct dm_bio_prison *prison;
- struct dm_deferred_set *all_io_ds;
+ struct dm_bio_prison_v2 *prison;
mempool_t *migration_pool;
struct list_head invalidation_requests;
struct io_tracker origin_tracker;
+
+ struct work_struct commit_ws;
+ struct batcher committer;
+
+ struct rw_semaphore background_work_lock;
};
struct per_bio_data {
bool tick:1;
unsigned req_nr:2;
- struct dm_deferred_entry *all_io_entry;
+ struct dm_bio_prison_cell_v2 *cell;
struct dm_hook_info hook_info;
sector_t len;
};
struct dm_cache_migration {
- struct list_head list;
+ struct continuation k;
struct cache *cache;
- unsigned long start_jiffies;
- dm_oblock_t old_oblock;
- dm_oblock_t new_oblock;
- dm_cblock_t cblock;
-
- bool err:1;
- bool discard:1;
- bool writeback:1;
- bool demote:1;
- bool promote:1;
- bool requeue_holder:1;
- bool invalidate:1;
+ struct policy_work *op;
+ struct bio *overwrite_bio;
+ struct dm_bio_prison_cell_v2 *cell;
- struct dm_bio_prison_cell *old_ocell;
- struct dm_bio_prison_cell *new_ocell;
+ dm_cblock_t invalidate_cblock;
+ dm_oblock_t invalidate_oblock;
};
-/*
- * Processing a bio in the worker thread may require these memory
- * allocations. We prealloc to avoid deadlocks (the same worker thread
- * frees them back to the mempool).
- */
-struct prealloc {
- struct dm_cache_migration *mg;
- struct dm_bio_prison_cell *cell1;
- struct dm_bio_prison_cell *cell2;
-};
+/*----------------------------------------------------------------*/
+
+/* True when the feature set selects writethrough io mode. */
+static bool writethrough_mode(struct cache_features *f)
+{
+	if (f->io_mode != CM_IO_WRITETHROUGH)
+		return false;
+
+	return true;
+}
+
+/* True when the feature set selects writeback io mode. */
+static bool writeback_mode(struct cache_features *f)
+{
+	if (f->io_mode != CM_IO_WRITEBACK)
+		return false;
+
+	return true;
+}
+
+/*
+ * True when the feature set selects passthrough io mode.  The comparison
+ * is annotated as unlikely to be true.
+ */
+static inline bool passthrough_mode(struct cache_features *f)
+{
+	if (unlikely(f->io_mode == CM_IO_PASSTHROUGH))
+		return true;
+
+	return false;
+}
+
+/*----------------------------------------------------------------*/
+
+/* Queue the deferred bio worker on the cache's workqueue. */
+static void wake_deferred_bio_worker(struct cache *cache)
+{
+	struct work_struct *worker = &cache->deferred_bio_worker;
+
+	queue_work(cache->wq, worker);
+}
-static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+/* Queue the deferred writethrough worker on the cache's workqueue. */
+static void wake_deferred_writethrough_worker(struct cache *cache)
+{
+	struct work_struct *worker = &cache->deferred_writethrough_worker;
+
+	queue_work(cache->wq, worker);
+}
-static void wake_worker(struct cache *cache)
+static void wake_migration_worker(struct cache *cache)
{
- queue_work(cache->wq, &cache->worker);
+ if (passthrough_mode(&cache->features))
+ return;
+
+ queue_work(cache->wq, &cache->migration_worker);
}
/*----------------------------------------------------------------*/
-static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
+static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
{
- /* FIXME: change to use a local slab. */
- return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
+ return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
}
-static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
+static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
{
- dm_bio_prison_free_cell(cache->prison, cell);
+ dm_bio_prison_free_cell_v2(cache->prison, cell);
}
static struct dm_cache_migration *alloc_migration(struct cache *cache)
mempool_free(mg, cache->migration_pool);
}
-static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
-{
- if (!p->mg) {
- p->mg = alloc_migration(cache);
- if (!p->mg)
- return -ENOMEM;
- }
-
- if (!p->cell1) {
- p->cell1 = alloc_prison_cell(cache);
- if (!p->cell1)
- return -ENOMEM;
- }
-
- if (!p->cell2) {
- p->cell2 = alloc_prison_cell(cache);
- if (!p->cell2)
- return -ENOMEM;
- }
-
- return 0;
-}
+/*----------------------------------------------------------------*/
-static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
+static inline dm_oblock_t oblock_succ(dm_oblock_t b)
{
- if (p->cell2)
- free_prison_cell(cache, p->cell2);
-
- if (p->cell1)
- free_prison_cell(cache, p->cell1);
-
- if (p->mg)
- free_migration(p->mg);
+ return to_oblock(from_oblock(b) + 1ull);
}
-static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
+static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
{
- struct dm_cache_migration *mg = p->mg;
-
- BUG_ON(!mg);
- p->mg = NULL;
-
- return mg;
+ key->virtual = 0;
+ key->dev = 0;
+ key->block_begin = from_oblock(begin);
+ key->block_end = from_oblock(end);
}
/*
- * You must have a cell within the prealloc struct to return. If not this
- * function will BUG() rather than returning NULL.
+ * We have two lock levels. Level 0, which is used to prevent WRITEs, and
+ * level 1 which prevents *both* READs and WRITEs.
*/
-static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
+#define WRITE_LOCK_LEVEL 0
+#define READ_WRITE_LOCK_LEVEL 1
+
+static unsigned lock_level(struct bio *bio)
{
- struct dm_bio_prison_cell *r = NULL;
+ return bio_data_dir(bio) == WRITE ?
+ WRITE_LOCK_LEVEL :
+ READ_WRITE_LOCK_LEVEL;
+}
- if (p->cell1) {
- r = p->cell1;
- p->cell1 = NULL;
+/*----------------------------------------------------------------
+ * Per bio data
+ *--------------------------------------------------------------*/
- } else if (p->cell2) {
- r = p->cell2;
- p->cell2 = NULL;
- } else
- BUG();
+/*
+ * If using writeback, leave out struct per_bio_data's writethrough fields.
+ */
+#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
+#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
- return r;
+static size_t get_per_bio_data_size(struct cache *cache)
+{
+ return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}
-/*
- * You can't have more than two cells in a prealloc struct. BUG() will be
- * called if you try and overfill.
- */
-static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
+static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
- if (!p->cell2)
- p->cell2 = cell;
+ struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
+ BUG_ON(!pb);
+ return pb;
+}
- else if (!p->cell1)
- p->cell1 = cell;
+static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio, data_size);
- else
- BUG();
+ pb->tick = false;
+ pb->req_nr = dm_bio_get_target_bio_nr(bio);
+ pb->cell = NULL;
+ pb->len = 0;
+
+ return pb;
}
/*----------------------------------------------------------------*/
-static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
+static void defer_bio(struct cache *cache, struct bio *bio)
{
- key->virtual = 0;
- key->dev = 0;
- key->block_begin = from_oblock(begin);
- key->block_end = from_oblock(end);
-}
+ unsigned long flags;
-/*
- * The caller hands in a preallocated cell, and a free function for it.
- * The cell will be freed if there's an error, or if it wasn't used because
- * a cell with that key already exists.
- */
-typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_add(&cache->deferred_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_deferred_bio_worker(cache);
+}
-static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
- struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
- cell_free_fn free_fn, void *free_context,
- struct dm_bio_prison_cell **cell_result)
+static void defer_bios(struct cache *cache, struct bio_list *bios)
{
- int r;
- struct dm_cell_key key;
+ unsigned long flags;
- build_key(oblock_begin, oblock_end, &key);
- r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
- if (r)
- free_fn(free_context, cell_prealloc);
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&cache->deferred_bios, bios);
+ bio_list_init(bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
- return r;
+ wake_deferred_bio_worker(cache);
}
-static int bio_detain(struct cache *cache, dm_oblock_t oblock,
- struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
- cell_free_fn free_fn, void *free_context,
- struct dm_bio_prison_cell **cell_result)
+/*----------------------------------------------------------------*/
+
+static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
{
+ bool r;
+ size_t pb_size;
+ struct per_bio_data *pb;
+ struct dm_cell_key_v2 key;
dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
- return bio_detain_range(cache, oblock, end, bio,
- cell_prealloc, free_fn, free_context, cell_result);
-}
+ struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
-static int get_cell(struct cache *cache,
- dm_oblock_t oblock,
- struct prealloc *structs,
- struct dm_bio_prison_cell **cell_result)
-{
- int r;
- struct dm_cell_key key;
- struct dm_bio_prison_cell *cell_prealloc;
+ cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
+ if (!cell_prealloc) {
+ defer_bio(cache, bio);
+ return false;
+ }
+
+ build_key(oblock, end, &key);
+ r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
+ if (!r) {
+ /*
+ * Failed to get the lock.
+ */
+ free_prison_cell(cache, cell_prealloc);
+ return r;
+ }
- cell_prealloc = prealloc_get_cell(structs);
+ if (cell != cell_prealloc)
+ free_prison_cell(cache, cell_prealloc);
- build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
- r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
- if (r)
- prealloc_put_cell(structs, cell_prealloc);
+ pb_size = get_per_bio_data_size(cache);
+ pb = get_per_bio_data(bio, pb_size);
+ pb->cell = cell;
return r;
}
return test_bit(from_cblock(b), cache->dirty_bitset);
}
-static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+static void set_dirty(struct cache *cache, dm_cblock_t cblock)
{
if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
atomic_inc(&cache->nr_dirty);
- policy_set_dirty(cache->policy, oblock);
+ policy_set_dirty(cache->policy, cblock);
}
}
-static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+/*
+ * These two are called when setting the dirty state after migrations, to
+ * force the policy and dirty bitset to be in sync.
+ */
+/*
+ * Mark 'cblock' dirty in the dirty bitset, bumping nr_dirty only if the bit
+ * was not already set, and always tell the policy that the block is dirty.
+ */
+static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
+{
+	bool was_dirty = test_and_set_bit(from_cblock(cblock), cache->dirty_bitset);
+
+	if (!was_dirty)
+		atomic_inc(&cache->nr_dirty);
+
+	/* The policy is told unconditionally, whatever the bitset said. */
+	policy_set_dirty(cache->policy, cblock);
+}
+
+static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
{
if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
- policy_clear_dirty(cache->policy, oblock);
if (atomic_dec_return(&cache->nr_dirty) == 0)
dm_table_event(cache->ti->table);
}
+
+ policy_clear_dirty(cache->policy, cblock);
}
/*----------------------------------------------------------------*/
oblocks_per_dblock(cache)));
}
-static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
-{
- return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
-}
-
static void set_discard(struct cache *cache, dm_dblock_t b)
{
unsigned long flags;
return r;
}
-/*----------------------------------------------------------------*/
-
-static void load_stats(struct cache *cache)
+/*----------------------------------------------------------------
+ * Remapping
+ *--------------------------------------------------------------*/
+static void remap_to_origin(struct cache *cache, struct bio *bio)
{
- struct dm_cache_statistics stats;
-
- dm_cache_metadata_get_stats(cache->cmd, &stats);
- atomic_set(&cache->stats.read_hit, stats.read_hits);
- atomic_set(&cache->stats.read_miss, stats.read_misses);
- atomic_set(&cache->stats.write_hit, stats.write_hits);
- atomic_set(&cache->stats.write_miss, stats.write_misses);
-}
-
-static void save_stats(struct cache *cache)
-{
- struct dm_cache_statistics stats;
-
- if (get_cache_mode(cache) >= CM_READ_ONLY)
- return;
-
- stats.read_hits = atomic_read(&cache->stats.read_hit);
- stats.read_misses = atomic_read(&cache->stats.read_miss);
- stats.write_hits = atomic_read(&cache->stats.write_hit);
- stats.write_misses = atomic_read(&cache->stats.write_miss);
-
- dm_cache_metadata_set_stats(cache->cmd, &stats);
-}
-
-/*----------------------------------------------------------------
- * Per bio data
- *--------------------------------------------------------------*/
-
-/*
- * If using writeback, leave out struct per_bio_data's writethrough fields.
- */
-#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
-#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
-
-static bool writethrough_mode(struct cache_features *f)
-{
- return f->io_mode == CM_IO_WRITETHROUGH;
-}
-
-static bool writeback_mode(struct cache_features *f)
-{
- return f->io_mode == CM_IO_WRITEBACK;
-}
-
-static bool passthrough_mode(struct cache_features *f)
-{
- return f->io_mode == CM_IO_PASSTHROUGH;
-}
-
-static size_t get_per_bio_data_size(struct cache *cache)
-{
- return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
-}
-
-static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
-{
- struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
- BUG_ON(!pb);
- return pb;
-}
-
-static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
-{
- struct per_bio_data *pb = get_per_bio_data(bio, data_size);
-
- pb->tick = false;
- pb->req_nr = dm_bio_get_target_bio_nr(bio);
- pb->all_io_entry = NULL;
- pb->len = 0;
-
- return pb;
-}
-
-/*----------------------------------------------------------------
- * Remapping
- *--------------------------------------------------------------*/
-static void remap_to_origin(struct cache *cache, struct bio *bio)
-{
- bio->bi_bdev = cache->origin_dev->bdev;
+ bio->bi_bdev = cache->origin_dev->bdev;
}
static void remap_to_cache(struct cache *cache, struct bio *bio,
}
static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
- dm_oblock_t oblock)
+ dm_oblock_t oblock)
{
+ // FIXME: this is called way too much.
check_if_tick_bio_needed(cache, bio);
remap_to_origin(cache, bio);
if (bio_data_dir(bio) == WRITE)
check_if_tick_bio_needed(cache, bio);
remap_to_cache(cache, bio, cblock);
if (bio_data_dir(bio) == WRITE) {
- set_dirty(cache, oblock, cblock);
+ set_dirty(cache, cblock);
clear_discard(cache, oblock_to_dblock(cache, oblock));
}
}
return to_oblock(block_nr);
}
-/*
- * You must increment the deferred set whilst the prison cell is held. To
- * encourage this, we ask for 'cell' to be passed in.
- */
-static void inc_ds(struct cache *cache, struct bio *bio,
- struct dm_bio_prison_cell *cell)
-{
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
-
- BUG_ON(!cell);
- BUG_ON(pb->all_io_entry);
-
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-}
-
static bool accountable_bio(struct cache *cache, struct bio *bio)
{
return ((bio->bi_bdev == cache->origin_dev->bdev) &&
generic_make_request(bio);
}
-static void issue(struct cache *cache, struct bio *bio)
-{
- unsigned long flags;
-
- if (!op_is_flush(bio->bi_opf)) {
- accounted_request(cache, bio);
- return;
- }
-
- /*
- * Batch together any bios that trigger commits and then issue a
- * single commit for them in do_worker().
- */
- spin_lock_irqsave(&cache->lock, flags);
- cache->commit_requested = true;
- bio_list_add(&cache->deferred_flush_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
-}
-
-static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
+static void issue_op(struct bio *bio, void *context)
{
- inc_ds(cache, bio, cell);
- issue(cache, bio);
+ struct cache *cache = context;
+ accounted_request(cache, bio);
}
static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
bio_list_add(&cache->deferred_writethrough_bios, bio);
spin_unlock_irqrestore(&cache->lock, flags);
- wake_worker(cache);
+ wake_deferred_writethrough_worker(cache);
}
static void writethrough_endio(struct bio *bio)
}
/*
+ * FIXME: send in parallel, huge latency as is.
* When running in writethrough mode we need to send writes to clean blocks
* to both the cache and origin devices. In future we'd like to clone the
* bio and send them in parallel, but for now we're doing them in
set_cache_mode(cache, CM_READ_ONLY);
}
+/*----------------------------------------------------------------*/
+
+/*
+ * Set the in-core read/write hit and miss counters from the statistics
+ * returned by the cache metadata.
+ */
+static void load_stats(struct cache *cache)
+{
+	struct dm_cache_statistics loaded;
+
+	dm_cache_metadata_get_stats(cache->cmd, &loaded);
+
+	atomic_set(&cache->stats.write_miss, loaded.write_misses);
+	atomic_set(&cache->stats.write_hit, loaded.write_hits);
+	atomic_set(&cache->stats.read_miss, loaded.read_misses);
+	atomic_set(&cache->stats.read_hit, loaded.read_hits);
+}
+
+/*
+ * Pass the current read/write hit and miss counters to the cache metadata.
+ * Nothing is done when the cache mode is CM_READ_ONLY or beyond.
+ */
+static void save_stats(struct cache *cache)
+{
+	struct dm_cache_statistics snapshot;
+
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return;
+
+	snapshot.write_misses = atomic_read(&cache->stats.write_miss);
+	snapshot.write_hits = atomic_read(&cache->stats.write_hit);
+	snapshot.read_misses = atomic_read(&cache->stats.read_miss);
+	snapshot.read_hits = atomic_read(&cache->stats.read_hit);
+
+	dm_cache_metadata_set_stats(cache->cmd, &snapshot);
+}
+
+/*
+ * Count one policy operation: bump the promotion, demotion or writeback
+ * counter that matches 'op'.
+ */
+static void update_stats(struct cache_stats *stats, enum policy_operation op)
+{
+	atomic_t *counter = NULL;
+
+	switch (op) {
+	case POLICY_PROMOTE:
+		counter = &stats->promotion;
+		break;
+
+	case POLICY_DEMOTE:
+		counter = &stats->demotion;
+		break;
+
+	case POLICY_WRITEBACK:
+		counter = &stats->writeback;
+		break;
+	}
+
+	/* Any other value leaves every counter untouched. */
+	if (counter)
+		atomic_inc(counter);
+}
+
/*----------------------------------------------------------------
* Migration processing
*
* Migration covers moving data from the origin device to the cache, or
* vice versa.
*--------------------------------------------------------------*/
+
static void inc_io_migrations(struct cache *cache)
{
atomic_inc(&cache->nr_io_migrations);
return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
}
-static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
-{
- if (discard_or_flush(cell->holder)) {
- /*
- * We have to handle these bios individually.
- */
- dm_cell_release(cache->prison, cell, &cache->deferred_bios);
- free_prison_cell(cache, cell);
- } else
- list_add_tail(&cell->user_list, &cache->deferred_cells);
-}
-
-static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
+static void calc_discard_block_range(struct cache *cache, struct bio *bio,
+ dm_dblock_t *b, dm_dblock_t *e)
{
- unsigned long flags;
-
- if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
- /*
- * There was no prisoner to promote to holder, the
- * cell has been released.
- */
- free_prison_cell(cache, cell);
- return;
- }
+ sector_t sb = bio->bi_iter.bi_sector;
+ sector_t se = bio_end_sector(bio);
- spin_lock_irqsave(&cache->lock, flags);
- __cell_defer(cache, cell);
- spin_unlock_irqrestore(&cache->lock, flags);
+ *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
- wake_worker(cache);
+ if (se - sb < cache->discard_block_size)
+ *e = *b;
+ else
+ *e = to_dblock(block_div(se, cache->discard_block_size));
}
-static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
-{
- dm_cell_error(cache->prison, cell, err);
- free_prison_cell(cache, cell);
-}
+/*----------------------------------------------------------------*/
-static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+static void prevent_background_work(struct cache *cache)
{
- cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+ lockdep_off();
+ down_write(&cache->background_work_lock);
+ lockdep_on();
}
-static void free_io_migration(struct dm_cache_migration *mg)
+static void allow_background_work(struct cache *cache)
{
- struct cache *cache = mg->cache;
-
- dec_io_migrations(cache);
- free_migration(mg);
- wake_worker(cache);
+ lockdep_off();
+ up_write(&cache->background_work_lock);
+ lockdep_on();
}
-static void migration_failure(struct dm_cache_migration *mg)
+static bool background_work_begin(struct cache *cache)
{
- struct cache *cache = mg->cache;
- const char *dev_name = cache_device_name(cache);
-
- if (mg->writeback) {
- DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
- set_dirty(cache, mg->old_oblock, mg->cblock);
- cell_defer(cache, mg->old_ocell, false);
-
- } else if (mg->demote) {
- DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
- policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
+ bool r;
- cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
- if (mg->promote)
- cell_defer(cache, mg->new_ocell, true);
- } else {
- DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
- policy_remove_mapping(cache->policy, mg->new_oblock);
- cell_defer(cache, mg->new_ocell, true);
- }
+ lockdep_off();
+ r = down_read_trylock(&cache->background_work_lock);
+ lockdep_on();
- free_io_migration(mg);
+ return r;
}
-static void migration_success_pre_commit(struct dm_cache_migration *mg)
+static void background_work_end(struct cache *cache)
{
- int r;
- unsigned long flags;
- struct cache *cache = mg->cache;
-
- if (mg->writeback) {
- clear_dirty(cache, mg->old_oblock, mg->cblock);
- cell_defer(cache, mg->old_ocell, false);
- free_io_migration(mg);
- return;
+ lockdep_off();
+ up_read(&cache->background_work_lock);
+ lockdep_on();
+}
- } else if (mg->demote) {
- r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
- if (r) {
- DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
- cache_device_name(cache));
- metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
- policy_force_mapping(cache->policy, mg->new_oblock,
- mg->old_oblock);
- if (mg->promote)
- cell_defer(cache, mg->new_ocell, true);
- free_io_migration(mg);
- return;
- }
- } else {
- r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
- if (r) {
- DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
- cache_device_name(cache));
- metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
- policy_remove_mapping(cache->policy, mg->new_oblock);
- free_io_migration(mg);
- return;
- }
- }
+/*----------------------------------------------------------------*/
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->need_commit_migrations);
- cache->commit_requested = true;
- spin_unlock_irqrestore(&cache->lock, flags);
+static void quiesce(struct dm_cache_migration *mg,
+ void (*continuation)(struct work_struct *))
+{
+ init_continuation(&mg->k, continuation);
+ dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
}
-static void migration_success_post_commit(struct dm_cache_migration *mg)
+static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
{
- unsigned long flags;
- struct cache *cache = mg->cache;
-
- if (mg->writeback) {
- DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
- cache_device_name(cache));
- return;
-
- } else if (mg->demote) {
- cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
-
- if (mg->promote) {
- mg->demote = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->quiesced_migrations);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- } else {
- if (mg->invalidate)
- policy_remove_mapping(cache->policy, mg->old_oblock);
- free_io_migration(mg);
- }
-
- } else {
- if (mg->requeue_holder) {
- clear_dirty(cache, mg->new_oblock, mg->cblock);
- cell_defer(cache, mg->new_ocell, true);
- } else {
- /*
- * The block was promoted via an overwrite, so it's dirty.
- */
- set_dirty(cache, mg->new_oblock, mg->cblock);
- bio_endio(mg->new_ocell->holder);
- cell_defer(cache, mg->new_ocell, false);
- }
- free_io_migration(mg);
- }
+ struct continuation *k = container_of(ws, struct continuation, ws);
+ return container_of(k, struct dm_cache_migration, k);
}
static void copy_complete(int read_err, unsigned long write_err, void *context)
{
- unsigned long flags;
- struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
- struct cache *cache = mg->cache;
+ struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
if (read_err || write_err)
- mg->err = true;
-
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->completed_migrations);
- spin_unlock_irqrestore(&cache->lock, flags);
+ mg->k.input = -EIO;
- wake_worker(cache);
+ queue_continuation(mg->cache->wq, &mg->k);
}
-static void issue_copy(struct dm_cache_migration *mg)
+static int copy(struct dm_cache_migration *mg, bool promote)
{
int r;
struct dm_io_region o_region, c_region;
struct cache *cache = mg->cache;
- sector_t cblock = from_cblock(mg->cblock);
o_region.bdev = cache->origin_dev->bdev;
+ o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
o_region.count = cache->sectors_per_block;
c_region.bdev = cache->cache_dev->bdev;
- c_region.sector = cblock * cache->sectors_per_block;
+ c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
c_region.count = cache->sectors_per_block;
- if (mg->writeback || mg->demote) {
- /* demote */
- o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
- r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
- } else {
- /* promote */
- o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
- r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
- }
+ if (promote)
+ r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
+ else
+ r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
- if (r < 0) {
- DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
- migration_failure(mg);
- }
+ return r;
+}
+
+static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
+ free_prison_cell(cache, pb->cell);
+ pb->cell = NULL;
}
static void overwrite_endio(struct bio *bio)
struct cache *cache = mg->cache;
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- unsigned long flags;
dm_unhook_bio(&pb->hook_info, bio);
if (bio->bi_error)
- mg->err = true;
-
- mg->requeue_holder = false;
+ mg->k.input = bio->bi_error;
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->completed_migrations);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- wake_worker(cache);
+ queue_continuation(mg->cache->wq, &mg->k);
}
-static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
+static void overwrite(struct dm_cache_migration *mg,
+ void (*continuation)(struct work_struct *))
{
+ struct bio *bio = mg->overwrite_bio;
size_t pb_data_size = get_per_bio_data_size(mg->cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
- remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
/*
- * No need to inc_ds() here, since the cell will be held for the
- * duration of the io.
+ * The overwrite bio is part of the copy operation, as such it does
+ * not set/clear discard or dirty flags.
*/
+ if (mg->op->op == POLICY_PROMOTE)
+ remap_to_cache(mg->cache, bio, mg->op->cblock);
+ else
+ remap_to_origin(mg->cache, bio);
+
+ init_continuation(&mg->k, continuation);
accounted_request(mg->cache, bio);
}
-static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+/*
+ * Migration steps:
+ *
+ * 1) exclusive lock preventing WRITEs
+ * 2) quiesce
+ * 3) copy or issue overwrite bio
+ * 4) upgrade to exclusive lock preventing READs and WRITEs
+ * 5) quiesce
+ * 6) update metadata and commit
+ * 7) unlock
+ */
+static void mg_complete(struct dm_cache_migration *mg, bool success)
{
- return (bio_data_dir(bio) == WRITE) &&
- (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+ struct bio_list bios;
+ struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
+ dm_cblock_t cblock = op->cblock;
+
+ if (success)
+ update_stats(&cache->stats, op->op);
+
+ switch (op->op) {
+ case POLICY_PROMOTE:
+ clear_discard(cache, oblock_to_dblock(cache, op->oblock));
+ policy_complete_background_work(cache->policy, op, success);
+
+ if (mg->overwrite_bio) {
+ if (success)
+ force_set_dirty(cache, cblock);
+ else
+ mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
+ bio_endio(mg->overwrite_bio);
+ } else {
+ if (success)
+ force_clear_dirty(cache, cblock);
+ dec_io_migrations(cache);
+ }
+ break;
+
+ case POLICY_DEMOTE:
+ /*
+ * We clear dirty here to update the nr_dirty counter.
+ */
+ if (success)
+ force_clear_dirty(cache, cblock);
+ policy_complete_background_work(cache->policy, op, success);
+ dec_io_migrations(cache);
+ break;
+
+ case POLICY_WRITEBACK:
+ if (success)
+ force_clear_dirty(cache, cblock);
+ policy_complete_background_work(cache->policy, op, success);
+ dec_io_migrations(cache);
+ break;
+ }
+
+ bio_list_init(&bios);
+ if (mg->cell) {
+ if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
+ free_prison_cell(cache, mg->cell);
+ }
+
+ free_migration(mg);
+ defer_bios(cache, &bios);
+ wake_migration_worker(cache);
+
+ background_work_end(cache);
}
-static void avoid_copy(struct dm_cache_migration *mg)
+static void mg_success(struct work_struct *ws)
{
- atomic_inc(&mg->cache->stats.copies_avoided);
- migration_success_pre_commit(mg);
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ mg_complete(mg, mg->k.input == 0);
}
-static void calc_discard_block_range(struct cache *cache, struct bio *bio,
- dm_dblock_t *b, dm_dblock_t *e)
+static void mg_update_metadata(struct work_struct *ws)
{
- sector_t sb = bio->bi_iter.bi_sector;
- sector_t se = bio_end_sector(bio);
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
- *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
+ switch (op->op) {
+ case POLICY_PROMOTE:
+ r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
+ if (r) {
+ DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
- if (se - sb < cache->discard_block_size)
- *e = *b;
- else
- *e = to_dblock(block_div(se, cache->discard_block_size));
-}
+ mg_complete(mg, false);
+ return;
+ }
+ mg_complete(mg, true);
+ break;
-static void issue_discard(struct dm_cache_migration *mg)
-{
- dm_dblock_t b, e;
- struct bio *bio = mg->new_ocell->holder;
- struct cache *cache = mg->cache;
+ case POLICY_DEMOTE:
+ r = dm_cache_remove_mapping(cache->cmd, op->cblock);
+ if (r) {
+ DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
- calc_discard_block_range(cache, bio, &b, &e);
- while (b != e) {
- set_discard(cache, b);
- b = to_dblock(from_dblock(b) + 1);
+ mg_complete(mg, false);
+ return;
+ }
+
+ /*
+ * It would be nice if we only had to commit when a REQ_FLUSH
+ * comes through. But there's one scenario that we have to
+ * look out for:
+ *
+ * - vblock x in a cache block
+ * - demotion occurs
+ * - cache block gets reallocated and overwritten
+ * - crash
+ *
+ * When we recover, because there was no commit the cache will
+ * roll back to having the data for vblock x in the cache block.
+ * But the cache block has since been overwritten, so it'll end
+ * up pointing to data that was never in 'x' during the history
+ * of the device.
+ *
+ * To avoid this issue we require a commit as part of the
+ * demotion operation.
+ */
+ init_continuation(&mg->k, mg_success);
+ continue_after_commit(&cache->committer, &mg->k);
+ schedule_commit(&cache->committer);
+ break;
+
+ case POLICY_WRITEBACK:
+ mg_complete(mg, true);
+ break;
}
+}
- bio_endio(bio);
- cell_defer(cache, mg->new_ocell, false);
- free_migration(mg);
- wake_worker(cache);
+static void mg_update_metadata_after_copy(struct work_struct *ws)
+{
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+
+ /*
+ * Did the copy succeed?
+ */
+ if (mg->k.input)
+ mg_complete(mg, false);
+ else
+ mg_update_metadata(ws);
}
-static void issue_copy_or_discard(struct dm_cache_migration *mg)
+static void mg_upgrade_lock(struct work_struct *ws)
{
- bool avoid;
- struct cache *cache = mg->cache;
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
- if (mg->discard) {
- issue_discard(mg);
- return;
- }
+ /*
+ * Did the copy succeed?
+ */
+ if (mg->k.input)
+ mg_complete(mg, false);
- if (mg->writeback || mg->demote)
- avoid = !is_dirty(cache, mg->cblock) ||
- is_discarded_oblock(cache, mg->old_oblock);
else {
- struct bio *bio = mg->new_ocell->holder;
+ /*
+ * Now we want the lock to prevent both reads and writes.
+ */
+ r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
+ READ_WRITE_LOCK_LEVEL);
+ if (r < 0)
+ mg_complete(mg, false);
- avoid = is_discarded_oblock(cache, mg->new_oblock);
+ else if (r)
+ quiesce(mg, mg_update_metadata);
- if (writeback_mode(&cache->features) &&
- !avoid && bio_writes_complete_block(cache, bio)) {
- issue_overwrite(mg, bio);
- return;
- }
+ else
+ mg_update_metadata(ws);
}
-
- avoid ? avoid_copy(mg) : issue_copy(mg);
}
-static void complete_migration(struct dm_cache_migration *mg)
+static void mg_copy(struct work_struct *ws)
{
- if (mg->err)
- migration_failure(mg);
- else
- migration_success_pre_commit(mg);
-}
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
-static void process_migrations(struct cache *cache, struct list_head *head,
- void (*fn)(struct dm_cache_migration *))
-{
- unsigned long flags;
- struct list_head list;
- struct dm_cache_migration *mg, *tmp;
+ if (mg->overwrite_bio) {
+ /*
+ * It's safe to do this here, even though it's new data
+ * because all IO has been locked out of the block.
+ *
+ * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
+ * so _not_ using mg_upgrade_lock() as continuation.
+ */
+ overwrite(mg, mg_update_metadata_after_copy);
- INIT_LIST_HEAD(&list);
- spin_lock_irqsave(&cache->lock, flags);
- list_splice_init(head, &list);
- spin_unlock_irqrestore(&cache->lock, flags);
+ } else {
+ struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
+ bool is_policy_promote = (op->op == POLICY_PROMOTE);
- list_for_each_entry_safe(mg, tmp, &list, list)
- fn(mg);
-}
+ if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
+ is_discarded_oblock(cache, op->oblock)) {
+ mg_upgrade_lock(ws);
+ return;
+ }
-static void __queue_quiesced_migration(struct dm_cache_migration *mg)
-{
- list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
+ init_continuation(&mg->k, mg_upgrade_lock);
+
+ r = copy(mg, is_policy_promote);
+ if (r) {
+ DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
+ mg->k.input = -EIO;
+ mg_complete(mg, false);
+ }
+ }
}
-static void queue_quiesced_migration(struct dm_cache_migration *mg)
+static int mg_lock_writes(struct dm_cache_migration *mg)
{
- unsigned long flags;
+ int r;
+ struct dm_cell_key_v2 key;
struct cache *cache = mg->cache;
+ struct dm_bio_prison_cell_v2 *prealloc;
- spin_lock_irqsave(&cache->lock, flags);
- __queue_quiesced_migration(mg);
- spin_unlock_irqrestore(&cache->lock, flags);
+ prealloc = alloc_prison_cell(cache);
+ if (!prealloc) {
+ DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
+ mg_complete(mg, false);
+ return -ENOMEM;
+ }
- wake_worker(cache);
-}
+ /*
+ * Prevent writes to the block, but allow reads to continue.
+ * Unless we're using an overwrite bio, in which case we lock
+ * everything.
+ */
+ build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
+ r = dm_cell_lock_v2(cache->prison, &key,
+ mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
+ prealloc, &mg->cell);
+ if (r < 0) {
+ free_prison_cell(cache, prealloc);
+ mg_complete(mg, false);
+ return r;
+ }
-static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
-{
- unsigned long flags;
- struct dm_cache_migration *mg, *tmp;
+ if (mg->cell != prealloc)
+ free_prison_cell(cache, prealloc);
- spin_lock_irqsave(&cache->lock, flags);
- list_for_each_entry_safe(mg, tmp, work, list)
- __queue_quiesced_migration(mg);
- spin_unlock_irqrestore(&cache->lock, flags);
+ if (r == 0)
+ mg_copy(&mg->k.ws);
+ else
+ quiesce(mg, mg_copy);
- wake_worker(cache);
+ return 0;
}
-static void check_for_quiesced_migrations(struct cache *cache,
- struct per_bio_data *pb)
+static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
{
- struct list_head work;
+ struct dm_cache_migration *mg;
- if (!pb->all_io_entry)
- return;
+ if (!background_work_begin(cache)) {
+ policy_complete_background_work(cache->policy, op, false);
+ return -EPERM;
+ }
- INIT_LIST_HEAD(&work);
- dm_deferred_entry_dec(pb->all_io_entry, &work);
+ mg = alloc_migration(cache);
+ if (!mg) {
+ policy_complete_background_work(cache->policy, op, false);
+ background_work_end(cache);
+ return -ENOMEM;
+ }
- if (!list_empty(&work))
- queue_quiesced_migrations(cache, &work);
-}
+ memset(mg, 0, sizeof(*mg));
-static void quiesce_migration(struct dm_cache_migration *mg)
-{
- if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
- queue_quiesced_migration(mg);
+ mg->cache = cache;
+ mg->op = op;
+ mg->overwrite_bio = bio;
+
+ if (!bio)
+ inc_io_migrations(cache);
+
+ return mg_lock_writes(mg);
}
-static void promote(struct cache *cache, struct prealloc *structs,
- dm_oblock_t oblock, dm_cblock_t cblock,
- struct dm_bio_prison_cell *cell)
+/*----------------------------------------------------------------
+ * invalidation processing
+ *--------------------------------------------------------------*/
+
+static void invalidate_complete(struct dm_cache_migration *mg, bool success)
{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
+ struct bio_list bios;
+ struct cache *cache = mg->cache;
- mg->err = false;
- mg->discard = false;
- mg->writeback = false;
- mg->demote = false;
- mg->promote = true;
- mg->requeue_holder = true;
- mg->invalidate = false;
- mg->cache = cache;
- mg->new_oblock = oblock;
- mg->cblock = cblock;
- mg->old_ocell = NULL;
- mg->new_ocell = cell;
- mg->start_jiffies = jiffies;
+ bio_list_init(&bios);
+ if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
+ free_prison_cell(cache, mg->cell);
- inc_io_migrations(cache);
- quiesce_migration(mg);
+ if (!success && mg->overwrite_bio)
+ bio_io_error(mg->overwrite_bio);
+
+ free_migration(mg);
+ defer_bios(cache, &bios);
+
+ background_work_end(cache);
}
-static void writeback(struct cache *cache, struct prealloc *structs,
- dm_oblock_t oblock, dm_cblock_t cblock,
- struct dm_bio_prison_cell *cell)
+static void invalidate_completed(struct work_struct *ws)
{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ invalidate_complete(mg, !mg->k.input);
+}
- mg->err = false;
- mg->discard = false;
- mg->writeback = true;
- mg->demote = false;
- mg->promote = false;
- mg->requeue_holder = true;
- mg->invalidate = false;
- mg->cache = cache;
- mg->old_oblock = oblock;
- mg->cblock = cblock;
- mg->old_ocell = cell;
- mg->new_ocell = NULL;
- mg->start_jiffies = jiffies;
-
- inc_io_migrations(cache);
- quiesce_migration(mg);
-}
-
-static void demote_then_promote(struct cache *cache, struct prealloc *structs,
- dm_oblock_t old_oblock, dm_oblock_t new_oblock,
- dm_cblock_t cblock,
- struct dm_bio_prison_cell *old_ocell,
- struct dm_bio_prison_cell *new_ocell)
-{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
-
- mg->err = false;
- mg->discard = false;
- mg->writeback = false;
- mg->demote = true;
- mg->promote = true;
- mg->requeue_holder = true;
- mg->invalidate = false;
- mg->cache = cache;
- mg->old_oblock = old_oblock;
- mg->new_oblock = new_oblock;
- mg->cblock = cblock;
- mg->old_ocell = old_ocell;
- mg->new_ocell = new_ocell;
- mg->start_jiffies = jiffies;
+static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
+{
+ int r = policy_invalidate_mapping(cache->policy, cblock);
+ if (!r) {
+ r = dm_cache_remove_mapping(cache->cmd, cblock);
+ if (r) {
+ DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
+ }
- inc_io_migrations(cache);
- quiesce_migration(mg);
-}
+ } else if (r == -ENODATA) {
+ /*
+ * Harmless, already unmapped.
+ */
+ r = 0;
-/*
- * Invalidate a cache entry. No writeback occurs; any changes in the cache
- * block are thrown away.
- */
-static void invalidate(struct cache *cache, struct prealloc *structs,
- dm_oblock_t oblock, dm_cblock_t cblock,
- struct dm_bio_prison_cell *cell)
-{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
-
- mg->err = false;
- mg->discard = false;
- mg->writeback = false;
- mg->demote = true;
- mg->promote = false;
- mg->requeue_holder = true;
- mg->invalidate = true;
- mg->cache = cache;
- mg->old_oblock = oblock;
- mg->cblock = cblock;
- mg->old_ocell = cell;
- mg->new_ocell = NULL;
- mg->start_jiffies = jiffies;
+ } else
+ DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
- inc_io_migrations(cache);
- quiesce_migration(mg);
+ return r;
}
-static void discard(struct cache *cache, struct prealloc *structs,
- struct dm_bio_prison_cell *cell)
+static void invalidate_remove(struct work_struct *ws)
{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ struct cache *cache = mg->cache;
- mg->err = false;
- mg->discard = true;
- mg->writeback = false;
- mg->demote = false;
- mg->promote = false;
- mg->requeue_holder = false;
- mg->invalidate = false;
- mg->cache = cache;
- mg->old_ocell = NULL;
- mg->new_ocell = cell;
- mg->start_jiffies = jiffies;
+ r = invalidate_cblock(cache, mg->invalidate_cblock);
+ if (r) {
+ invalidate_complete(mg, false);
+ return;
+ }
- quiesce_migration(mg);
+ init_continuation(&mg->k, invalidate_completed);
+ continue_after_commit(&cache->committer, &mg->k);
+ remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
+ mg->overwrite_bio = NULL;
+ schedule_commit(&cache->committer);
}
-/*----------------------------------------------------------------
- * bio processing
- *--------------------------------------------------------------*/
-static void defer_bio(struct cache *cache, struct bio *bio)
+static int invalidate_lock(struct dm_cache_migration *mg)
{
- unsigned long flags;
+ int r;
+ struct dm_cell_key_v2 key;
+ struct cache *cache = mg->cache;
+ struct dm_bio_prison_cell_v2 *prealloc;
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_add(&cache->deferred_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
+ prealloc = alloc_prison_cell(cache);
+ if (!prealloc) {
+ invalidate_complete(mg, false);
+ return -ENOMEM;
+ }
- wake_worker(cache);
-}
+ build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
+ r = dm_cell_lock_v2(cache->prison, &key,
+ READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
+ if (r < 0) {
+ free_prison_cell(cache, prealloc);
+ invalidate_complete(mg, false);
+ return r;
+ }
-static void process_flush_bio(struct cache *cache, struct bio *bio)
-{
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ if (mg->cell != prealloc)
+ free_prison_cell(cache, prealloc);
- BUG_ON(bio->bi_iter.bi_size);
- if (!pb->req_nr)
- remap_to_origin(cache, bio);
- else
- remap_to_cache(cache, bio, 0);
+ if (r)
+ quiesce(mg, invalidate_remove);
- /*
- * REQ_PREFLUSH is not directed at any particular block so we don't
- * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH
- * by dm-core.
- */
- issue(cache, bio);
+ else {
+ /*
+ * We can't call invalidate_remove() directly here because we
+ * might still be in request context.
+ */
+ init_continuation(&mg->k, invalidate_remove);
+ queue_work(cache->wq, &mg->k.ws);
+ }
+
+ return 0;
}
-static void process_discard_bio(struct cache *cache, struct prealloc *structs,
- struct bio *bio)
+static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
+ dm_oblock_t oblock, struct bio *bio)
{
- int r;
- dm_dblock_t b, e;
- struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+ struct dm_cache_migration *mg;
- calc_discard_block_range(cache, bio, &b, &e);
- if (b == e) {
- bio_endio(bio);
- return;
+ if (!background_work_begin(cache))
+ return -EPERM;
+
+ mg = alloc_migration(cache);
+ if (!mg) {
+ background_work_end(cache);
+ return -ENOMEM;
}
- cell_prealloc = prealloc_get_cell(structs);
- r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- structs, &new_ocell);
- if (r > 0)
- return;
+ memset(mg, 0, sizeof(*mg));
+
+ mg->cache = cache;
+ mg->overwrite_bio = bio;
+ mg->invalidate_cblock = cblock;
+ mg->invalidate_oblock = oblock;
- discard(cache, structs, new_ocell);
+ return invalidate_lock(mg);
}
-static bool spare_migration_bandwidth(struct cache *cache)
+/*----------------------------------------------------------------
+ * bio processing
+ *--------------------------------------------------------------*/
+
+enum busy {
+ IDLE,
+ MODERATE,
+ BUSY
+};
+
+static enum busy spare_migration_bandwidth(struct cache *cache)
{
+ bool idle = iot_idle_for(&cache->origin_tracker, HZ);
sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
cache->sectors_per_block;
- return current_volume < cache->migration_threshold;
+
+ if (current_volume <= cache->migration_threshold)
+ return idle ? IDLE : MODERATE;
+ else
+ return idle ? MODERATE : BUSY;
}
static void inc_hit_counter(struct cache *cache, struct bio *bio)
/*----------------------------------------------------------------*/
-struct inc_detail {
- struct cache *cache;
- struct bio_list bios_for_issue;
- struct bio_list unhandled_bios;
- bool any_writes;
-};
-
-static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
{
- struct bio *bio;
- struct inc_detail *detail = context;
- struct cache *cache = detail->cache;
-
- inc_ds(cache, cell->holder, cell);
- if (bio_data_dir(cell->holder) == WRITE)
- detail->any_writes = true;
-
- while ((bio = bio_list_pop(&cell->bios))) {
- if (discard_or_flush(bio)) {
- bio_list_add(&detail->unhandled_bios, bio);
- continue;
- }
-
- if (bio_data_dir(bio) == WRITE)
- detail->any_writes = true;
-
- bio_list_add(&detail->bios_for_issue, bio);
- inc_ds(cache, bio, cell);
- }
+ return (bio_data_dir(bio) == WRITE) &&
+ (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}
-// FIXME: refactor these two
-static void remap_cell_to_origin_clear_discard(struct cache *cache,
- struct dm_bio_prison_cell *cell,
- dm_oblock_t oblock, bool issue_holder)
+static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
{
- struct bio *bio;
- unsigned long flags;
- struct inc_detail detail;
-
- detail.cache = cache;
- bio_list_init(&detail.bios_for_issue);
- bio_list_init(&detail.unhandled_bios);
- detail.any_writes = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
- bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- remap_to_origin(cache, cell->holder);
- if (issue_holder)
- issue(cache, cell->holder);
- else
- accounted_begin(cache, cell->holder);
-
- if (detail.any_writes)
- clear_discard(cache, oblock_to_dblock(cache, oblock));
-
- while ((bio = bio_list_pop(&detail.bios_for_issue))) {
- remap_to_origin(cache, bio);
- issue(cache, bio);
- }
-
- free_prison_cell(cache, cell);
+ return writeback_mode(&cache->features) &&
+ (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
}
-static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
- dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
+ bool *commit_needed)
{
- struct bio *bio;
- unsigned long flags;
- struct inc_detail detail;
-
- detail.cache = cache;
- bio_list_init(&detail.bios_for_issue);
- bio_list_init(&detail.unhandled_bios);
- detail.any_writes = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
- bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- remap_to_cache(cache, cell->holder, cblock);
- if (issue_holder)
- issue(cache, cell->holder);
- else
- accounted_begin(cache, cell->holder);
+ int r, data_dir;
+ bool rb, background_queued;
+ dm_cblock_t cblock;
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- if (detail.any_writes) {
- set_dirty(cache, oblock, cblock);
- clear_discard(cache, oblock_to_dblock(cache, oblock));
- }
+ *commit_needed = false;
- while ((bio = bio_list_pop(&detail.bios_for_issue))) {
- remap_to_cache(cache, bio, cblock);
- issue(cache, bio);
+ rb = bio_detain_shared(cache, block, bio);
+ if (!rb) {
+ /*
+ * An exclusive lock is held for this block, so we have to
+ * wait. We set the commit_needed flag so the current
+ * transaction will be committed asap, allowing this lock
+ * to be dropped.
+ */
+ *commit_needed = true;
+ return DM_MAPIO_SUBMITTED;
}
- free_prison_cell(cache, cell);
-}
-
-/*----------------------------------------------------------------*/
+ data_dir = bio_data_dir(bio);
-struct old_oblock_lock {
- struct policy_locker locker;
- struct cache *cache;
- struct prealloc *structs;
- struct dm_bio_prison_cell *cell;
-};
+ if (optimisable_bio(cache, bio, block)) {
+ struct policy_work *op = NULL;
-static int null_locker(struct policy_locker *locker, dm_oblock_t b)
-{
- /* This should never be called */
- BUG();
- return 0;
-}
+ r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
+ if (unlikely(r && r != -ENOENT)) {
+ DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
+ cache_device_name(cache), r);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
-static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
-{
- struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
- struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
+ if (r == -ENOENT && op) {
+ bio_drop_shared_lock(cache, bio);
+ BUG_ON(op->op != POLICY_PROMOTE);
+ mg_start(cache, op, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+ } else {
+ r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
+ if (unlikely(r && r != -ENOENT)) {
+ DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
+ cache_device_name(cache), r);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
- return bio_detain(l->cache, b, NULL, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- l->structs, &l->cell);
-}
+ if (background_queued)
+ wake_migration_worker(cache);
+ }
-static void process_cell(struct cache *cache, struct prealloc *structs,
- struct dm_bio_prison_cell *new_ocell)
-{
- int r;
- bool release_cell = true;
- struct bio *bio = new_ocell->holder;
- dm_oblock_t block = get_bio_block(cache, bio);
- struct policy_result lookup_result;
- bool passthrough = passthrough_mode(&cache->features);
- bool fast_promotion, can_migrate;
- struct old_oblock_lock ool;
-
- fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
- can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
-
- ool.locker.fn = cell_locker;
- ool.cache = cache;
- ool.structs = structs;
- ool.cell = NULL;
- r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
- bio, &ool.locker, &lookup_result);
-
- if (r == -EWOULDBLOCK)
- /* migration has been denied */
- lookup_result.op = POLICY_MISS;
-
- switch (lookup_result.op) {
- case POLICY_HIT:
- if (passthrough) {
- inc_miss_counter(cache, bio);
+ if (r == -ENOENT) {
+ /*
+ * Miss.
+ */
+ inc_miss_counter(cache, bio);
+ if (pb->req_nr == 0) {
+ accounted_begin(cache, bio);
+ remap_to_origin_clear_discard(cache, bio, block);
+ } else {
/*
- * Passthrough always maps to the origin,
- * invalidating any cache blocks that are written
- * to.
+ * This is a duplicate writethrough io that is no
+ * longer needed because the block has been demoted.
*/
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+ } else {
+ /*
+ * Hit.
+ */
+ inc_hit_counter(cache, bio);
+ /*
+ * Passthrough always maps to the origin, invalidating any
+ * cache blocks that are written to.
+ */
+ if (passthrough_mode(&cache->features)) {
if (bio_data_dir(bio) == WRITE) {
+ bio_drop_shared_lock(cache, bio);
atomic_inc(&cache->stats.demotion);
- invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
- release_cell = false;
-
- } else {
- /* FIXME: factor out issue_origin() */
+ invalidate_start(cache, cblock, block, bio);
+ } else
remap_to_origin_clear_discard(cache, bio, block);
- inc_and_issue(cache, bio, new_ocell);
- }
+
} else {
- inc_hit_counter(cache, bio);
-
- if (bio_data_dir(bio) == WRITE &&
- writethrough_mode(&cache->features) &&
- !is_dirty(cache, lookup_result.cblock)) {
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- inc_and_issue(cache, bio, new_ocell);
-
- } else {
- remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
- release_cell = false;
- }
+ if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+ !is_dirty(cache, cblock)) {
+ remap_to_origin_then_cache(cache, bio, block, cblock);
+ accounted_begin(cache, bio);
+ } else
+ remap_to_cache_dirty(cache, bio, block, cblock);
}
-
- break;
-
- case POLICY_MISS:
- inc_miss_counter(cache, bio);
- remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
- release_cell = false;
- break;
-
- case POLICY_NEW:
- atomic_inc(&cache->stats.promotion);
- promote(cache, structs, block, lookup_result.cblock, new_ocell);
- release_cell = false;
- break;
-
- case POLICY_REPLACE:
- atomic_inc(&cache->stats.demotion);
- atomic_inc(&cache->stats.promotion);
- demote_then_promote(cache, structs, lookup_result.old_oblock,
- block, lookup_result.cblock,
- ool.cell, new_ocell);
- release_cell = false;
- break;
-
- default:
- DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
- cache_device_name(cache), __func__,
- (unsigned) lookup_result.op);
- bio_io_error(bio);
}
- if (release_cell)
- cell_defer(cache, new_ocell, false);
-}
-
-static void process_bio(struct cache *cache, struct prealloc *structs,
- struct bio *bio)
-{
- int r;
- dm_oblock_t block = get_bio_block(cache, bio);
- struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
-
/*
- * Check to see if that block is currently migrating.
+ * dm core turns FUA requests into a separate payload and FLUSH req.
*/
- cell_prealloc = prealloc_get_cell(structs);
- r = bio_detain(cache, block, bio, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- structs, &new_ocell);
- if (r > 0)
- return;
+ if (bio->bi_opf & REQ_FUA) {
+ /*
+ * issue_after_commit will call accounted_begin a second time. So
+ * we call accounted_complete() to avoid double accounting.
+ */
+ accounted_complete(cache, bio);
+ issue_after_commit(&cache->committer, bio);
+ *commit_needed = true;
+ return DM_MAPIO_SUBMITTED;
+ }
- process_cell(cache, structs, new_ocell);
+ return DM_MAPIO_REMAPPED;
}
-static int need_commit_due_to_time(struct cache *cache)
+static bool process_bio(struct cache *cache, struct bio *bio)
{
- return jiffies < cache->last_commit_jiffies ||
- jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+ bool commit_needed;
+
+ if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
+ generic_make_request(bio);
+
+ return commit_needed;
}
/*
return r;
}
-static int commit_if_needed(struct cache *cache)
-{
- int r = 0;
-
- if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
- dm_cache_changed_this_transaction(cache->cmd)) {
- r = commit(cache, false);
- cache->commit_requested = false;
- cache->last_commit_jiffies = jiffies;
- }
-
- return r;
-}
-
-static void process_deferred_bios(struct cache *cache)
+/*
+ * Used by the batcher.
+ */
+static int commit_op(void *context)
{
- bool prealloc_used = false;
- unsigned long flags;
- struct bio_list bios;
- struct bio *bio;
- struct prealloc structs;
+ struct cache *cache = context;
- memset(&structs, 0, sizeof(structs));
- bio_list_init(&bios);
+ if (dm_cache_changed_this_transaction(cache->cmd))
+ return commit(cache, false);
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&bios, &cache->deferred_bios);
- bio_list_init(&cache->deferred_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
+ return 0;
+}
- while (!bio_list_empty(&bios)) {
- /*
- * If we've got no free migration structs, and processing
- * this bio might require one, we pause until there are some
- * prepared mappings to process.
- */
- prealloc_used = true;
- if (prealloc_data_structs(cache, &structs)) {
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&cache->deferred_bios, &bios);
- spin_unlock_irqrestore(&cache->lock, flags);
- break;
- }
+/*----------------------------------------------------------------*/
- bio = bio_list_pop(&bios);
+static bool process_flush_bio(struct cache *cache, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- if (bio->bi_opf & REQ_PREFLUSH)
- process_flush_bio(cache, bio);
- else if (bio_op(bio) == REQ_OP_DISCARD)
- process_discard_bio(cache, &structs, bio);
- else
- process_bio(cache, &structs, bio);
- }
+ if (!pb->req_nr)
+ remap_to_origin(cache, bio);
+ else
+ remap_to_cache(cache, bio, 0);
- if (prealloc_used)
- prealloc_free_structs(cache, &structs);
+ issue_after_commit(&cache->committer, bio);
+ return true;
}
-static void process_deferred_cells(struct cache *cache)
+static bool process_discard_bio(struct cache *cache, struct bio *bio)
{
- bool prealloc_used = false;
- unsigned long flags;
- struct dm_bio_prison_cell *cell, *tmp;
- struct list_head cells;
- struct prealloc structs;
-
- memset(&structs, 0, sizeof(structs));
-
- INIT_LIST_HEAD(&cells);
-
- spin_lock_irqsave(&cache->lock, flags);
- list_splice_init(&cache->deferred_cells, &cells);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- list_for_each_entry_safe(cell, tmp, &cells, user_list) {
- /*
- * If we've got no free migration structs, and processing
- * this bio might require one, we pause until there are some
- * prepared mappings to process.
- */
- prealloc_used = true;
- if (prealloc_data_structs(cache, &structs)) {
- spin_lock_irqsave(&cache->lock, flags);
- list_splice(&cells, &cache->deferred_cells);
- spin_unlock_irqrestore(&cache->lock, flags);
- break;
- }
+ dm_dblock_t b, e;
- process_cell(cache, &structs, cell);
+ // FIXME: do we need to lock the region? Or can we just assume the
+ // user won't be so foolish as to issue discard concurrently with
+ // other IO?
+ calc_discard_block_range(cache, bio, &b, &e);
+ while (b != e) {
+ set_discard(cache, b);
+ b = to_dblock(from_dblock(b) + 1);
}
- if (prealloc_used)
- prealloc_free_structs(cache, &structs);
+ bio_endio(bio);
+
+ return false;
}
-static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
+static void process_deferred_bios(struct work_struct *ws)
{
+ struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
+
unsigned long flags;
+ bool commit_needed = false;
struct bio_list bios;
struct bio *bio;
bio_list_init(&bios);
spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&bios, &cache->deferred_flush_bios);
- bio_list_init(&cache->deferred_flush_bios);
+ bio_list_merge(&bios, &cache->deferred_bios);
+ bio_list_init(&cache->deferred_bios);
spin_unlock_irqrestore(&cache->lock, flags);
- /*
- * These bios have already been through inc_ds()
- */
- while ((bio = bio_list_pop(&bios)))
- submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
+ while ((bio = bio_list_pop(&bios))) {
+ if (bio->bi_opf & REQ_PREFLUSH)
+ commit_needed = process_flush_bio(cache, bio) || commit_needed;
+
+ else if (bio_op(bio) == REQ_OP_DISCARD)
+ commit_needed = process_discard_bio(cache, bio) || commit_needed;
+
+ else
+ commit_needed = process_bio(cache, bio) || commit_needed;
+ }
+
+ if (commit_needed)
+ schedule_commit(&cache->committer);
}
-static void process_deferred_writethrough_bios(struct cache *cache)
+static void process_deferred_writethrough_bios(struct work_struct *ws)
{
+ struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
+
unsigned long flags;
struct bio_list bios;
struct bio *bio;
spin_unlock_irqrestore(&cache->lock, flags);
/*
- * These bios have already been through inc_ds()
+ * These bios have already been through accounted_begin()
*/
while ((bio = bio_list_pop(&bios)))
- accounted_request(cache, bio);
-}
-
-static void writeback_some_dirty_blocks(struct cache *cache)
-{
- bool prealloc_used = false;
- dm_oblock_t oblock;
- dm_cblock_t cblock;
- struct prealloc structs;
- struct dm_bio_prison_cell *old_ocell;
- bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
-
- memset(&structs, 0, sizeof(structs));
-
- while (spare_migration_bandwidth(cache)) {
- if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
- break; /* no work to do */
-
- prealloc_used = true;
- if (prealloc_data_structs(cache, &structs) ||
- get_cell(cache, oblock, &structs, &old_ocell)) {
- policy_set_dirty(cache->policy, oblock);
- break;
- }
-
- writeback(cache, &structs, oblock, cblock, old_ocell);
- }
-
- if (prealloc_used)
- prealloc_free_structs(cache, &structs);
-}
-
-/*----------------------------------------------------------------
- * Invalidations.
- * Dropping something from the cache *without* writing back.
- *--------------------------------------------------------------*/
-
-static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
-{
- int r = 0;
- uint64_t begin = from_cblock(req->cblocks->begin);
- uint64_t end = from_cblock(req->cblocks->end);
-
- while (begin != end) {
- r = policy_remove_cblock(cache->policy, to_cblock(begin));
- if (!r) {
- r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
- if (r) {
- metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
- break;
- }
-
- } else if (r == -ENODATA) {
- /* harmless, already unmapped */
- r = 0;
-
- } else {
- DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
- break;
- }
-
- begin++;
- }
-
- cache->commit_requested = true;
-
- req->err = r;
- atomic_set(&req->complete, 1);
-
- wake_up(&req->result_wait);
-}
-
-static void process_invalidation_requests(struct cache *cache)
-{
- struct list_head list;
- struct invalidation_request *req, *tmp;
-
- INIT_LIST_HEAD(&list);
- spin_lock(&cache->invalidation_lock);
- list_splice_init(&cache->invalidation_requests, &list);
- spin_unlock(&cache->invalidation_lock);
-
- list_for_each_entry_safe (req, tmp, &list, list)
- process_invalidation_request(cache, req);
+ generic_make_request(bio);
}
/*----------------------------------------------------------------
* Main worker loop
*--------------------------------------------------------------*/
-static bool is_quiescing(struct cache *cache)
-{
- return atomic_read(&cache->quiescing);
-}
-
-static void ack_quiescing(struct cache *cache)
-{
- if (is_quiescing(cache)) {
- atomic_inc(&cache->quiescing_ack);
- wake_up(&cache->quiescing_wait);
- }
-}
-
-static void wait_for_quiescing_ack(struct cache *cache)
-{
- wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
-}
-
-static void start_quiescing(struct cache *cache)
-{
- atomic_inc(&cache->quiescing);
- wait_for_quiescing_ack(cache);
-}
-
-static void stop_quiescing(struct cache *cache)
-{
- atomic_set(&cache->quiescing, 0);
- atomic_set(&cache->quiescing_ack, 0);
-}
-
-static void wait_for_migrations(struct cache *cache)
-{
- wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
-}
-
-static void stop_worker(struct cache *cache)
-{
- cancel_delayed_work(&cache->waker);
- flush_workqueue(cache->wq);
-}
-
-static void requeue_deferred_cells(struct cache *cache)
-{
- unsigned long flags;
- struct list_head cells;
- struct dm_bio_prison_cell *cell, *tmp;
-
- INIT_LIST_HEAD(&cells);
- spin_lock_irqsave(&cache->lock, flags);
- list_splice_init(&cache->deferred_cells, &cells);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- list_for_each_entry_safe(cell, tmp, &cells, user_list)
- cell_requeue(cache, cell);
-}
static void requeue_deferred_bios(struct cache *cache)
{
}
}
-static int more_work(struct cache *cache)
-{
- if (is_quiescing(cache))
- return !list_empty(&cache->quiesced_migrations) ||
- !list_empty(&cache->completed_migrations) ||
- !list_empty(&cache->need_commit_migrations);
- else
- return !bio_list_empty(&cache->deferred_bios) ||
- !list_empty(&cache->deferred_cells) ||
- !bio_list_empty(&cache->deferred_flush_bios) ||
- !bio_list_empty(&cache->deferred_writethrough_bios) ||
- !list_empty(&cache->quiesced_migrations) ||
- !list_empty(&cache->completed_migrations) ||
- !list_empty(&cache->need_commit_migrations) ||
- cache->invalidate;
-}
-
-static void do_worker(struct work_struct *ws)
-{
- struct cache *cache = container_of(ws, struct cache, worker);
-
- do {
- if (!is_quiescing(cache)) {
- writeback_some_dirty_blocks(cache);
- process_deferred_writethrough_bios(cache);
- process_deferred_bios(cache);
- process_deferred_cells(cache);
- process_invalidation_requests(cache);
- }
-
- process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
- process_migrations(cache, &cache->completed_migrations, complete_migration);
-
- if (commit_if_needed(cache)) {
- process_deferred_flush_bios(cache, false);
- process_migrations(cache, &cache->need_commit_migrations, migration_failure);
- } else {
- process_deferred_flush_bios(cache, true);
- process_migrations(cache, &cache->need_commit_migrations,
- migration_success_post_commit);
- }
-
- ack_quiescing(cache);
-
- } while (more_work(cache));
-}
-
/*
* We want to commit periodically so that not too much
* unwritten metadata builds up.
static void do_waker(struct work_struct *ws)
{
struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
+
policy_tick(cache->policy, true);
- wake_worker(cache);
+ wake_migration_worker(cache);
+ schedule_commit(&cache->committer);
queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}
-/*----------------------------------------------------------------*/
-
-static int is_congested(struct dm_dev *dev, int bdi_bits)
+static void check_migrations(struct work_struct *ws)
{
- struct request_queue *q = bdev_get_queue(dev->bdev);
- return bdi_congested(q->backing_dev_info, bdi_bits);
-}
+ int r;
+ struct policy_work *op;
+ struct cache *cache = container_of(ws, struct cache, migration_worker);
+ enum busy b;
-static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
-{
- struct cache *cache = container_of(cb, struct cache, callbacks);
+ for (;;) {
+ b = spare_migration_bandwidth(cache);
+ if (b == BUSY)
+ break;
- return is_congested(cache->origin_dev, bdi_bits) ||
- is_congested(cache->cache_dev, bdi_bits);
+ r = policy_get_background_work(cache->policy, b == IDLE, &op);
+ if (r == -ENODATA)
+ break;
+
+ if (r) {
+ DMERR_LIMIT("%s: policy_background_work failed",
+ cache_device_name(cache));
+ break;
+ }
+
+ r = mg_start(cache, op, NULL);
+ if (r)
+ break;
+ }
}
/*----------------------------------------------------------------
mempool_destroy(cache->migration_pool);
- if (cache->all_io_ds)
- dm_deferred_set_destroy(cache->all_io_ds);
-
if (cache->prison)
- dm_bio_prison_destroy(cache->prison);
+ dm_bio_prison_destroy_v2(cache->prison);
if (cache->wq)
destroy_workqueue(cache->wq);
return PTR_ERR(p);
}
cache->policy = p;
+ BUG_ON(!cache->policy);
return 0;
}
cache->cache_size = size;
}
+static int is_congested(struct dm_dev *dev, int bdi_bits)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+ return bdi_congested(q->backing_dev_info, bdi_bits);
+}
+
+static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+ struct cache *cache = container_of(cb, struct cache, callbacks);
+
+ return is_congested(cache->origin_dev, bdi_bits) ||
+ is_congested(cache->cache_dev, bdi_bits);
+}
+
#define DEFAULT_MIGRATION_THRESHOLD 2048
static int cache_create(struct cache_args *ca, struct cache **result)
ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
- /* FIXME: factor out this whole section */
origin_blocks = cache->origin_sectors = ca->origin_sectors;
origin_blocks = block_div(origin_blocks, ca->block_size);
cache->origin_blocks = to_oblock(origin_blocks);
r = -EINVAL;
goto bad;
}
+
+ policy_allow_migrations(cache->policy, false);
}
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->deferred_cells);
bio_list_init(&cache->deferred_bios);
- bio_list_init(&cache->deferred_flush_bios);
bio_list_init(&cache->deferred_writethrough_bios);
- INIT_LIST_HEAD(&cache->quiesced_migrations);
- INIT_LIST_HEAD(&cache->completed_migrations);
- INIT_LIST_HEAD(&cache->need_commit_migrations);
atomic_set(&cache->nr_allocated_migrations, 0);
atomic_set(&cache->nr_io_migrations, 0);
init_waitqueue_head(&cache->migration_wait);
- init_waitqueue_head(&cache->quiescing_wait);
- atomic_set(&cache->quiescing, 0);
- atomic_set(&cache->quiescing_ack, 0);
-
r = -ENOMEM;
atomic_set(&cache->nr_dirty, 0);
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
goto bad;
}
- cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+ cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
if (!cache->wq) {
*error = "could not create workqueue for metadata object";
goto bad;
}
- INIT_WORK(&cache->worker, do_worker);
+ INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
+ INIT_WORK(&cache->deferred_writethrough_worker,
+ process_deferred_writethrough_bios);
+ INIT_WORK(&cache->migration_worker, check_migrations);
INIT_DELAYED_WORK(&cache->waker, do_waker);
- cache->last_commit_jiffies = jiffies;
- cache->prison = dm_bio_prison_create();
+ cache->prison = dm_bio_prison_create_v2(cache->wq);
if (!cache->prison) {
*error = "could not create bio prison";
goto bad;
}
- cache->all_io_ds = dm_deferred_set_create();
- if (!cache->all_io_ds) {
- *error = "could not create all_io deferred set";
- goto bad;
- }
-
cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
migration_cache);
if (!cache->migration_pool) {
spin_lock_init(&cache->invalidation_lock);
INIT_LIST_HEAD(&cache->invalidation_requests);
+ batcher_init(&cache->committer, commit_op, cache,
+ issue_op, cache, cache->wq);
iot_init(&cache->origin_tracker);
+ init_rwsem(&cache->background_work_lock);
+ prevent_background_work(cache);
+
*result = cache;
return 0;
-
bad:
destroy(cache);
return r;
}
ti->private = cache;
-
out:
destroy_cache_args(ca);
return r;
struct cache *cache = ti->private;
int r;
- struct dm_bio_prison_cell *cell = NULL;
+ bool commit_needed;
dm_oblock_t block = get_bio_block(cache, bio);
size_t pb_data_size = get_per_bio_data_size(cache);
- bool can_migrate = false;
- bool fast_promotion;
- struct policy_result lookup_result;
- struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
- struct old_oblock_lock ool;
-
- ool.locker.fn = null_locker;
+ init_per_bio_data(bio, pb_data_size);
if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
/*
* This can only occur if the io goes to a partial block at
return DM_MAPIO_SUBMITTED;
}
- /*
- * Check to see if that block is currently migrating.
- */
- cell = alloc_prison_cell(cache);
- if (!cell) {
- defer_bio(cache, bio);
- return DM_MAPIO_SUBMITTED;
- }
-
- r = bio_detain(cache, block, bio, cell,
- (cell_free_fn) free_prison_cell,
- cache, &cell);
- if (r) {
- if (r < 0)
- defer_bio(cache, bio);
-
- return DM_MAPIO_SUBMITTED;
- }
-
- fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
-
- r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
- bio, &ool.locker, &lookup_result);
- if (r == -EWOULDBLOCK) {
- cell_defer(cache, cell, true);
- return DM_MAPIO_SUBMITTED;
-
- } else if (r) {
- DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
- cache_device_name(cache), r);
- cell_defer(cache, cell, false);
- bio_io_error(bio);
- return DM_MAPIO_SUBMITTED;
- }
-
- r = DM_MAPIO_REMAPPED;
- switch (lookup_result.op) {
- case POLICY_HIT:
- if (passthrough_mode(&cache->features)) {
- if (bio_data_dir(bio) == WRITE) {
- /*
- * We need to invalidate this block, so
- * defer for the worker thread.
- */
- cell_defer(cache, cell, true);
- r = DM_MAPIO_SUBMITTED;
-
- } else {
- inc_miss_counter(cache, bio);
- remap_to_origin_clear_discard(cache, bio, block);
- accounted_begin(cache, bio);
- inc_ds(cache, bio, cell);
- // FIXME: we want to remap hits or misses straight
- // away rather than passing over to the worker.
- cell_defer(cache, cell, false);
- }
-
- } else {
- inc_hit_counter(cache, bio);
- if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
- !is_dirty(cache, lookup_result.cblock)) {
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- accounted_begin(cache, bio);
- inc_ds(cache, bio, cell);
- cell_defer(cache, cell, false);
-
- } else
- remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
- }
- break;
-
- case POLICY_MISS:
- inc_miss_counter(cache, bio);
- if (pb->req_nr != 0) {
- /*
- * This is a duplicate writethrough io that is no
- * longer needed because the block has been demoted.
- */
- bio_endio(bio);
- // FIXME: remap everything as a miss
- cell_defer(cache, cell, false);
- r = DM_MAPIO_SUBMITTED;
-
- } else
- remap_cell_to_origin_clear_discard(cache, cell, block, false);
- break;
-
- default:
- DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
- cache_device_name(cache), __func__,
- (unsigned) lookup_result.op);
- cell_defer(cache, cell, false);
- bio_io_error(bio);
- r = DM_MAPIO_SUBMITTED;
- }
+ r = map_bio(cache, bio, block, &commit_needed);
+ if (commit_needed)
+ schedule_commit(&cache->committer);
return r;
}
spin_unlock_irqrestore(&cache->lock, flags);
}
- check_for_quiesced_migrations(cache, pb);
+ bio_drop_shared_lock(cache, bio);
accounted_complete(cache, bio);
return 0;
{
struct cache *cache = ti->private;
- start_quiescing(cache);
- wait_for_migrations(cache);
- stop_worker(cache);
+ prevent_background_work(cache);
+ BUG_ON(atomic_read(&cache->nr_io_migrations));
+
+ cancel_delayed_work(&cache->waker);
+ flush_workqueue(cache->wq);
+ WARN_ON(cache->origin_tracker.in_flight);
+
+ /*
+ * If it's a flush suspend there won't be any deferred bios, so this
+ * call is harmless.
+ */
requeue_deferred_bios(cache);
- requeue_deferred_cells(cache);
- stop_quiescing(cache);
if (get_cache_mode(cache) == CM_WRITE)
(void) sync_metadata(cache);
int r;
struct cache *cache = context;
- r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
+ r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
if (r)
return r;
- if (dirty)
- set_dirty(cache, oblock, cblock);
- else
- clear_dirty(cache, oblock, cblock);
-
return 0;
}
struct cache *cache = ti->private;
cache->need_tick_bio = true;
+ allow_background_work(cache);
do_waker(&cache->waker.work);
}
DMEMIT("Error");
}
+/*
+ * Defines a half-open range of cblocks: begin to (end - 1) inclusive are in
+ * the range, and end is the one-past-the-end value. begin == end therefore
+ * denotes an empty range.
+ */
+struct cblock_range {
+ dm_cblock_t begin;
+ dm_cblock_t end;
+};
+
/*
* A cache block range can take two forms:
*
* i) A single cblock, eg. '3456'
- * ii) A begin and end cblock with dots between, eg. 123-234
+ * ii) A begin and end cblock with a dash between, eg. 123-234
*/
static int parse_cblock_range(struct cache *cache, const char *str,
struct cblock_range *result)
return 0;
}
+static inline dm_cblock_t cblock_succ(dm_cblock_t b)
+{
+ return to_cblock(from_cblock(b) + 1);
+}
+
static int request_invalidation(struct cache *cache, struct cblock_range *range)
{
- struct invalidation_request req;
+ int r = 0;
- INIT_LIST_HEAD(&req.list);
- req.cblocks = range;
- atomic_set(&req.complete, 0);
- req.err = 0;
- init_waitqueue_head(&req.result_wait);
+ /*
+ * Invalidate each cblock in [range->begin, range->end), stopping at
+ * and returning the first non-zero result from invalidate_cblock().
+ * Note that range->begin is advanced in place as we go, so the
+ * caller's range is modified.
+ *
+ * We don't need to do any locking here because we know we're in
+ * passthrough mode. There is potential for a race between an
+ * invalidation triggered by an io and an invalidation message. This
+ * is harmless; we need not worry if the policy call fails.
+ */
+ while (range->begin != range->end) {
+ r = invalidate_cblock(cache, range->begin);
+ if (r)
+ return r;
- spin_lock(&cache->invalidation_lock);
- list_add(&req.list, &cache->invalidation_requests);
- spin_unlock(&cache->invalidation_lock);
- wake_worker(cache);
+ range->begin = cblock_succ(range->begin);
+ }
- wait_event(req.result_wait, atomic_read(&req.complete));
- return req.err;
+ cache->commit_requested = true;
+ return r;
}
static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 10, 0},
+ .version = {2, 0, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,