bcache: set max writeback rate when I/O request is idle
authorColy Li <colyli@suse.de>
Thu, 9 Aug 2018 07:48:49 +0000 (15:48 +0800)
committerJens Axboe <axboe@kernel.dk>
Thu, 9 Aug 2018 14:21:15 +0000 (08:21 -0600)
Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
allows the writeback rate to be faster if there is no I/O request on a
bcache device. It works well if there is only one bcache device attached
to the cache set. If there are many bcache devices attached to a cache
set, it may introduce a performance regression because multiple faster
writeback threads of the idle bcache devices will compete for the btree
level locks with the bcache device which has I/O requests coming.

This patch fixes the above issue by only permitting fast writeback when
all bcache devices attached to the cache set are idle. And if one of the
bcache devices has new I/O requests coming in, all writeback throughput
is minimized immediately and the PI controller __update_writeback_rate()
decides the upcoming writeback rate for each bcache device.

Also when all bcache devices are idle, limiting the writeback rate to a
small number wastes throughput, especially when the backing devices are
slower non-rotational devices (e.g. SATA SSD). This patch sets a max
writeback rate for each backing device if the whole cache set is idle. A
faster writeback rate in idle time means the cache has more available
space for new dirty data, and people may observe better write performance
as a result.
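
Purely as an illustration of the scheme described above (this is not part
of the patch), a toy user-space model might look like the code below. The
names idle_counter, at_max_writeback_rate, attached_dev_nr and
writeback_rate mirror the patch, while the locking, the real PI controller
and the sysfs plumbing are all omitted:

  #include <stdatomic.h>
  #include <limits.h>
  #include <stdio.h>

  #define NR_DEVS 3

  static atomic_int idle_counter;
  static atomic_int at_max_writeback_rate;
  static atomic_int attached_dev_nr = NR_DEVS;
  static atomic_long writeback_rate[NR_DEVS];

  /* I/O path: any new request makes the whole cache set busy. */
  static void on_request(void)
  {
          atomic_store(&idle_counter, 0);
          if (atomic_exchange(&at_max_writeback_rate, 0)) {
                  /* Drop every device to the minimum rate; the PI
                   * controller picks the real rate on its next run. */
                  for (int i = 0; i < NR_DEVS; i++)
                          atomic_store(&writeback_rate[i], 1);
          }
  }

  /* Periodic per-device update, like update_writeback_rate(). */
  static void on_rate_update(int dev)
  {
          /* About 6 quiet update rounds per attached device before
           * the whole cache set is considered idle. */
          if (atomic_fetch_add(&idle_counter, 1) + 1 <
              atomic_load(&attached_dev_nr) * 6) {
                  atomic_store(&writeback_rate[dev], 1024); /* PI result */
                  return;
          }
          atomic_store(&at_max_writeback_rate, 1);
          atomic_store(&writeback_rate[dev], INT_MAX);
  }

  int main(void)
  {
          for (int round = 0; round < 10; round++)
                  for (int dev = 0; dev < NR_DEVS; dev++)
                          on_rate_update(dev);
          printf("idle: dev0 rate=%ld\n", atomic_load(&writeback_rate[0]));

          on_request();   /* new I/O: every device falls back to minimum */
          printf("busy: dev0 rate=%ld\n", atomic_load(&writeback_rate[0]));
          return 0;
  }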

Please note bcache may change its cache mode at runtime, and this patch
still works if the cache mode is switched away from writeback mode while
there is still dirty data on the cache.

Fixes: b1092c9af9ed ("bcache: allow quick writeback when backing idle")
Cc: stable@vger.kernel.org #4.16+
Signed-off-by: Coly Li <colyli@suse.de>
Tested-by: Kai Krakow <kai@kaishome.de>
Tested-by: Stefan Priebe <s.priebe@profihost.ag>
Cc: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
drivers/md/bcache/bcache.h
drivers/md/bcache/request.c
drivers/md/bcache/super.c
drivers/md/bcache/sysfs.c
drivers/md/bcache/util.c
drivers/md/bcache/util.h
drivers/md/bcache/writeback.c

index b393b3fd06b69520555da7dc9dde36347fb3ed25..05f82ff6f016ebc348381cefda2258b23101b4af 100644 (file)
@@ -328,13 +328,6 @@ struct cached_dev {
         */
        atomic_t                has_dirty;
 
-       /*
-        * Set to zero by things that touch the backing volume-- except
-        * writeback.  Incremented by writeback.  Used to determine when to
-        * accelerate idle writeback.
-        */
-       atomic_t                backing_idle;
-
        struct bch_ratelimit    writeback_rate;
        struct delayed_work     writeback_rate_update;
 
@@ -515,6 +508,8 @@ struct cache_set {
        struct cache_accounting accounting;
 
        unsigned long           flags;
+       atomic_t                idle_counter;
+       atomic_t                at_max_writeback_rate;
 
        struct cache_sb         sb;
 
@@ -524,6 +519,7 @@ struct cache_set {
 
        struct bcache_device    **devices;
        unsigned                devices_max_used;
+       atomic_t                attached_dev_nr;
        struct list_head        cached_devs;
        uint64_t                cached_dev_sectors;
        atomic_long_t           flash_dev_dirty_sectors;
index 914d501ad1e07d1e100ebc02a7f7c12389e27601..7dbe8b6316a0038670e9c73f9d6c3b583a1554a2 100644 (file)
@@ -1103,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
                generic_make_request(bio);
 }
 
+static void quit_max_writeback_rate(struct cache_set *c,
+                                   struct cached_dev *this_dc)
+{
+       int i;
+       struct bcache_device *d;
+       struct cached_dev *dc;
+
+       /*
+        * The mutex bch_register_lock may compete with other parallel
+        * requesters, or attach/detach operations on other backing devices.
+        * Waiting for the mutex lock may increase I/O request latency for
+        * seconds or more. To avoid such a situation, if mutex_trylock()
+        * fails, only the writeback rate of the current cached device is set
+        * to 1, and __update_writeback_rate() will decide the writeback rate
+        * of the other cached devices (remember c->idle_counter is 0
+        * already).
+       if (mutex_trylock(&bch_register_lock)) {
+               for (i = 0; i < c->devices_max_used; i++) {
+                       if (!c->devices[i])
+                               continue;
+
+                       if (UUID_FLASH_ONLY(&c->uuids[i]))
+                               continue;
+
+                       d = c->devices[i];
+                       dc = container_of(d, struct cached_dev, disk);
+                       /*
+                        * set writeback rate to default minimum value,
+                        * then let update_writeback_rate() decide the
+                        * upcoming rate.
+                        */
+                       atomic_long_set(&dc->writeback_rate.rate, 1);
+               }
+               mutex_unlock(&bch_register_lock);
+       } else
+               atomic_long_set(&this_dc->writeback_rate.rate, 1);
+}
+
 /* Cached devices - read & write stuff */
 
 static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -1120,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
                return BLK_QC_T_NONE;
        }
 
-       atomic_set(&dc->backing_idle, 0);
-       generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
+       if (likely(d->c)) {
+               if (atomic_read(&d->c->idle_counter))
+                       atomic_set(&d->c->idle_counter, 0);
+               /*
+                * If at_max_writeback_rate of cache set is true and new I/O
+                * comes, quit max writeback rate of all cached devices
+                * attached to this cache set, and set at_max_writeback_rate
+                * to false.
+                */
+               if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
+                       atomic_set(&d->c->at_max_writeback_rate, 0);
+                       quit_max_writeback_rate(d->c, dc);
+               }
+       }
+
+       generic_start_io_acct(q,
+                             bio_op(bio),
+                             bio_sectors(bio),
+                             &d->disk->part0);
 
        bio_set_dev(bio, dc->bdev);
        bio->bi_iter.bi_sector += dc->sb.data_offset;
index 1e85cbb4c159f5073d87f25cbde4f0c6e59574c6..55a37641aa95a511f175fbb5b0b0fe3463954a0f 100644 (file)
@@ -696,6 +696,8 @@ static void bcache_device_detach(struct bcache_device *d)
 {
        lockdep_assert_held(&bch_register_lock);
 
+       atomic_dec(&d->c->attached_dev_nr);
+
        if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
                struct uuid_entry *u = d->c->uuids + d->id;
 
@@ -1144,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 
        bch_cached_dev_run(dc);
        bcache_device_link(&dc->disk, c, "bdev");
+       atomic_inc(&c->attached_dev_nr);
 
        /* Allow the writeback thread to proceed */
        up_write(&dc->writeback_lock);
@@ -1696,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
        c->block_bits           = ilog2(sb->block_size);
        c->nr_uuids             = bucket_bytes(c) / sizeof(struct uuid_entry);
        c->devices_max_used     = 0;
+       atomic_set(&c->attached_dev_nr, 0);
        c->btree_pages          = bucket_pages(c);
        if (c->btree_pages > BTREE_MAX_PAGES)
                c->btree_pages = max_t(int, c->btree_pages / 4,
index 3e9d3459a224c9f93ef2c38ce4cf23cede91ac11..6e88142514fb2ff01eec01a5c30e9201370b0a6b 100644 (file)
@@ -171,7 +171,8 @@ SHOW(__bch_cached_dev)
        var_printf(writeback_running,   "%i");
        var_print(writeback_delay);
        var_print(writeback_percent);
-       sysfs_hprint(writeback_rate,    wb ? dc->writeback_rate.rate << 9 : 0);
+       sysfs_hprint(writeback_rate,
+                    wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
        sysfs_hprint(io_errors,         atomic_read(&dc->io_errors));
        sysfs_printf(io_error_limit,    "%i", dc->error_limit);
        sysfs_printf(io_disable,        "%i", dc->io_disable);
@@ -193,7 +194,9 @@ SHOW(__bch_cached_dev)
                 * Except for dirty and target, other values should
                 * be 0 if writeback is not running.
                 */
-               bch_hprint(rate, wb ? dc->writeback_rate.rate << 9 : 0);
+               bch_hprint(rate,
+                          wb ? atomic_long_read(&dc->writeback_rate.rate) << 9
+                             : 0);
                bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
                bch_hprint(target, dc->writeback_rate_target << 9);
                bch_hprint(proportional,
@@ -261,8 +264,12 @@ STORE(__cached_dev)
 
        sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
 
-       sysfs_strtoul_clamp(writeback_rate,
-                           dc->writeback_rate.rate, 1, INT_MAX);
+       if (attr == &sysfs_writeback_rate) {
+               int v;
+
+               sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX);
+               atomic_long_set(&dc->writeback_rate.rate, v);
+       }
 
        sysfs_strtoul_clamp(writeback_rate_update_seconds,
                            dc->writeback_rate_update_seconds,
index fc479b026d6d86f4aa0c8cfd43f21f34fc5f9e82..b15256bcf0e7568e761a8afcae08659cb7a3a95e 100644 (file)
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
        uint64_t now = local_clock();
 
-       d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+       d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate));
 
        /* Bound the time.  Don't let us fall further than 2 seconds behind
         * (this prevents unnecessary backlog that would make it impossible
index cced87f8eb278fcfe9715fdc24c887f3b06109a1..f7b0133c9d2f1aacd08dec33af2924df8594ebc9 100644 (file)
@@ -442,7 +442,7 @@ struct bch_ratelimit {
         * Rate at which we want to do work, in units per second
         * The units here correspond to the units passed to bch_next_delay()
         */
-       uint32_t                rate;
+       atomic_long_t           rate;
 };
 
 static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
index 912e969fedbacbbc33bf9d5cb12bf6d08c1b80c2..481d4cf38ac0e9ea2e04e10303b86524de5021c4 100644 (file)
@@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc)
 
        dc->writeback_rate_proportional = proportional_scaled;
        dc->writeback_rate_integral_scaled = integral_scaled;
-       dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
-       dc->writeback_rate.rate = new_rate;
+       dc->writeback_rate_change = new_rate -
+                       atomic_long_read(&dc->writeback_rate.rate);
+       atomic_long_set(&dc->writeback_rate.rate, new_rate);
        dc->writeback_rate_target = target;
 }
 
+static bool set_at_max_writeback_rate(struct cache_set *c,
+                                      struct cached_dev *dc)
+{
+       /*
+        * idle_counter is increased every time update_writeback_rate() is
+        * called. If all backing devices attached to the same cache set have
+        * identical dc->writeback_rate_update_seconds values, it is about 6
+        * rounds of update_writeback_rate() on each backing device before
+        * c->at_max_writeback_rate is set to 1, and then the max writeback
+        * rate is set to each dc->writeback_rate.rate.
+        * In order to avoid the extra locking cost of counting the exact
+        * number of dirty cached devices, c->attached_dev_nr is used to
+        * calculate the idle threshold. It might be bigger if not all cached
+        * devices are in writeback mode, but it still works well with a
+        * limited number of extra rounds of update_writeback_rate().
+        */
+       if (atomic_inc_return(&c->idle_counter) <
+           atomic_read(&c->attached_dev_nr) * 6)
+               return false;
+
+       if (atomic_read(&c->at_max_writeback_rate) != 1)
+               atomic_set(&c->at_max_writeback_rate, 1);
+
+       atomic_long_set(&dc->writeback_rate.rate, INT_MAX);
+
+       /* keep writeback_rate_target as existing value */
+       dc->writeback_rate_proportional = 0;
+       dc->writeback_rate_integral_scaled = 0;
+       dc->writeback_rate_change = 0;
+
+       /*
+        * Check c->idle_counter and c->at_max_writeback_rate again in case
+        * new I/O arrives before set_at_max_writeback_rate() returns.
+        * Then the writeback rate is set to 1, and its new value should be
+        * decided via __update_writeback_rate().
+        */
+       if ((atomic_read(&c->idle_counter) <
+            atomic_read(&c->attached_dev_nr) * 6) ||
+           !atomic_read(&c->at_max_writeback_rate))
+               return false;
+
+       return true;
+}
+
 static void update_writeback_rate(struct work_struct *work)
 {
        struct cached_dev *dc = container_of(to_delayed_work(work),
@@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work)
                return;
        }
 
-       down_read(&dc->writeback_lock);
-
-       if (atomic_read(&dc->has_dirty) &&
-           dc->writeback_percent)
-               __update_writeback_rate(dc);
+       if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+               /*
+                * If the whole cache set is idle, set_at_max_writeback_rate()
+                * will set the writeback rate to a max number. Then it is
+                * unnecessary to update the writeback rate of an idle cache
+                * set that is already writing back at the maximum rate.
+                */
+               if (!set_at_max_writeback_rate(c, dc)) {
+                       down_read(&dc->writeback_lock);
+                       __update_writeback_rate(dc);
+                       up_read(&dc->writeback_lock);
+               }
+       }
 
-       up_read(&dc->writeback_lock);
 
        /*
         * CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc)
 
                delay = writeback_delay(dc, size);
 
-               /* If the control system would wait for at least half a
-                * second, and there's been no reqs hitting the backing disk
-                * for awhile: use an alternate mode where we have at most
-                * one contiguous set of writebacks in flight at a time.  If
-                * someone wants to do IO it will be quick, as it will only
-                * have to contend with one operation in flight, and we'll
-                * be round-tripping data to the backing disk as quickly as
-                * it can accept it.
-                */
-               if (delay >= HZ / 2) {
-                       /* 3 means at least 1.5 seconds, up to 7.5 if we
-                        * have slowed way down.
-                        */
-                       if (atomic_inc_return(&dc->backing_idle) >= 3) {
-                               /* Wait for current I/Os to finish */
-                               closure_sync(&cl);
-                               /* And immediately launch a new set. */
-                               delay = 0;
-                       }
-               }
-
                while (!kthread_should_stop() &&
                       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
                       delay) {
@@ -741,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
        dc->writeback_running           = true;
        dc->writeback_percent           = 10;
        dc->writeback_delay             = 30;
-       dc->writeback_rate.rate         = 1024;
+       atomic_long_set(&dc->writeback_rate.rate, 1024);
        dc->writeback_rate_minimum      = 8;
 
        dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;