Btrfs: kill btrfs_clear_path_blocking
authorLiu Bo <bo.liu@linux.alibaba.com>
Tue, 21 Aug 2018 21:54:37 +0000 (05:54 +0800)
committerDavid Sterba <dsterba@suse.com>
Mon, 15 Oct 2018 15:23:38 +0000 (17:23 +0200)
Btrfs's btree locking has two modes, spinning mode and blocking mode,
while searching btree, locking is always acquired in spinning mode and
then converted to blocking mode if necessary, and in some hot paths we may
switch the locking back to spinning mode by btrfs_clear_path_blocking().

When acquiring locks, both of reader and writer need to wait for blocking
readers and writers to complete before doing read_lock()/write_lock().

The problem is that btrfs_clear_path_blocking() needs to switch nodes
in the path to blocking mode at first (by btrfs_set_path_blocking) to
make lockdep happy before doing its actual clearing blocking job.

When switching to blocking mode from spinning mode, it consists of

step 1) bumping up blocking readers counter and
step 2) read_unlock()/write_unlock(),

this has caused serious ping-pong effect if there're a great amount of
concurrent readers/writers, as waiters will be woken up and go to
sleep immediately.

1) Killing this kind of ping-pong results in a big improvement in my 1600k
files creation script,

MNT=/mnt/btrfs
mkfs.btrfs -f /dev/sdf
mount /dev/def $MNT
time fsmark  -D  10000  -S0  -n  100000  -s  0  -L  1 -l /tmp/fs_log.txt \
        -d  $MNT/0  -d  $MNT/1 \
        -d  $MNT/2  -d  $MNT/3 \
        -d  $MNT/4  -d  $MNT/5 \
        -d  $MNT/6  -d  $MNT/7 \
        -d  $MNT/8  -d  $MNT/9 \
        -d  $MNT/10  -d  $MNT/11 \
        -d  $MNT/12  -d  $MNT/13 \
        -d  $MNT/14  -d  $MNT/15

w/o patch:
real    2m27.307s
user    0m12.839s
sys     13m42.831s

w/ patch:
real    1m2.273s
user    0m15.802s
sys     8m16.495s

1.1) latency histogram from funclatency[1]

Overall with the patch, there're ~50% less write lock acquisition and
the 95% max latency that write lock takes also reduces to ~100ms from
>500ms.

--------------------------------------------
w/o patch:
--------------------------------------------
Function = btrfs_tree_lock
     msecs               : count     distribution
         0 -> 1          : 2385222  |****************************************|
         2 -> 3          : 37147    |                                        |
         4 -> 7          : 20452    |                                        |
         8 -> 15         : 13131    |                                        |
        16 -> 31         : 3877     |                                        |
        32 -> 63         : 3900     |                                        |
        64 -> 127        : 2612     |                                        |
       128 -> 255        : 974      |                                        |
       256 -> 511        : 165      |                                        |
       512 -> 1023       : 13       |                                        |

Function = btrfs_tree_read_lock
     msecs               : count     distribution
         0 -> 1          : 6743860  |****************************************|
         2 -> 3          : 2146     |                                        |
         4 -> 7          : 190      |                                        |
         8 -> 15         : 38       |                                        |
        16 -> 31         : 4        |                                        |

--------------------------------------------
w/ patch:
--------------------------------------------
Function = btrfs_tree_lock
     msecs               : count     distribution
         0 -> 1          : 1318454  |****************************************|
         2 -> 3          : 6800     |                                        |
         4 -> 7          : 3664     |                                        |
         8 -> 15         : 2145     |                                        |
        16 -> 31         : 809      |                                        |
        32 -> 63         : 219      |                                        |
        64 -> 127        : 10       |                                        |

Function = btrfs_tree_read_lock
     msecs               : count     distribution
         0 -> 1          : 6854317  |****************************************|
         2 -> 3          : 2383     |                                        |
         4 -> 7          : 601      |                                        |
         8 -> 15         : 92       |                                        |

2) dbench also proves the improvement,
dbench -t 120 -D /mnt/btrfs 16

w/o patch:
Throughput 158.363 MB/sec

w/ patch:
Throughput 449.52 MB/sec

3) xfstests didn't show any additional failures.

One thing to note is that callers may set path->leave_spinning to have
all nodes in the path stay in spinning mode, which means callers are
ready to not sleep before releasing the path, but it won't cause
problems if they don't want to sleep in blocking mode.

[1]: https://github.com/iovisor/bcc/blob/master/tools/funclatency.py

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c

index 0a6c645fab0ac374f401192352fc95ffd65aa544..2ee43b6a4f0995367ca5497f87d343674b319553 100644 (file)
@@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
        }
 }
 
-/*
- * reset all the locked nodes in the patch to spinning locks.
- *
- * held is used to keep lockdep happy, when lockdep is enabled
- * we set held to a blocking lock before we go around and
- * retake all the spinlocks in the path.  You can safely use NULL
- * for held
- */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
-                                       struct extent_buffer *held, int held_rw)
-{
-       int i;
-
-       if (held) {
-               btrfs_set_lock_blocking_rw(held, held_rw);
-               if (held_rw == BTRFS_WRITE_LOCK)
-                       held_rw = BTRFS_WRITE_LOCK_BLOCKING;
-               else if (held_rw == BTRFS_READ_LOCK)
-                       held_rw = BTRFS_READ_LOCK_BLOCKING;
-       }
-       btrfs_set_path_blocking(p);
-
-       for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
-               if (p->nodes[i] && p->locks[i]) {
-                       btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
-                       if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
-                               p->locks[i] = BTRFS_WRITE_LOCK;
-                       else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
-                               p->locks[i] = BTRFS_READ_LOCK;
-               }
-       }
-
-       if (held)
-               btrfs_clear_lock_blocking_rw(held, held_rw);
-}
-
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
@@ -1306,7 +1270,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                }
        }
 
-       btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
        btrfs_tree_read_unlock_blocking(eb);
        free_extent_buffer(eb);
 
@@ -2482,7 +2445,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
                btrfs_set_path_blocking(p);
                reada_for_balance(fs_info, p, level);
                sret = split_node(trans, root, p, level);
-               btrfs_clear_path_blocking(p, NULL, 0);
 
                BUG_ON(sret > 0);
                if (sret) {
@@ -2503,7 +2465,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
                btrfs_set_path_blocking(p);
                reada_for_balance(fs_info, p, level);
                sret = balance_level(trans, root, p, level);
-               btrfs_clear_path_blocking(p, NULL, 0);
 
                if (sret) {
                        ret = sret;
@@ -2788,7 +2749,10 @@ again:
                }
 cow_done:
                p->nodes[level] = b;
-               btrfs_clear_path_blocking(p, NULL, 0);
+               /*
+                * Leave path with blocking locks to avoid massive
+                * lock context switch, this is made on purpose.
+                */
 
                /*
                 * we have a lock on b and as long as we aren't changing
@@ -2870,8 +2834,6 @@ cow_done:
                                        if (!err) {
                                                btrfs_set_path_blocking(p);
                                                btrfs_tree_lock(b);
-                                               btrfs_clear_path_blocking(p, b,
-                                                                 BTRFS_WRITE_LOCK);
                                        }
                                        p->locks[level] = BTRFS_WRITE_LOCK;
                                } else {
@@ -2879,8 +2841,6 @@ cow_done:
                                        if (!err) {
                                                btrfs_set_path_blocking(p);
                                                btrfs_tree_read_lock(b);
-                                               btrfs_clear_path_blocking(p, b,
-                                                                 BTRFS_READ_LOCK);
                                        }
                                        p->locks[level] = BTRFS_READ_LOCK;
                                }
@@ -2899,7 +2859,6 @@ cow_done:
                                btrfs_set_path_blocking(p);
                                err = split_leaf(trans, root, key,
                                                 p, ins_len, ret == 0);
-                               btrfs_clear_path_blocking(p, NULL, 0);
 
                                BUG_ON(err > 0);
                                if (err) {
@@ -2970,7 +2929,6 @@ again:
        while (b) {
                level = btrfs_header_level(b);
                p->nodes[level] = b;
-               btrfs_clear_path_blocking(p, NULL, 0);
 
                /*
                 * we have a lock on b and as long as we aren't changing
@@ -3016,8 +2974,6 @@ again:
                        if (!err) {
                                btrfs_set_path_blocking(p);
                                btrfs_tree_read_lock(b);
-                               btrfs_clear_path_blocking(p, b,
-                                                         BTRFS_READ_LOCK);
                        }
                        b = tree_mod_log_rewind(fs_info, p, b, time_seq);
                        if (!b) {
@@ -5201,7 +5157,6 @@ find_next_key:
                path->locks[level - 1] = BTRFS_READ_LOCK;
                path->nodes[level - 1] = cur;
                unlock_up(path, level, 1, 0, NULL);
-               btrfs_clear_path_blocking(path, NULL, 0);
        }
 out:
        path->keep_locks = keep_locks;
@@ -5786,8 +5741,6 @@ again:
                        if (!ret) {
                                btrfs_set_path_blocking(path);
                                btrfs_tree_read_lock(next);
-                               btrfs_clear_path_blocking(path, next,
-                                                         BTRFS_READ_LOCK);
                        }
                        next_rw_lock = BTRFS_READ_LOCK;
                }
@@ -5823,8 +5776,6 @@ again:
                        if (!ret) {
                                btrfs_set_path_blocking(path);
                                btrfs_tree_read_lock(next);
-                               btrfs_clear_path_blocking(path, next,
-                                                         BTRFS_READ_LOCK);
                        }
                        next_rw_lock = BTRFS_READ_LOCK;
                }
index e8244c0b059769b8891b39420a4520a08d1a1171..15c659f234111d15a9a0f3e5f1a185f730e64011 100644 (file)
@@ -2868,8 +2868,6 @@ void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p,
-                              struct extent_buffer *held, int held_rw);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
index 15dcbd6ef5313c18e89a3569b52309547a755c2e..c669f250d4a0e27a839ed91f0c0dcd59a18dbaa4 100644 (file)
@@ -765,9 +765,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
                i++;
        }
 
-       /* reset all the locked nodes in the patch to spinning locks. */
-       btrfs_clear_path_blocking(path, NULL, 0);
-
        /* insert the keys of the items */
        setup_items_for_insert(root, path, keys, data_size,
                               total_data_size, total_size, nitems);