btrfs: Switch memory allocations in async csum calculation path to kvmalloc
authorNikolay Borisov <nborisov@suse.com>
Mon, 1 Apr 2019 08:29:58 +0000 (11:29 +0300)
committerDavid Sterba <dsterba@suse.com>
Thu, 25 Apr 2019 12:17:38 +0000 (14:17 +0200)
Recent multi-page biovec rework allowed creation of bios that can span
large regions - up to 128 megabytes in the case of btrfs. OTOH btrfs'
submission path currently allocates a contiguous array to store the
checksums for every bio submitted. This means we can request up to
(128mb / BTRFS_SECTOR_SIZE) * 4 bytes + 32bytes of memory from kmalloc.
On busy systems with possibly fragmented memory said kmalloc can fail
which will trigger BUG_ON due to improper error handling IO submission
context in btrfs.

Until error handling is improved or bios in btrfs limited to a more
manageable size (e.g. 1m) let's use kvmalloc to fallback to vmalloc for
such large allocations. There is no hard requirement that the memory
allocated for checksums during IO submission has to be contiguous, but
this is a simple fix that does not require several non-contiguous
allocations.

For small writes this is unlikely to have any visible effect since
kmalloc will still satisfy allocation requests as usual. For larger
requests the code will just fallback to vmalloc.

We've performed evaluation on several workload types and there was no
significant difference kmalloc vs kvmalloc.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/file-item.c
fs/btrfs/ordered-data.c

index 920bf3b4b0ef5e5296d3cec2c2e82a8ab78dc0ac..cccc75d15970cbc61e8e1bcd1735fb28dc549123 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/sched/mm.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -427,9 +428,13 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
        unsigned long this_sum_bytes = 0;
        int i;
        u64 offset;
+       unsigned nofs_flag;
+
+       nofs_flag = memalloc_nofs_save();
+       sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
+                      GFP_KERNEL);
+       memalloc_nofs_restore(nofs_flag);
 
-       sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
-                      GFP_NOFS);
        if (!sums)
                return BLK_STS_RESOURCE;
 
@@ -472,8 +477,10 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
 
                                bytes_left = bio->bi_iter.bi_size - total_bytes;
 
-                               sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left),
-                                              GFP_NOFS);
+                               nofs_flag = memalloc_nofs_save();
+                               sums = kvzalloc(btrfs_ordered_sum_size(fs_info,
+                                                     bytes_left), GFP_KERNEL);
+                               memalloc_nofs_restore(nofs_flag);
                                BUG_ON(!sums); /* -ENOMEM */
                                sums->len = bytes_left;
                                ordered = btrfs_lookup_ordered_extent(inode,
index 6fde2b2741ef13b2bdabfac4ad29937d7e784afa..45e3cfd1198bc29265d28a360f3d261a70144953 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
+#include <linux/sched/mm.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
@@ -442,7 +443,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
                        cur = entry->list.next;
                        sum = list_entry(cur, struct btrfs_ordered_sum, list);
                        list_del(&sum->list);
-                       kfree(sum);
+                       kvfree(sum);
                }
                kmem_cache_free(btrfs_ordered_extent_cache, entry);
        }