swap: choose swap device according to numa node

author Aaron Lu <aaron.lu@intel.com>

Wed, 6 Sep 2017 23:24:57 +0000 (16:24 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 7 Sep 2017 00:27:30 +0000 (17:27 -0700)
author Aaron Lu <aaron.lu@intel.com>
Wed, 6 Sep 2017 23:24:57 +0000 (16:24 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Sep 2017 00:27:30 +0000 (17:27 -0700)
diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt

new file mode 100644 (file)

index 0000000..d5960c9
--- /dev/null
+++ b/Documentation/vm/swap_numa.txt
@@ -0,0 +1,69 @@
+Automatically bind swap device to numa node
+-------------------------------------------
+
+If the system has more than one swap device and the swap devices have node
+information, we can make use of this information to decide which swap
+device to use in get_swap_pages() to get better performance.
+
+
+How to use this feature
+-----------------------
+
+Each swap device has a priority, which decides the order in which it is used.
+To make use of automatic binding, there is no need to manipulate priority
+settings for swap devices. E.g. on a 2 node machine, assume 2 swap devices
+swapA and swapB, with swapA attached to node 0 and swapB attached to node 1,
+are going to be swapped on. Simply swap them on by doing:
+# swapon /dev/swapA
+# swapon /dev/swapB
+
+Then node 0 will use the two swap devices in the order of swapA then swapB and
+node 1 will use the two swap devices in the order of swapB then swapA. Note
+that the order of them being swapped on doesn't matter.
+
+A more complex example on a 4 node machine. Assume 6 swap devices are going to
+be swapped on: swapA and swapB are attached to node 0, swapC is attached to
+node 1, swapD and swapE are attached to node 2 and swapF is attached to node 3.
+The way to swap them on is the same as above:
+# swapon /dev/swapA
+# swapon /dev/swapB
+# swapon /dev/swapC
+# swapon /dev/swapD
+# swapon /dev/swapE
+# swapon /dev/swapF
+
+Then node 0 will use them in the order of:
+swapA/swapB -> swapC -> swapD -> swapE -> swapF
+swapA and swapB will be used in a round robin mode before any other swap device.
+
+node 1 will use them in the order of:
+swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of:
+swapD/swapE -> swapA -> swapB -> swapC -> swapF
+Similarly, swapD and swapE will be used in a round robin mode before any
+other swap devices.
+
+node 3 will use them in the order of:
+swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+
+
+Implementation details
+----------------------
+
+The current code uses a priority based list, swap_avail_list, to decide
+which swap device to use and if multiple swap devices share the same
+priority, they are used round robin. This change here replaces the single
+global swap_avail_list with a per-numa-node list, i.e. for each numa node,
+it sees its own priority based list of available swap devices. Swap
+device's priority can be promoted on its matching node's swap_avail_list.
+
+The current swap device's priority is set as: user can set a >=0 value,
+or the system will pick one starting from -1 then downwards. The priority
+value in the swap_avail_list is the negated value of the swap device's
+priority, due to plist being sorted from low to high. The new policy doesn't
+change the semantics for priority >=0 cases; the previous starting from -1
+then downwards now becomes starting from -2 then downwards and -1 is reserved
+as the promoted value. So if multiple swap devices are attached to the same
+node, they will all be promoted to priority -1 on that node's plist and will
+be used round robin before any other swap devices.
diff --git a/include/linux/swap.h b/include/linux/swap.h

index 9c4ae6f14eeaee14eac1951f772414a6696bba0d..8bf3487fb2046b1a03e4d28fca762c35281cc219 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -212,7 +212,7 @@ struct swap_info_struct {
         unsigned long   flags;          /* SWP_USED etc: see above */
         signed short    prio;           /* swap priority of this type */
         struct plist_node list;         /* entry in swap_active_head */
-       struct plist_node avail_list;   /* entry in swap_avail_head */
+       struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */
         signed char     type;           /* strange name for an index */
         unsigned int    max;            /* extent of the swap_map */
         unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
diff --git a/mm/swapfile.c b/mm/swapfile.c

index 4f8b3e08a5476e867366f1bf91f74751bb7fc8e7..d483278ee35b1fe9f728ec93c867dcd7621fd4af 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
  EXPORT_SYMBOL_GPL(nr_swap_pages);
  /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
  long total_swap_pages;
-static int least_priority;
+static int least_priority = -1;
  
  static const char Bad_file[] = "Bad swap file entry ";
  static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
   * is held and the locking order requires swap_lock to be taken
   * before any swap_info_struct->lock.
   */
-static PLIST_HEAD(swap_avail_head);
+struct plist_head *swap_avail_heads;
  static DEFINE_SPINLOCK(swap_avail_lock);
  
  struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -592,6 +592,21 @@ new_cluster:
         return found_free;
  }
  
+static void __del_from_avail_list(struct swap_info_struct *p)
+{
+       int nid;
+
+       for_each_node(nid)
+               plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+}
+
+static void del_from_avail_list(struct swap_info_struct *p)
+{
+       spin_lock(&swap_avail_lock);
+       __del_from_avail_list(p);
+       spin_unlock(&swap_avail_lock);
+}
+
  static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
                              unsigned int nr_entries)
  {
@@ -605,12 +620,22 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
         if (si->inuse_pages == si->pages) {
                 si->lowest_bit = si->max;
                 si->highest_bit = 0;
-               spin_lock(&swap_avail_lock);
-               plist_del(&si->avail_list, &swap_avail_head);
-               spin_unlock(&swap_avail_lock);
+               del_from_avail_list(si);
         }
  }
  
+static void add_to_avail_list(struct swap_info_struct *p)
+{
+       int nid;
+
+       spin_lock(&swap_avail_lock);
+       for_each_node(nid) {
+               WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+               plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
+       }
+       spin_unlock(&swap_avail_lock);
+}
+
  static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
                             unsigned int nr_entries)
  {
@@ -623,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
                 bool was_full = !si->highest_bit;
  
                 si->highest_bit = end;
-               if (was_full && (si->flags & SWP_WRITEOK)) {
-                       spin_lock(&swap_avail_lock);
-                       WARN_ON(!plist_node_empty(&si->avail_list));
-                       if (plist_node_empty(&si->avail_list))
-                               plist_add(&si->avail_list, &swap_avail_head);
-                       spin_unlock(&swap_avail_lock);
-               }
+               if (was_full && (si->flags & SWP_WRITEOK))
+                       add_to_avail_list(si);
         }
         atomic_long_add(nr_entries, &nr_swap_pages);
         si->inuse_pages -= nr_entries;
@@ -910,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
         struct swap_info_struct *si, *next;
         long avail_pgs;
         int n_ret = 0;
+       int node;
  
         /* Only single cluster request supported */
         WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -929,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
         spin_lock(&swap_avail_lock);
  
  start_over:
-       plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+       node = numa_node_id();
+       plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
                 /* requeue si to after same-priority siblings */
-               plist_requeue(&si->avail_list, &swap_avail_head);
+               plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
                 spin_unlock(&swap_avail_lock);
                 spin_lock(&si->lock);
                 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
                         spin_lock(&swap_avail_lock);
-                       if (plist_node_empty(&si->avail_list)) {
+                       if (plist_node_empty(&si->avail_lists[node])) {
                                 spin_unlock(&si->lock);
                                 goto nextsi;
                         }
@@ -946,7 +968,7 @@ start_over:
                         WARN(!(si->flags & SWP_WRITEOK),
                              "swap_info %d in list but !SWP_WRITEOK\n",
                              si->type);
-                       plist_del(&si->avail_list, &swap_avail_head);
+                       __del_from_avail_list(si);
                         spin_unlock(&si->lock);
                         goto nextsi;
                 }
@@ -975,7 +997,7 @@ nextsi:
                  * swap_avail_head list then try it, otherwise start over
                  * if we have not gotten any slots.
                  */
-               if (plist_node_empty(&next->avail_list))
+               if (plist_node_empty(&next->avail_lists[node]))
                         goto start_over;
         }
  
@@ -2410,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
         return generic_swapfile_activate(sis, swap_file, span);
  }
  
+static int swap_node(struct swap_info_struct *p)
+{
+       struct block_device *bdev;
+
+       if (p->bdev)
+               bdev = p->bdev;
+       else
+               bdev = p->swap_file->f_inode->i_sb->s_bdev;
+
+       return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
+}
+
  static void _enable_swap_info(struct swap_info_struct *p, int prio,
                                 unsigned char *swap_map,
                                 struct swap_cluster_info *cluster_info)
  {
+       int i;
+
         if (prio >= 0)
                 p->prio = prio;
         else
@@ -2423,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
          * low-to-high, while swap ordering is high-to-low
          */
         p->list.prio = -p->prio;
-       p->avail_list.prio = -p->prio;
+       for_each_node(i) {
+               if (p->prio >= 0)
+                       p->avail_lists[i].prio = -p->prio;
+               else {
+                       if (swap_node(p) == i)
+                               p->avail_lists[i].prio = 1;
+                       else
+                               p->avail_lists[i].prio = -p->prio;
+               }
+       }
         p->swap_map = swap_map;
         p->cluster_info = cluster_info;
         p->flags |= SWP_WRITEOK;
@@ -2442,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
          * swap_info_struct.
          */
         plist_add(&p->list, &swap_active_head);
-       spin_lock(&swap_avail_lock);
-       plist_add(&p->avail_list, &swap_avail_head);
-       spin_unlock(&swap_avail_lock);
+       add_to_avail_list(p);
  }
  
  static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2529,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                 spin_unlock(&swap_lock);
                 goto out_dput;
         }
-       spin_lock(&swap_avail_lock);
-       plist_del(&p->avail_list, &swap_avail_head);
-       spin_unlock(&swap_avail_lock);
+       del_from_avail_list(p);
         spin_lock(&p->lock);
         if (p->prio < 0) {
                 struct swap_info_struct *si = p;
+               int nid;
  
                 plist_for_each_entry_continue(si, &swap_active_head, list) {
                         si->prio++;
                         si->list.prio--;
-                       si->avail_list.prio--;
+                       for_each_node(nid) {
+                               if (si->avail_lists[nid].prio != 1)
+                                       si->avail_lists[nid].prio--;
+                       }
                 }
                 least_priority++;
         }
@@ -2783,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
  {
         struct swap_info_struct *p;
         unsigned int type;
+       int i;
  
         p = kzalloc(sizeof(*p), GFP_KERNEL);
         if (!p)
@@ -2818,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
         }
         INIT_LIST_HEAD(&p->first_swap_extent.list);
         plist_node_init(&p->list, 0);
-       plist_node_init(&p->avail_list, 0);
+       for_each_node(i)
+               plist_node_init(&p->avail_lists[i], 0);
         p->flags = SWP_USED;
         spin_unlock(&swap_lock);
         spin_lock_init(&p->lock);
@@ -3060,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
  
+       if (!swap_avail_heads)
+               return -ENOMEM;
+
         p = alloc_swap_info();
         if (IS_ERR(p))
                 return PTR_ERR(p);
@@ -3645,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
                 }
         }
  }
+
+static int __init swapfile_init(void)
+{
+       int nid;
+
+       swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
+                                        GFP_KERNEL);
+       if (!swap_avail_heads) {
+               pr_emerg("Not enough memory for swap heads, swap is disabled\n");
+               return -ENOMEM;
+       }
+
+       for_each_node(nid)
+               plist_head_init(&swap_avail_heads[nid]);
+
+       return 0;
+}
+subsys_initcall(swapfile_init);
author	Aaron Lu <aaron.lu@intel.com>
	Wed, 6 Sep 2017 23:24:57 +0000 (16:24 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 7 Sep 2017 00:27:30 +0000 (17:27 -0700)
Documentation/vm/swap_numa.txt	[new file with mode: 0644]	patch \| blob
include/linux/swap.h		patch \| blob \| history
mm/swapfile.c		patch \| blob \| history