swapfile: swap allocation use discard

author Hugh Dickins <hugh@veritas.com>

Tue, 6 Jan 2009 22:39:53 +0000 (14:39 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 6 Jan 2009 23:59:05 +0000 (15:59 -0800)
author Hugh Dickins <hugh@veritas.com>
Tue, 6 Jan 2009 22:39:53 +0000 (14:39 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 6 Jan 2009 23:59:05 +0000 (15:59 -0800)
diff --git a/include/linux/swap.h b/include/linux/swap.h

index 0b9210ea96c76be517e76e92e36115f46277059f..fe79f44c858e980cbcd8926ddf7b07afcd6db0af 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -121,6 +121,7 @@ enum {
         SWP_USED        = (1 << 0),     /* is slot in swap_info[] used? */
         SWP_WRITEOK     = (1 << 1),     /* ok to write to this swap?    */
         SWP_DISCARDABLE = (1 << 2),     /* blkdev supports discard */
+       SWP_DISCARDING  = (1 << 3),     /* now discarding a free cluster */
                                         /* add others here before... */
         SWP_SCANNING    = (1 << 8),     /* refcount in scan_swap_map */
  };
@@ -144,6 +145,8 @@ struct swap_info_struct {
         unsigned short *swap_map;
         unsigned int lowest_bit;
         unsigned int highest_bit;
+       unsigned int lowest_alloc;      /* while preparing discard cluster */
+       unsigned int highest_alloc;     /* while preparing discard cluster */
         unsigned int cluster_next;
         unsigned int cluster_nr;
         unsigned int pages;
diff --git a/mm/swapfile.c b/mm/swapfile.c

index fbeb4bb8eb50b2d2091db7a5e6a0846d016e7f10..ca75b9e7c09f6efd864fe78f45ec95627bef903d 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si)
         return err;             /* That will often be -EOPNOTSUPP */
  }
  
+/*
+ * Swap allocation tells the device that a cluster of swap can now be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+                                pgoff_t start_page, pgoff_t nr_pages)
+{
+       struct swap_extent *se = si->curr_swap_extent;
+       int found_extent = 0;
+
+       while (nr_pages) {
+               struct list_head *lh;
+
+               if (se->start_page <= start_page &&
+                   start_page < se->start_page + se->nr_pages) {
+                       pgoff_t offset = start_page - se->start_page;
+                       sector_t start_block = se->start_block + offset;
+                       pgoff_t nr_blocks = se->nr_pages - offset;
+
+                       if (nr_blocks > nr_pages)
+                               nr_blocks = nr_pages;
+                       start_page += nr_blocks;
+                       nr_pages -= nr_blocks;
+
+                       if (!found_extent++)
+                               si->curr_swap_extent = se;
+
+                       start_block <<= PAGE_SHIFT - 9;
+                       nr_blocks <<= PAGE_SHIFT - 9;
+                       if (blkdev_issue_discard(si->bdev, start_block,
+                                                       nr_blocks, GFP_NOIO))
+                               break;
+               }
+
+               lh = se->list.next;
+               if (lh == &si->extent_list)
+                       lh = lh->next;
+               se = list_entry(lh, struct swap_extent, list);
+       }
+}
+
+static int wait_for_discard(void *word)
+{
+       schedule();
+       return 0;
+}
+
  #define SWAPFILE_CLUSTER       256
  #define LATENCY_LIMIT          256
  
  static inline unsigned long scan_swap_map(struct swap_info_struct *si)
  {
         unsigned long offset;
-       unsigned long last_in_cluster;
+       unsigned long last_in_cluster = 0;
         int latency_ration = LATENCY_LIMIT;
+       int found_free_cluster = 0;
  
         /*
          * We try to cluster swap pages by allocating them sequentially
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
                         si->cluster_nr = SWAPFILE_CLUSTER - 1;
                         goto checks;
                 }
+               if (si->flags & SWP_DISCARDABLE) {
+                       /*
+                        * Start range check on racing allocations, in case
+                        * they overlap the cluster we eventually decide on
+                        * (we scan without swap_lock to allow preemption).
+                        * It's hardly conceivable that cluster_nr could be
+                        * wrapped during our scan, but don't depend on it.
+                        */
+                       if (si->lowest_alloc)
+                               goto checks;
+                       si->lowest_alloc = si->max;
+                       si->highest_alloc = 0;
+               }
                 spin_unlock(&swap_lock);
  
                 offset = si->lowest_bit;
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
                                 offset -= SWAPFILE_CLUSTER - 1;
                                 si->cluster_next = offset;
                                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
+                               found_free_cluster = 1;
                                 goto checks;
                         }
                         if (unlikely(--latency_ration < 0)) {
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
                 offset = si->lowest_bit;
                 spin_lock(&swap_lock);
                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
+               si->lowest_alloc = 0;
         }
  
  checks:
@@ -191,6 +254,60 @@ checks:
         si->swap_map[offset] = 1;
         si->cluster_next = offset + 1;
         si->flags -= SWP_SCANNING;
+
+       if (si->lowest_alloc) {
+               /*
+                * Only set when SWP_DISCARDABLE, and there's a scan
+                * for a free cluster in progress or just completed.
+                */
+               if (found_free_cluster) {
+                       /*
+                        * To optimize wear-levelling, discard the
+                        * old data of the cluster, taking care not to
+                        * discard any of its pages that have already
+                        * been allocated by racing tasks (offset has
+                        * already stepped over any at the beginning).
+                        */
+                       if (offset < si->highest_alloc &&
+                           si->lowest_alloc <= last_in_cluster)
+                               last_in_cluster = si->lowest_alloc - 1;
+                       si->flags |= SWP_DISCARDING;
+                       spin_unlock(&swap_lock);
+
+                       if (offset < last_in_cluster)
+                               discard_swap_cluster(si, offset,
+                                       last_in_cluster - offset + 1);
+
+                       spin_lock(&swap_lock);
+                       si->lowest_alloc = 0;
+                       si->flags &= ~SWP_DISCARDING;
+
+                       smp_mb();       /* wake_up_bit advises this */
+                       wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
+
+               } else if (si->flags & SWP_DISCARDING) {
+                       /*
+                        * Delay using pages allocated by racing tasks
+                        * until the whole discard has been issued. We
+                        * could defer that delay until swap_writepage,
+                        * but it's easier to keep this self-contained.
+                        */
+                       spin_unlock(&swap_lock);
+                       wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
+                               wait_for_discard, TASK_UNINTERRUPTIBLE);
+                       spin_lock(&swap_lock);
+               } else {
+                       /*
+                        * Note pages allocated by racing tasks while
+                        * scan for a free cluster is in progress, so
+                        * that its final discard can exclude them.
+                        */
+                       if (offset < si->lowest_alloc)
+                               si->lowest_alloc = offset;
+                       if (offset > si->highest_alloc)
+                               si->highest_alloc = offset;
+               }
+       }
         return offset;
  
  scan:
author	Hugh Dickins <hugh@veritas.com>
	Tue, 6 Jan 2009 22:39:53 +0000 (14:39 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 6 Jan 2009 23:59:05 +0000 (15:59 -0800)
include/linux/swap.h		patch | blob | history
mm/swapfile.c		patch | blob | history