ore: Only IO one group at a time (API change)

author Boaz Harrosh <bharrosh@panasas.com>

Wed, 28 Sep 2011 08:55:51 +0000 (11:55 +0300)

committer Boaz Harrosh <bharrosh@panasas.com>

Fri, 14 Oct 2011 16:52:50 +0000 (18:52 +0200)
author Boaz Harrosh <bharrosh@panasas.com>
Wed, 28 Sep 2011 08:55:51 +0000 (11:55 +0300)
committer Boaz Harrosh <bharrosh@panasas.com>
Fri, 14 Oct 2011 16:52:50 +0000 (18:52 +0200)
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c

index 61b2f7e5cdbdb55d262e5a67bb430990e5abac2c..d87c1f7562fb9c828a3fa617dde37c493b1280b2 100644 (file)
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -259,6 +259,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
         }
  }
  
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+       struct page_collect *pcol_src, struct page_collect *pcol)
+{
+       /* length was wrong or offset was not page aligned */
+       BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+       if (pcol_src->nr_pages > ios->nr_pages) {
+               struct page **src_page;
+               unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+               unsigned long len_less = pcol_src->length - ios->length;
+               unsigned i;
+               int ret;
+
+               /* This IO was trimmed */
+               pcol_src->nr_pages = ios->nr_pages;
+               pcol_src->length = ios->length;
+
+               /* Left over pages are passed to the next io */
+               pcol->expected_pages += pages_less;
+               pcol->nr_pages = pages_less;
+               pcol->length = len_less;
+               src_page = pcol_src->pages + pcol_src->nr_pages;
+               pcol->pg_first = (*src_page)->index;
+
+               ret = pcol_try_alloc(pcol);
+               if (unlikely(ret))
+                       return ret;
+
+               for (i = 0; i < pages_less; ++i)
+                       pcol->pages[i] = *src_page++;
+
+               EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+                       "pages_less=0x%x expected_pages=0x%x "
+                       "next_offset=0x%llx next_len=0x%lx\n",
+                       pcol_src->nr_pages, pages_less, pcol->expected_pages,
+                       pcol->pg_first * PAGE_SIZE, pcol->length);
+       }
+       return 0;
+}
+
  static int read_exec(struct page_collect *pcol)
  {
         struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -280,7 +320,6 @@ static int read_exec(struct page_collect *pcol)
  
         ios = pcol->ios;
         ios->pages = pcol->pages;
-       ios->nr_pages = pcol->nr_pages;
  
         if (pcol->read_4_write) {
                 ore_read(pcol->ios);
@@ -296,17 +335,23 @@ static int read_exec(struct page_collect *pcol)
         *pcol_copy = *pcol;
         ios->done = readpages_done;
         ios->private = pcol_copy;
+
+       /* pages ownership was passed to pcol_copy */
+       _pcol_reset(pcol);
+
+       ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+       if (unlikely(ret))
+               goto err;
+
+       EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+               pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
         ret = ore_read(ios);
         if (unlikely(ret))
                 goto err;
  
         atomic_inc(&pcol->sbi->s_curr_pending);
  
-       EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-                 oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
-
-       /* pages ownership was passed to pcol_copy */
-       _pcol_reset(pcol);
         return 0;
  
  err:
@@ -429,6 +474,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
                 return ret;
         }
  
+       ret = read_exec(&pcol);
+       if (unlikely(ret))
+               return ret;
+
         return read_exec(&pcol);
  }
  
@@ -519,7 +568,6 @@ static int write_exec(struct page_collect *pcol)
         ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
                                  pcol->pg_first << PAGE_CACHE_SHIFT,
                                  pcol->length, &pcol->ios);
-
         if (unlikely(ret))
                 goto err;
  
@@ -534,10 +582,19 @@ static int write_exec(struct page_collect *pcol)
  
         ios = pcol->ios;
         ios->pages = pcol_copy->pages;
-       ios->nr_pages = pcol_copy->nr_pages;
         ios->done = writepages_done;
         ios->private = pcol_copy;
  
+       /* pages ownership was passed to pcol_copy */
+       _pcol_reset(pcol);
+
+       ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+       if (unlikely(ret))
+               goto err;
+
+       EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+               pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
         ret = ore_write(ios);
         if (unlikely(ret)) {
                 EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +602,6 @@ static int write_exec(struct page_collect *pcol)
         }
  
         atomic_inc(&pcol->sbi->s_curr_pending);
-       EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-                 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
-                 pcol->length);
-       /* pages ownership was passed to pcol_copy */
-       _pcol_reset(pcol);
         return 0;
  
  err:
@@ -689,12 +741,30 @@ static int exofs_writepages(struct address_space *mapping,
         _pcol_init(&pcol, expected_pages, mapping->host);
  
         ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
-       if (ret) {
+       if (unlikely(ret)) {
                 EXOFS_ERR("write_cache_pages => %d\n", ret);
                 return ret;
         }
  
-       return write_exec(&pcol);
+       ret = write_exec(&pcol);
+       if (unlikely(ret))
+               return ret;
+
+       if (wbc->sync_mode == WB_SYNC_ALL) {
+               return write_exec(&pcol); /* pump the last reminder */
+       } else if (pcol.nr_pages) {
+               /* not SYNC let the reminder join the next writeout */
+               unsigned i;
+
+               for (i = 0; i < pcol.nr_pages; i++) {
+                       struct page *page = pcol.pages[i];
+
+                       end_page_writeback(page);
+                       set_page_dirty(page);
+                       unlock_page(page);
+               }
+       }
+       return 0;
  }
  
  static int exofs_writepage(struct page *page, struct writeback_control *wbc)
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c

index a7d79257fc65c2c82f4d0a08b6507dd9bad9a57c..c1c2cc607adf592f888e67320e1cba58c1474a77 100644 (file)
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -47,6 +47,9 @@ MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
  MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
  MODULE_LICENSE("GPL");
  
+static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+                                struct ore_striping_info *si);
+
  static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
  {
         return ios->oc->comps[index & ios->oc->single_comp].cred;
@@ -62,38 +65,85 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
         return ore_comp_dev(ios->oc, index);
  }
  
-int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
-                     bool is_reading, u64 offset, u64 length,
-                     struct ore_io_state **pios)
+static int  _get_io_state(struct ore_layout *layout,
+                         struct ore_components *oc, unsigned numdevs,
+                         struct ore_io_state **pios)
  {
         struct ore_io_state *ios;
  
         /*TODO: Maybe use kmem_cach per sbi of size
          * exofs_io_state_size(layout->s_numdevs)
          */
-       ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL);
+       ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL);
         if (unlikely(!ios)) {
                 ORE_DBGMSG("Failed kzalloc bytes=%d\n",
-                            ore_io_state_size(oc->numdevs));
+                          ore_io_state_size(numdevs));
                 *pios = NULL;
                 return -ENOMEM;
         }
  
         ios->layout = layout;
         ios->oc = oc;
-       ios->offset = offset;
-       ios->length = length;
+       *pios = ios;
+       return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used because
+ * it allocates extra stuff for striping and raid.
+ * The ore might decide to IO fewer than @length bytes due to alignments
+ * and constraints as follows:
+ * - The IO cannot cross a group boundary.
+ * - In raid5/6 the end of the IO must align at the end of a stripe, e.g.
+ *   (@offset + @length) % strip_size == 0, or the complete range is within a
+ *   single stripe.
+ * - Memory conditions only permitted a shorter IO. (A user can use @length=~0
+ *   and check the returned ios->length for max_io_size.)
+ *
+ * The caller must check the returned ios->length (and/or ios->nr_pages) and
+ * re-issue the pages that fall outside of ios->length.
+ */
+int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
+                     bool is_reading, u64 offset, u64 length,
+                     struct ore_io_state **pios)
+{
+       struct ore_io_state *ios;
+       unsigned numdevs = layout->group_width * layout->mirrors_p1;
+       int ret;
+
+       ret = _get_io_state(layout, oc, numdevs, pios);
+       if (unlikely(ret))
+               return ret;
+
+       ios = *pios;
         ios->reading = is_reading;
+       ios->offset = offset;
+
+       if (length) {
+               struct ore_striping_info si;
+
+               ore_calc_stripe_info(layout, offset, &si);
+               ios->length = (length <= si.group_length) ? length :
+                                                       si.group_length;
+               ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+       }
  
-       *pios = ios;
         return 0;
  }
  EXPORT_SYMBOL(ore_get_rw_state);
  
+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wasteful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0
+ */
  int  ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
-                     struct ore_io_state **ios)
+                     struct ore_io_state **pios)
  {
-       return ore_get_rw_state(layout, oc, true, 0, 0, ios);
+       return _get_io_state(layout, oc, oc->numdevs, pios);
  }
  EXPORT_SYMBOL(ore_get_io_state);
  
@@ -374,12 +424,12 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
         unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
         unsigned dev = si->dev;
         unsigned first_dev = dev - (dev % devs_in_group);
-       unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
         unsigned cur_pg = ios->pages_consumed;
         int ret = 0;
  
         while (length) {
-               struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
+               unsigned comp = dev - first_dev;
+               struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
                 unsigned cur_len, page_off = 0;
  
                 if (!per_dev->length) {
@@ -397,9 +447,6 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
                                 per_dev->offset = si->obj_offset - si->unit_off;
                                 cur_len = stripe_unit;
                         }
-
-                       if (max_comp < dev)
-                               max_comp = dev;
                 } else {
                         cur_len = stripe_unit;
                 }
@@ -417,17 +464,15 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
                 length -= cur_len;
         }
  out:
-       ios->numdevs = max_comp + mirrors_p1;
+       ios->numdevs = devs_in_group;
         ios->pages_consumed = cur_pg;
         return ret;
  }
  
  static int _prepare_for_striping(struct ore_io_state *ios)
  {
-       u64 length = ios->length;
-       u64 offset = ios->offset;
         struct ore_striping_info si;
-       int ret = 0;
+       int ret;
  
         if (!ios->pages) {
                 if (ios->kern_buff) {
@@ -446,21 +491,11 @@ static int _prepare_for_striping(struct ore_io_state *ios)
                 return 0;
         }
  
-       while (length) {
-               ore_calc_stripe_info(ios->layout, offset, &si);
-
-               if (length < si.group_length)
-                       si.group_length = length;
+       ore_calc_stripe_info(ios->layout, ios->offset, &si);
  
-               ret = _prepare_one_group(ios, si.group_length, &si);
-               if (unlikely(ret))
-                       goto out;
+       BUG_ON(ios->length > si.group_length);
+       ret = _prepare_one_group(ios, ios->length, &si);
  
-               offset += si.group_length;
-               length -= si.group_length;
-       }
-
-out:
         return ret;
  }
  
@@ -742,7 +777,6 @@ struct _trunc_info {
  
         unsigned first_group_dev;
         unsigned nex_group_dev;
-       unsigned max_devs;
  };
  
  static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
@@ -757,7 +791,6 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
  
         ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
         ti->nex_group_dev = ti->first_group_dev + layout->group_width;
-       ti->max_devs = layout->group_width * layout->group_count;
  }
  
  int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
@@ -777,7 +810,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
  
         _calc_trunk_info(ios->layout, size, &ti);
  
-       size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
+       size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
                              GFP_KERNEL);
         if (unlikely(!size_attrs)) {
                 ret = -ENOMEM;
@@ -786,7 +819,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
  
         ios->numdevs = ios->oc->numdevs;
  
-       for (i = 0; i < ti.max_devs; ++i) {
+       for (i = 0; i < ios->numdevs; ++i) {
                 struct exofs_trunc_attr *size_attr = &size_attrs[i];
                 u64 obj_size;
author	Boaz Harrosh <bharrosh@panasas.com>
	Wed, 28 Sep 2011 08:55:51 +0000 (11:55 +0300)
committer	Boaz Harrosh <bharrosh@panasas.com>
	Fri, 14 Oct 2011 16:52:50 +0000 (18:52 +0200)
fs/exofs/inode.c		patch \| blob \| history
fs/exofs/ore.c		patch \| blob \| history