ocfs2: Cache extent records
authorMark Fasheh <mark.fasheh@oracle.com>
Tue, 24 Apr 2007 01:53:12 +0000 (18:53 -0700)
committerMark Fasheh <mark.fasheh@oracle.com>
Thu, 26 Apr 2007 22:10:40 +0000 (15:10 -0700)
The extent map code was ripped out earlier because of an inability to deal
with holes. This patch adds back a simpler caching scheme requiring far less
code.

Our old extent map caching was designed back when meta data block caching in
Ocfs2 didn't work very well, resulting in many disk reads. These days our
metadata caching is much better, resulting in no un-necessary disk reads. As
a result, extent caching doesn't have to be as fancy, nor does it have to
cache as many extents. Keeping the last 3 extents seen should be sufficient
to give us a small performance boost on some streaming workloads.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
fs/ocfs2/alloc.c
fs/ocfs2/dlmglue.c
fs/ocfs2/extent_map.c
fs/ocfs2/extent_map.h
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/super.c

index 412a2888a3ed3e80effbb8293fb2c4a3d282dea1..a0c8667caa722142ce7114ca73e240409aa8b478 100644 (file)
@@ -2417,6 +2417,8 @@ out_add:
        status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
        if (status < 0)
                mlog_errno(status);
+       else
+               ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
        if (bh)
@@ -3640,6 +3642,9 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
                mlog_errno(status);
                goto bail;
        }
+
+       ocfs2_extent_map_trunc(inode, new_highest_cpos);
+
 start:
        /*
         * Check that we still have allocation to delete.
index 43267eea3538dbb5c0ca183f8828152712aacab2..27e43b0c0eaeef63e1564f33e6796717e832ca89 100644 (file)
@@ -1613,6 +1613,8 @@ static int ocfs2_meta_lock_update(struct inode *inode,
         * for the inode metadata. */
        ocfs2_metadata_cache_purge(inode);
 
+       ocfs2_extent_map_trunc(inode, 0);
+
        if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
                mlog(0, "Trusting LVB on inode %llu\n",
                     (unsigned long long)oi->ip_blkno);
index f35e04f27f3230149ca98e985f8ea541a2c81b7f..ba2b2ab1c6e4ea1d98341c87774d7843897eb69e 100644 (file)
 
 #include "buffer_head_io.h"
 
+/*
+ * The extent caching implementation is intentionally trivial.
+ *
+ * We only cache a small number of extents stored directly on the
+ * inode, so linear order operations are acceptable. If we ever want
+ * to increase the size of the extent map, then these algorithms must
+ * get smarter.
+ */
+
+void ocfs2_extent_map_init(struct inode *inode)
+{
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+       oi->ip_extent_map.em_num_items = 0;
+       INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
+}
+
+static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+                                     unsigned int cpos,
+                                     struct ocfs2_extent_map_item **ret_emi)
+{
+       unsigned int range;
+       struct ocfs2_extent_map_item *emi;
+
+       *ret_emi = NULL;
+
+       list_for_each_entry(emi, &em->em_list, ei_list) {
+               range = emi->ei_cpos + emi->ei_clusters;
+
+               if (cpos >= emi->ei_cpos && cpos < range) {
+                       list_move(&emi->ei_list, &em->em_list);
+
+                       *ret_emi = emi;
+                       break;
+               }
+       }
+}
+
+static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
+                                  unsigned int *phys, unsigned int *len,
+                                  unsigned int *flags)
+{
+       unsigned int coff;
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
+       struct ocfs2_extent_map_item *emi;
+
+       spin_lock(&oi->ip_lock);
+
+       __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
+       if (emi) {
+               coff = cpos - emi->ei_cpos;
+               *phys = emi->ei_phys + coff;
+               if (len)
+                       *len = emi->ei_clusters - coff;
+               if (flags)
+                       *flags = emi->ei_flags;
+       }
+
+       spin_unlock(&oi->ip_lock);
+
+       if (emi == NULL)
+               return -ENOENT;
+
+       return 0;
+}
+
+/*
+ * Forget about all clusters equal to or greater than cpos.
+ */
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
+{
+       struct list_head *p, *n;
+       struct ocfs2_extent_map_item *emi;
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
+       struct ocfs2_extent_map *em = &oi->ip_extent_map;
+       LIST_HEAD(tmp_list);
+       unsigned int range;
+
+       spin_lock(&oi->ip_lock);
+       list_for_each_safe(p, n, &em->em_list) {
+               emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+
+               if (emi->ei_cpos >= cpos) {
+                       /* Full truncate of this record. */
+                       list_move(&emi->ei_list, &tmp_list);
+                       BUG_ON(em->em_num_items == 0);
+                       em->em_num_items--;
+                       continue;
+               }
+
+               range = emi->ei_cpos + emi->ei_clusters;
+               if (range > cpos) {
+                       /* Partial truncate */
+                       emi->ei_clusters = cpos - emi->ei_cpos;
+               }
+       }
+       spin_unlock(&oi->ip_lock);
+
+       list_for_each_safe(p, n, &tmp_list) {
+               emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+               list_del(&emi->ei_list);
+               kfree(emi);
+       }
+}
+
+/*
+ * Is any part of emi2 contained within emi1
+ */
+static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
+                                struct ocfs2_extent_map_item *emi2)
+{
+       unsigned int range1, range2;
+
+       /*
+        * Check if logical start of emi2 is inside emi1
+        */
+       range1 = emi1->ei_cpos + emi1->ei_clusters;
+       if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
+               return 1;
+
+       /*
+        * Check if logical end of emi2 is inside emi1
+        */
+       range2 = emi2->ei_cpos + emi2->ei_clusters;
+       if (range2 > emi1->ei_cpos && range2 <= range1)
+               return 1;
+
+       return 0;
+}
+
+static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
+                                 struct ocfs2_extent_map_item *src)
+{
+       dest->ei_cpos = src->ei_cpos;
+       dest->ei_phys = src->ei_phys;
+       dest->ei_clusters = src->ei_clusters;
+       dest->ei_flags = src->ei_flags;
+}
+
+/*
+ * Try to merge emi with ins. Returns 1 if merge succeeds, zero
+ * otherwise.
+ */
+static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
+                                        struct ocfs2_extent_map_item *ins)
+{
+       /*
+        * Handle contiguousness
+        */
+       if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
+           ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
+           ins->ei_flags == emi->ei_flags) {
+               emi->ei_clusters += ins->ei_clusters;
+               return 1;
+       } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
+                  (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys &&
+                  ins->ei_flags == emi->ei_flags) {
+               emi->ei_phys = ins->ei_phys;
+               emi->ei_cpos = ins->ei_cpos;
+               emi->ei_clusters += ins->ei_clusters;
+               return 1;
+       }
+
+       /*
+        * Overlapping extents - this shouldn't happen unless we've
+        * split an extent to change it's flags. That is exceedingly
+        * rare, so there's no sense in trying to optimize it yet.
+        */
+       if (ocfs2_ei_is_contained(emi, ins) ||
+           ocfs2_ei_is_contained(ins, emi)) {
+               ocfs2_copy_emi_fields(emi, ins);
+               return 1;
+       }
+
+       /* No merge was possible. */
+       return 0;
+}
+
+/*
+ * In order to reduce complexity on the caller, this insert function
+ * is intentionally liberal in what it will accept.
+ *
+ * The only rule is that the truncate call *must* be used whenever
+ * records have been deleted. This avoids inserting overlapping
+ * records with different physical mappings.
+ */
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+                                struct ocfs2_extent_rec *rec)
+{
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
+       struct ocfs2_extent_map *em = &oi->ip_extent_map;
+       struct ocfs2_extent_map_item *emi, *new_emi = NULL;
+       struct ocfs2_extent_map_item ins;
+
+       ins.ei_cpos = le32_to_cpu(rec->e_cpos);
+       ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
+                                              le64_to_cpu(rec->e_blkno));
+       ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
+       ins.ei_flags = rec->e_flags;
+
+search:
+       spin_lock(&oi->ip_lock);
+
+       list_for_each_entry(emi, &em->em_list, ei_list) {
+               if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
+                       list_move(&emi->ei_list, &em->em_list);
+                       spin_unlock(&oi->ip_lock);
+                       goto out;
+               }
+       }
+
+       /*
+        * No item could be merged.
+        *
+        * Either allocate and add a new item, or overwrite the last recently
+        * inserted.
+        */
+
+       if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
+               if (new_emi == NULL) {
+                       spin_unlock(&oi->ip_lock);
+
+                       new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
+                       if (new_emi == NULL)
+                               goto out;
+
+                       goto search;
+               }
+
+               ocfs2_copy_emi_fields(new_emi, &ins);
+               list_add(&new_emi->ei_list, &em->em_list);
+               em->em_num_items++;
+               new_emi = NULL;
+       } else {
+               BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
+               emi = list_entry(em->em_list.prev,
+                                struct ocfs2_extent_map_item, ei_list);
+               list_move(&emi->ei_list, &em->em_list);
+               ocfs2_copy_emi_fields(emi, &ins);
+       }
+
+       spin_unlock(&oi->ip_lock);
+
+out:
+       if (new_emi)
+               kfree(new_emi);
+}
+
 /*
  * Return the 1st index within el which contains an extent start
  * larger than v_cluster.
@@ -174,6 +422,11 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
        struct ocfs2_extent_rec *rec;
        u32 coff;
 
+       ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+                                     num_clusters, extent_flags);
+       if (ret == 0)
+               goto out;
+
        ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
                               &di_bh, OCFS2_BH_CACHED, inode);
        if (ret) {
@@ -245,6 +498,8 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
                        *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
 
                flags = rec->e_flags;
+
+               ocfs2_extent_map_insert_rec(inode, rec);
        }
 
        if (extent_flags)
index 1d745e174afc01862e33d2e2a40a1916b873c070..de91e3e41a224ac7f1eb39f1e35bdd22034d811a 100644 (file)
 #ifndef _EXTENT_MAP_H
 #define _EXTENT_MAP_H
 
+struct ocfs2_extent_map_item {
+       unsigned int                    ei_cpos;
+       unsigned int                    ei_phys;
+       unsigned int                    ei_clusters;
+       unsigned int                    ei_flags;
+
+       struct list_head                ei_list;
+};
+
+#define OCFS2_MAX_EXTENT_MAP_ITEMS                     3
+struct ocfs2_extent_map {
+       unsigned int                    em_num_items;
+       struct list_head                em_list;
+};
+
+void ocfs2_extent_map_init(struct inode *inode);
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+                                struct ocfs2_extent_rec *rec);
+
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
                       u32 *num_clusters, unsigned int *extent_flags);
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
index 4bfc98c7013761a83a65c39a4fbb045077556e96..21a605079c62328b5c2b9a4615f3f7bb444dee26 100644 (file)
@@ -1008,6 +1008,8 @@ void ocfs2_clear_inode(struct inode *inode)
                        "Clear inode of %llu, inode has io markers\n",
                        (unsigned long long)oi->ip_blkno);
 
+       ocfs2_extent_map_trunc(inode, 0);
+
        status = ocfs2_drop_inode_locks(inode);
        if (status < 0)
                mlog_errno(status);
index aa84353d0d196d70983e40ace13063018ab5d7d6..03ae075869ee5a681a1cafd47e2ee3fa7fc13bda 100644 (file)
@@ -26,6 +26,8 @@
 #ifndef OCFS2_INODE_H
 #define OCFS2_INODE_H
 
+#include "extent_map.h"
+
 /* OCFS2 Inode Private Data */
 struct ocfs2_inode_info
 {
@@ -63,6 +65,8 @@ struct ocfs2_inode_info
 
        struct ocfs2_caching_info       ip_metadata_cache;
 
+       struct ocfs2_extent_map         ip_extent_map;
+
        struct inode                    vfs_inode;
 };
 
index 6ab52351943ab81df03b01353f15092a3f8bc284..5c9e8243691f6be9695f57334ffd1da71c612699 100644 (file)
@@ -942,6 +942,7 @@ static void ocfs2_inode_init_once(void *data,
                oi->ip_flags = 0;
                oi->ip_open_count = 0;
                spin_lock_init(&oi->ip_lock);
+               ocfs2_extent_map_init(&oi->vfs_inode);
                INIT_LIST_HEAD(&oi->ip_io_markers);
                oi->ip_created_trans = 0;
                oi->ip_last_trans = 0;