ext4: completed_io locking cleanup
authorDmitry Monakhov <dmonakhov@openvz.org>
Sat, 29 Sep 2012 04:14:55 +0000 (00:14 -0400)
committerTheodore Ts'o <tytso@mit.edu>
Sat, 29 Sep 2012 04:14:55 +0000 (00:14 -0400)
Current unwritten extent conversion state-machine is very fuzzy.
- For unknown reason it performs conversion under i_mutex. What for?
  My diagnosis:
  We already protect extent tree with i_data_sem, truncate and punch_hole
  should wait for DIO, so the only data we have to protect is end_io->flags
  modification, but only flush_completed_IO and end_io_work modified this
  flags and we can serialize them via i_completed_io_lock.

  Currently all these games with mutex_trylock result in the following deadlock
   truncate:                          kworker:
    ext4_setattr                       ext4_end_io_work
    mutex_lock(i_mutex)
    inode_dio_wait(inode)  ->BLOCK
                             DEADLOCK<- mutex_trylock()
                                        inode_dio_done()
  #TEST_CASE1_BEGIN
  MNT=/mnt_scrach
  unlink $MNT/file
  fallocate -l $((1024*1024*1024)) $MNT/file
  aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
  sleep 2
  truncate -s 0 $MNT/file
  #TEST_CASE1_END

Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286

This patch makes state machine simple and clean:

(1) xxx_end_io schedule final extent conversion simply by calling
    ext4_add_complete_io(), which append it to ei->i_completed_io_list
    NOTE1: because of (2A) work should be queued only if
    ->i_completed_io_list was empty, otherwise the work is scheduled already.

(2) ext4_flush_completed_IO is responsible for handling all pending
    end_io from ei->i_completed_io_list
    Flushing sequence consists of following stages:
    A) LOCKED: Atomically drain completed_io_list to local_list
    B) Perform extents conversion
    C) LOCKED: move converted io's to to_free list for final deletion
             This logic depends on context which we was called from.
    D) Final end_io context destruction
    NOTE1: i_mutex is no longer required because end_io->flags modification
    is protected by ei->ext4_complete_io_lock

Full list of changes:
- Move all completion end_io related routines to page-io.c in order to improve
  logic locality
- Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
- remove EXT4_IO_END_FSYNC
- Improve SMP scalability by removing useless i_mutex which does not
  protect io->flags anymore.
- Reduce lock contention on i_completed_io_lock by optimizing list walk.
- Rename ext4_end_io_nolock to end4_end_io and make it static
- Check flush completion status to ext4_ext_punch_hole(). Because it is
  not good idea to punch blocks from corrupted inode.

Changes since V3 (in request to Jan's comments):
  Fall back to active flush_completed_IO() approach in order to prevent
  performance issues with nolocked DIO reads.
Changes since V2:
  Fix use-after-free caused by race truncate vs end_io_work

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/fsync.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/page-io.c

index 6ec9856065d6168d32aad1fc6916b16de6da19c7..edb049579420f5ef9cd02fd9d0360a85db31003f 100644 (file)
@@ -186,7 +186,6 @@ struct mpage_da_data {
 #define EXT4_IO_END_ERROR      0x0002
 #define EXT4_IO_END_QUEUED     0x0004
 #define EXT4_IO_END_DIRECT     0x0008
-#define EXT4_IO_END_IN_FSYNC   0x0010
 
 struct ext4_io_page {
        struct page     *p_page;
@@ -2418,11 +2417,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
 extern void ext4_ioend_wait(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
index 54a94426ef7b2ef91d4b9105280399e3addc8144..232077439aa85499502d4cd9e1cc5cff8c3236e3 100644 (file)
@@ -4833,7 +4833,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
        }
 
        /* finish any pending end_io work */
-       ext4_flush_completed_IO(inode);
+       err = ext4_flush_completed_IO(inode);
+       if (err)
+               return err;
 
        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, credits);
index 323eb15c2d94ca86f237881af336ea639c06d9dc..460000868b8e982fda568d7913c22f8e2cb5159b 100644 (file)
 
 #include <trace/events/ext4.h>
 
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4FS_DEBUG
-       struct list_head *cur, *before, *after;
-       ext4_io_end_t *io, *io0, *io1;
-       unsigned long flags;
-
-       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-               return;
-       }
-
-       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-               cur = &io->list;
-               before = cur->prev;
-               io0 = container_of(before, ext4_io_end_t, list);
-               after = cur->next;
-               io1 = container_of(after, ext4_io_end_t, list);
-
-               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                           io, inode->i_ino, io0, io1);
-       }
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int ext4_flush_completed_IO(struct inode *inode)
-{
-       ext4_io_end_t *io;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       unsigned long flags;
-       int ret = 0;
-       int ret2 = 0;
-
-       dump_completed_IO(inode);
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       while (!list_empty(&ei->i_completed_io_list)){
-               io = list_entry(ei->i_completed_io_list.next,
-                               ext4_io_end_t, list);
-               list_del_init(&io->list);
-               io->flag |= EXT4_IO_END_IN_FSYNC;
-               /*
-                * Calling ext4_end_io_nolock() to convert completed
-                * IO to written.
-                *
-                * When ext4_sync_file() is called, run_queue() may already
-                * about to flush the work corresponding to this io structure.
-                * It will be upset if it founds the io structure related
-                * to the work-to-be schedule is freed.
-                *
-                * Thus we need to keep the io structure still valid here after
-                * conversion finished. The io structure has a flag to
-                * avoid double converting from both fsync and background work
-                * queue work.
-                */
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               ret = ext4_end_io_nolock(io);
-               if (ret < 0)
-                       ret2 = ret;
-               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-               io->flag &= ~EXT4_IO_END_IN_FSYNC;
-       }
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       return (ret2 < 0) ? ret2 : 0;
-}
-
 /*
  * If we're not journaling and this is a just-created file, we have to
  * sync our parent directory (if it was freshly created) since
index 830e1b2bf145c99f1ebdc2198147295d73eff8ac..61f13e57975e53233e60f809aff6b59da63ddfcf 100644 (file)
@@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
        if (rw == READ && ext4_should_dioread_nolock(inode)) {
-               if (unlikely(!list_empty(&ei->i_completed_io_list))) {
-                       mutex_lock(&inode->i_mutex);
+               if (unlikely(!list_empty(&ei->i_completed_io_list)))
                        ext4_flush_completed_IO(inode);
-                       mutex_unlock(&inode->i_mutex);
-               }
+
                ret = __blockdev_direct_IO(rw, iocb, inode,
                                 inode->i_sb->s_bdev, iov,
                                 offset, nr_segs,
index a99588673566f1dddef44538d3b8ca499d269b42..09d0488e9a1595b6fc9b9f3aa31ec198582751ca 100644 (file)
@@ -2881,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 {
        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
         ext4_io_end_t *io_end = iocb->private;
-       struct workqueue_struct *wq;
-       unsigned long flags;
-       struct ext4_inode_info *ei;
 
        /* if not async direct IO or dio with 0 bytes write, just return */
        if (!io_end || !size)
@@ -2912,24 +2909,14 @@ out:
                io_end->iocb = iocb;
                io_end->result = ret;
        }
-       wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
-       /* Add the io_end to per-inode completed aio dio list*/
-       ei = EXT4_I(io_end->inode);
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       list_add_tail(&io_end->list, &ei->i_completed_io_list);
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
+       ext4_add_complete_io(io_end);
 }
 
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 {
        ext4_io_end_t *io_end = bh->b_private;
-       struct workqueue_struct *wq;
        struct inode *inode;
-       unsigned long flags;
 
        if (!test_clear_buffer_uninit(bh) || !io_end)
                goto out;
@@ -2948,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
         */
        inode = io_end->inode;
        ext4_set_io_unwritten_flag(inode, io_end);
-
-       /* Add the io_end to per-inode completed io list*/
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
-       wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
+       ext4_add_complete_io(io_end);
 out:
        bh->b_private = NULL;
        bh->b_end_io = NULL;
index 997002218228424a39c6c415225368e41c076aea..5b24c407701b3c12895e1b3be56959199e0b0ca0 100644 (file)
@@ -71,6 +71,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
        int i;
 
        BUG_ON(!io);
+       BUG_ON(!list_empty(&io->list));
        BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
 
        if (io->page)
@@ -83,21 +84,14 @@ void ext4_free_io_end(ext4_io_end_t *io)
        kmem_cache_free(io_end_cachep, io);
 }
 
-/*
- * check a range of space and convert unwritten extents to written.
- *
- * Called with inode->i_mutex; we depend on this when we manipulate
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
- */
-int ext4_end_io_nolock(ext4_io_end_t *io)
+/* check a range of space and convert unwritten extents to written. */
+static int ext4_end_io(ext4_io_end_t *io)
 {
        struct inode *inode = io->inode;
        loff_t offset = io->offset;
        ssize_t size = io->size;
        int ret = 0;
 
-       BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
-
        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
                   "list->prev 0x%p\n",
                   io, inode->i_ino, io->list.next, io->list.prev);
@@ -110,7 +104,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
                         "(inode %lu, offset %llu, size %zd, error %d)",
                         inode->i_ino, offset, size, ret);
        }
-       io->flag &= ~EXT4_IO_END_UNWRITTEN;
        if (io->iocb)
                aio_complete(io->iocb, io->result, 0);
 
@@ -122,51 +115,122 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
        return ret;
 }
 
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
+static void dump_completed_IO(struct inode *inode)
+{
+#ifdef EXT4FS_DEBUG
+       struct list_head *cur, *before, *after;
+       ext4_io_end_t *io, *io0, *io1;
+       unsigned long flags;
+
+       if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
+               ext4_debug("inode %lu completed_io list is empty\n",
+                          inode->i_ino);
+               return;
+       }
+
+       ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
+       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+               cur = &io->list;
+               before = cur->prev;
+               io0 = container_of(before, ext4_io_end_t, list);
+               after = cur->next;
+               io1 = container_of(after, ext4_io_end_t, list);
+
+               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+                           io, inode->i_ino, io0, io1);
+       }
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+void ext4_add_complete_io(ext4_io_end_t *io_end)
 {
-       ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
-       struct inode            *inode = io->inode;
-       struct ext4_inode_info  *ei = EXT4_I(inode);
-       unsigned long           flags;
+       struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+       struct workqueue_struct *wq;
+       unsigned long flags;
+
+       BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+       wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       if (io->flag & EXT4_IO_END_IN_FSYNC)
-               goto requeue;
-       if (list_empty(&io->list)) {
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               goto free;
+       if (list_empty(&ei->i_completed_io_list)) {
+               io_end->flag |= EXT4_IO_END_QUEUED;
+               queue_work(wq, &io_end->work);
        }
+       list_add_tail(&io_end->list, &ei->i_completed_io_list);
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
 
-       if (!mutex_trylock(&inode->i_mutex)) {
-               bool was_queued;
-requeue:
-               was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
-               io->flag |= EXT4_IO_END_QUEUED;
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               /*
-                * Requeue the work instead of waiting so that the work
-                * items queued after this can be processed.
-                */
-               queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
-               /*
-                * To prevent the ext4-dio-unwritten thread from keeping
-                * requeueing end_io requests and occupying cpu for too long,
-                * yield the cpu if it sees an end_io request that has already
-                * been requeued.
-                */
-               if (was_queued)
-                       yield();
-               return;
+static int ext4_do_flush_completed_IO(struct inode *inode,
+                                     ext4_io_end_t *work_io)
+{
+       ext4_io_end_t *io;
+       struct list_head unwritten, complete, to_free;
+       unsigned long flags;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       int err, ret = 0;
+
+       INIT_LIST_HEAD(&complete);
+       INIT_LIST_HEAD(&to_free);
+
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       dump_completed_IO(inode);
+       list_replace_init(&ei->i_completed_io_list, &unwritten);
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+       while (!list_empty(&unwritten)) {
+               io = list_entry(unwritten.next, ext4_io_end_t, list);
+               BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+               list_del_init(&io->list);
+
+               err = ext4_end_io(io);
+               if (unlikely(!ret && err))
+                       ret = err;
+
+               list_add_tail(&io->list, &complete);
+       }
+       /* It is important to update all flags for all end_io in one shot w/o
+        * dropping the lock.*/
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       while (!list_empty(&complete)) {
+               io = list_entry(complete.next, ext4_io_end_t, list);
+               io->flag &= ~EXT4_IO_END_UNWRITTEN;
+               /* end_io context can not be destroyed now because it still
+                * used by queued worker. Worker thread will destroy it later */
+               if (io->flag & EXT4_IO_END_QUEUED)
+                       list_del_init(&io->list);
+               else
+                       list_move(&io->list, &to_free);
+       }
+       /* If we are called from worker context, it is time to clear queued
+        * flag, and destroy it's end_io if it was converted already */
+       if (work_io) {
+               work_io->flag &= ~EXT4_IO_END_QUEUED;
+               if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
+                       list_add_tail(&work_io->list, &to_free);
        }
-       list_del_init(&io->list);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       (void) ext4_end_io_nolock(io);
-       mutex_unlock(&inode->i_mutex);
-free:
-       ext4_free_io_end(io);
+
+       while (!list_empty(&to_free)) {
+               io = list_entry(to_free.next, ext4_io_end_t, list);
+               list_del_init(&io->list);
+               ext4_free_io_end(io);
+       }
+       return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+       ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+       ext4_do_flush_completed_IO(io->inode, io);
+}
+
+int ext4_flush_completed_IO(struct inode *inode)
+{
+       return ext4_do_flush_completed_IO(inode, NULL);
 }
 
 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -199,9 +263,7 @@ static void buffer_io_error(struct buffer_head *bh)
 static void ext4_end_bio(struct bio *bio, int error)
 {
        ext4_io_end_t *io_end = bio->bi_private;
-       struct workqueue_struct *wq;
        struct inode *inode;
-       unsigned long flags;
        int i;
        sector_t bi_sector = bio->bi_sector;
 
@@ -259,14 +321,7 @@ static void ext4_end_bio(struct bio *bio, int error)
                return;
        }
 
-       /* Add the io_end to per-inode completed io list*/
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
-       wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
+       ext4_add_complete_io(io_end);
 }
 
 void ext4_io_submit(struct ext4_io_submit *io)