ext4: fix ext4_flush_completed_IO wait semantics
authorDmitry Monakhov <dmonakhov@openvz.org>
Fri, 5 Oct 2012 15:31:55 +0000 (11:31 -0400)
committerTheodore Ts'o <tytso@mit.edu>
Fri, 5 Oct 2012 15:31:55 +0000 (11:31 -0400)
BUG #1) All places where we call ext4_flush_completed_IO are broken
    because buffered io and DIO/AIO goes through three stages
    1) submitted io,
    2) completed io (in i_completed_io_list) conversion pended
    3) finished  io (conversion done)
    And by calling ext4_flush_completed_IO we will flush only
    requests which were in (2) stage, which is wrong because:
     1) punch_hole and truncate _must_ wait for all outstanding unwritten io
      regardless to it's state.
     2) fsync and nolock_dio_read should also wait because there is
        a time window between end_page_writeback() and ext4_add_complete_io()
        As result integrity fsync is broken in case of buffered write
        to fallocated region:
        fsync                                      blkdev_completion
 ->filemap_write_and_wait_range
                                                   ->ext4_end_bio
                                                     ->end_page_writeback
          <-- filemap_write_and_wait_range return
 ->ext4_flush_completed_IO
     sees empty i_completed_io_list but pended
     conversion still exist
                                                     ->ext4_add_complete_io

BUG #2) Race window becomes wider due to the 'ext4: completed_io
locking cleanup V4' patch series

This patch make following changes:
1) ext4_flush_completed_io() now first try to flush completed io and when
   wait for any outstanding unwritten io via ext4_unwritten_wait()
2) Rename function to more appropriate name.
3) Assert that all callers of ext4_flush_unwritten_io should hold i_mutex to
   prevent endless wait

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/fsync.c
fs/ext4/indirect.c
fs/ext4/page-io.c

index 1be2b4472a83c142f271c15b3dcdf490e6b649f3..3ab2539b7b2eb477222a3dbb7dd8b51cea3cca84 100644 (file)
@@ -1947,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_completed_IO(struct inode *);
+extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2371,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations;
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
 extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+extern void ext4_unwritten_wait(struct inode *inode);
 
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
index c1fcf489e056dc95e4e8711bdc369a668fa67dbd..1c94cca35ed1a12ce42ce39bddea7c9dd23111cf 100644 (file)
@@ -4268,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode)
         * finish any pending end_io work so we won't run the risk of
         * converting any truncated blocks to initialized later
         */
-       ext4_flush_completed_IO(inode);
+       ext4_flush_unwritten_io(inode);
 
        /*
         * probably first extent we're gonna free will be last in block
@@ -4847,10 +4847,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 
        /* Wait all existing dio workers, newcomers will block on i_mutex */
        ext4_inode_block_unlocked_dio(inode);
-       inode_dio_wait(inode);
-       err = ext4_flush_completed_IO(inode);
+       err = ext4_flush_unwritten_io(inode);
        if (err)
                goto out_dio;
+       inode_dio_wait(inode);
 
        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, credits);
index 39335bda404bd14aaa53ca5997ac39cb2c65ef46..ca6f07afe60154ecd3b996946e78173e52000487 100644 (file)
@@ -55,7 +55,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
        return 0;
 }
 
-static void ext4_unwritten_wait(struct inode *inode)
+void ext4_unwritten_wait(struct inode *inode)
 {
        wait_queue_head_t *wq = ext4_ioend_wq(inode);
 
index 460000868b8e982fda568d7913c22f8e2cb5159b..be1d89f385b42e2268f879f8ee6f8478bb967081 100644 (file)
@@ -138,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        if (inode->i_sb->s_flags & MS_RDONLY)
                goto out;
 
-       ret = ext4_flush_completed_IO(inode);
+       ret = ext4_flush_unwritten_io(inode);
        if (ret < 0)
                goto out;
 
index 8d849dae84283dcca34e6f10918b4f4844e52375..792e388e7b444bece02de64261be376227bd22bc 100644 (file)
@@ -807,9 +807,11 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
        if (rw == READ && ext4_should_dioread_nolock(inode)) {
-               if (unlikely(!list_empty(&ei->i_completed_io_list)))
-                       ext4_flush_completed_IO(inode);
-
+               if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
+                       mutex_lock(&inode->i_mutex);
+                       ext4_flush_unwritten_io(inode);
+                       mutex_unlock(&inode->i_mutex);
+               }
                /*
                 * Nolock dioread optimization may be dynamically disabled
                 * via ext4_inode_block_unlocked_dio(). Check inode's state
index 5b24c407701b3c12895e1b3be56959199e0b0ca0..68e896e12a67e7bf4b9b4f478e1b406220839dd7 100644 (file)
@@ -189,8 +189,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
 
                list_add_tail(&io->list, &complete);
        }
-       /* It is important to update all flags for all end_io in one shot w/o
-        * dropping the lock.*/
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        while (!list_empty(&complete)) {
                io = list_entry(complete.next, ext4_io_end_t, list);
@@ -228,9 +226,14 @@ static void ext4_end_io_work(struct work_struct *work)
        ext4_do_flush_completed_IO(io->inode, io);
 }
 
-int ext4_flush_completed_IO(struct inode *inode)
+int ext4_flush_unwritten_io(struct inode *inode)
 {
-       return ext4_do_flush_completed_IO(inode, NULL);
+       int ret;
+       WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
+                    !(inode->i_state & I_FREEING));
+       ret = ext4_do_flush_completed_IO(inode, NULL);
+       ext4_unwritten_wait(inode);
+       return ret;
 }
 
 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)