Mutual exclusion of device add/rm and balance was done by the volume
mutex up to version 3.7. The commit
5ac00addc7ac091109 ("Btrfs: disallow
mutually exclusive admin operations from user mode") added a bit that
essentially tracked the same information.
The status bit has an advantage over a mutex that it can be set without
restrictions of function context, so it started to be used in the
mount-time resuming of balance or device replace.
But we don't really need to track the same information in two ways.
1) After the previous cleanups, the main ioctl handlers for
add/del/resize copy the EXCL_OP bit next to the volume mutex, here
it's clearly safe.
2) Resuming balance during mount or after rw remount will set only the
EXCL_OP bit and the volume_mutex is held in the kernel thread that
calls btrfs_balance.
3) Resuming device replace during mount or after rw remount is done
after balance and is excluded by the EXCL_OP bit. It does not take
the volume_mutex at all and completely relies on the EXCL_OP bit.
4) The resuming of balance and dev-replace cannot hapen at the same time
as the ioctls cannot be started in parallel. Nevertheless, a crafted
image could trigger that and a warning is printed.
5) Balance is normally excluded by EXCL_OP and also uses own mutex to
protect against concurrent access to its status data. There's some
trickery to maintain the right lock nesting in case we need to
reexamine the status in btrfs_ioctl_balance. The volume_mutex is
removed and the unlock/lock sequence is left in place as we might
expect other waiters to proceed.
6) Similar to 5, the unlock/lock sequence is kept in
btrfs_cancel_balance to allow waiters to continue.
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
- struct mutex volume_mutex;
/*
* this is taken to make sure we don't set block groups ro after
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
- mutex_init(&fs_info->volume_mutex);
mutex_init(&fs_info->ro_block_group_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
* returns target flags in extended format or 0 if restripe for this
* chunk_type is not in progress
*
- * should be called with either volume_mutex or balance_lock held
+ * should be called with balance_lock held
*/
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
- mutex_lock(&fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
out_free:
kfree(vol_args);
out:
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
mnt_drop_write_file(file);
return ret;
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- mutex_lock(&fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
kfree(vol_args);
out:
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return ret;
}
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
- mutex_lock(&fs_info->volume_mutex);
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
if (!ret) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
- mutex_lock(&fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out_drop_write:
mnt_drop_write_file(file);
again:
if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
/* this is either (2) or (3) */
if (!atomic_read(&fs_info->balance_running)) {
mutex_unlock(&fs_info->balance_mutex);
- if (!mutex_trylock(&fs_info->volume_mutex))
- goto again;
+ /*
+ * Lock released to allow other waiters to continue,
+ * we'll reexamine the status again.
+ */
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl &&
}
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
goto again;
} else {
/* this is (2) */
kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
if (need_unlock)
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out:
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
- * volume_mutex
- * ------------
- * coarse lock owned by a mounted filesystem; used to exclude some operations
- * that cannot run in parallel and affect the higher-level properties of the
- * filesystem like: device add/deleting/resize/replace, or balance
- *
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
}
/*
- * Should be called with both balance and volume mutexes held to
- * serialize other volume operations (add_dev/rm_dev/resize) with
- * restriper. Same goes for reset_balance_state.
+ * Should be called with balance mutex held to protect against checking the
+ * balance status or progress. Same goes for reset_balance_state.
*/
static void set_balance_control(struct btrfs_balance_control *bctl)
{
}
/*
- * Should be called with both balance and volume mutexes held
+ * Should be called with balance mutexe held
*/
int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
-
if (fs_info->balance_ctl) {
btrfs_info(fs_info, "continuing balance");
ret = btrfs_balance(fs_info->balance_ctl, NULL);
}
-
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
return ret;
}
btrfs_warn(fs_info,
"cannot set exclusive op status to balance, resume manually");
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
-
set_balance_control(bctl);
-
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
out:
btrfs_free_path(path);
return ret;
atomic_read(&fs_info->balance_running) == 0);
mutex_lock(&fs_info->balance_mutex);
} else {
- /* reset_balance_state needs volume_mutex */
mutex_unlock(&fs_info->balance_mutex);
- mutex_lock(&fs_info->volume_mutex);
+ /*
+ * Lock released to allow other waiters to continue, we'll
+ * reexamine the status again.
+ */
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
-
- mutex_unlock(&fs_info->volume_mutex);
}
BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));