diff --git a/block/bdev.c b/block/bdev.c index bb0ffa3bb4dfb..e59052c2a081a 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -304,7 +304,12 @@ int bdev_freeze(struct block_device *bdev) mutex_lock(&bdev->bd_fsfreeze_mutex); - if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { + /* A device being removed from its filesystem refuses freezes. */ + if (!atomic_inc_unless_negative(&bdev->bd_fsfreeze_count)) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EBUSY; + } + if (atomic_read(&bdev->bd_fsfreeze_count) > 1) { mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } @@ -368,6 +373,42 @@ int bdev_thaw(struct block_device *bdev) } EXPORT_SYMBOL(bdev_thaw); +/** + * bdev_deny_freeze - make a block device unfreezable + * @bdev: block device + * + * Reserve @bdev against bdev_freeze() the way deny_write_access() reserves a + * file against writers. bd_fsfreeze_count is sign-encoded: > 0 counts active + * freezes, < 0 counts deniers, so a deny succeeds only while no freeze is in + * progress. While held, bdev_freeze() returns -EBUSY. Pair with + * bdev_allow_freeze(). + * + * A filesystem removing, adding or replacing a member device denies freezes on + * it for the duration, so a claim a freeze walk might act on is never torn down + * behind the freezer's back. The deny is device-scoped, not (device, + * superblock)-scoped: a device shared by several superblocks is refused for all + * of them. No in-tree filesystem removes a shared claim from a live superblock. + * + * Return: 0, or -EBUSY if the device is currently frozen. + */ +int bdev_deny_freeze(struct block_device *bdev) +{ + return atomic_dec_unless_positive(&bdev->bd_fsfreeze_count) ? 0 : -EBUSY; +} +EXPORT_SYMBOL_GPL(bdev_deny_freeze); + +/** + * bdev_allow_freeze - allow freezing a block device again + * @bdev: block device + * + * Undo one bdev_deny_freeze(). + */ +void bdev_allow_freeze(struct block_device *bdev) +{ + atomic_inc(&bdev->bd_fsfreeze_count); +} +EXPORT_SYMBOL_GPL(bdev_allow_freeze); + /* * pseudo-fs */ @@ -1158,18 +1199,16 @@ void bdev_release(struct file *bdev_file) } /** - * bdev_fput - yield claim to the block device and put the file + * bdev_yield_claim - give up the holder claim on an open block device * @bdev_file: open block device * - * Yield claim on the block device and put the file. Ensure that the - * block device can be reclaimed before the file is closed which is a - * deferred operation. + * Yield the holder and any write access for @bdev_file without closing it, so + * the caller can still act on the device - e.g. bdev_allow_freeze() it - before + * the final bdev_fput(). bdev_fput() yields too, so calling it afterwards is + * safe. */ -void bdev_fput(struct file *bdev_file) +void bdev_yield_claim(struct file *bdev_file) { - if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) - return; - if (bdev_file->private_data) { struct block_device *bdev = file_bdev(bdev_file); struct gendisk *disk = bdev->bd_disk; @@ -1185,7 +1224,23 @@ void bdev_fput(struct file *bdev_file) bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host); mutex_unlock(&disk->open_mutex); } +} +EXPORT_SYMBOL_GPL(bdev_yield_claim); + +/** + * bdev_fput - yield claim to the block device and put the file + * @bdev_file: open block device + * + * Yield claim on the block device and put the file. Ensure that the + * block device can be reclaimed before the file is closed which is a + * deferred operation. + */ +void bdev_fput(struct file *bdev_file) +{ + if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) + return; + bdev_yield_claim(bdev_file); fput(bdev_file); } EXPORT_SYMBOL(bdev_fput); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 8f8fa14886ded..4d6bd6b4b0394 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -247,8 +247,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->sb, &fs_holder_ops); + /* Unfreezable for the whole replace; see btrfs_dev_replace_start(). */ + bdev_file = btrfs_open_device_deny_freeze(device_path, fs_info->sb); if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev_file); @@ -325,7 +325,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - bdev_fput(bdev_file); + /* Undo the open-time freeze deny. */ + btrfs_release_device_allow_freeze(bdev_file); return ret; } @@ -622,9 +623,18 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) return ret; + /* Deny the source before mark, so every 'leave' unwinds both denied. */ + if (src_device->bdev) { + ret = bdev_deny_freeze(src_device->bdev); + if (ret) { + btrfs_destroy_dev_replace_tgtdev(tgt_device, true); + return ret; + } + } + ret = mark_block_group_to_copy(fs_info, src_device); if (ret) - return ret; + goto leave; down_write(&dev_replace->rwsem); dev_replace->replace_task = current; @@ -706,7 +716,9 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; leave: - btrfs_destroy_dev_replace_tgtdev(tgt_device); + if (src_device->bdev) + bdev_allow_freeze(src_device->bdev); + btrfs_destroy_dev_replace_tgtdev(tgt_device, true); return ret; } @@ -887,6 +899,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, */ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) { + /* Stays started/resumable; keep both denied. */ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } @@ -900,6 +913,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, while (1) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { + /* Stays started/resumable; keep both denied. */ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return PTR_ERR(trans); } @@ -952,7 +966,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_destroy_dev_replace_tgtdev(tgt_device, true); + /* The source stays a member; re-allow freezing it. */ + if (src_device->bdev) + bdev_allow_freeze(src_device->bdev); btrfs_rm_dev_replace_unblocked(fs_info); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -1018,6 +1035,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + /* The target is now a member; the source is freed (allow + release). */ + bdev_allow_freeze(tgt_device->bdev); btrfs_rm_dev_replace_free_srcdev(src_device); return 0; @@ -1146,8 +1165,9 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_dev_name(src_device), src_device->devid, btrfs_dev_name(tgt_device)); + /* A suspended replace never re-denied freezing; do not allow. */ if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_destroy_dev_replace_tgtdev(tgt_device, false); break; default: up_write(&dev_replace->rwsem); @@ -1177,6 +1197,11 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; btrfs_info(fs_info, "suspending dev_replace for unmount"); + /* Reopened freezable next mount; resume re-denies. */ + if (dev_replace->srcdev && dev_replace->srcdev->bdev) + bdev_allow_freeze(dev_replace->srcdev->bdev); + if (dev_replace->tgtdev && dev_replace->tgtdev->bdev) + bdev_allow_freeze(dev_replace->tgtdev->bdev); break; } @@ -1189,6 +1214,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) { struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret = 0; down_write(&dev_replace->rwsem); @@ -1232,8 +1258,33 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) return 0; } + /* Re-deny for the resumed replace; stay suspended if frozen now. */ + if (dev_replace->srcdev->bdev && + bdev_deny_freeze(dev_replace->srcdev->bdev)) + goto suspend; + if (bdev_deny_freeze(dev_replace->tgtdev->bdev)) { + if (dev_replace->srcdev->bdev) + bdev_allow_freeze(dev_replace->srcdev->bdev); + goto suspend; + } + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); - return PTR_ERR_OR_ZERO(task); + if (IS_ERR(task)) { + bdev_allow_freeze(dev_replace->tgtdev->bdev); + if (dev_replace->srcdev->bdev) + bdev_allow_freeze(dev_replace->srcdev->bdev); + /* Undo the deny and suspend, but still fail the mount. */ + ret = PTR_ERR(task); + goto suspend; + } + return 0; + +suspend: + btrfs_exclop_finish(fs_info); + down_write(&dev_replace->rwsem); + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); + return ret; } static int btrfs_dev_replace_kthread(void *data) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a39460bf68a77..9351c79187a7e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2579,7 +2579,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) err_drop: mnt_drop_write_file(file); if (bdev_file) - bdev_fput(bdev_file); + btrfs_release_device_allow_freeze(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2630,7 +2630,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mnt_drop_write_file(file); if (bdev_file) - bdev_fput(bdev_file); + btrfs_release_device_allow_freeze(bdev_file); out: btrfs_put_dev_args_from_path(&args); out_free: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a88e68f905646..d9f2cd37a3650 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1119,7 +1119,16 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) mutex_unlock(&uuid_mutex); } -static void btrfs_close_bdev(struct btrfs_device *device) +/* Release a device that was made unfreezable for a membership change. */ +void btrfs_release_device_allow_freeze(struct file *bdev_file) +{ + /* Yield before allow (strand-safe); file still open for the allow (UAF-safe). */ + bdev_yield_claim(bdev_file); + bdev_allow_freeze(file_bdev(bdev_file)); + bdev_fput(bdev_file); +} + +static void btrfs_close_bdev(struct btrfs_device *device, bool allow_freeze) { if (!device->bdev) return; @@ -1129,7 +1138,11 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - bdev_fput(device->bdev_file); + /* @allow_freeze undoes a replace-time deny; unmount-close was never denied. */ + if (allow_freeze) + btrfs_release_device_allow_freeze(device->bdev_file); + else + bdev_fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1150,7 +1163,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) fs_devices->missing_devices--; } - btrfs_close_bdev(device); + btrfs_close_bdev(device, false); if (device->bdev) { fs_devices->open_devices--; device->bdev = NULL; @@ -2336,6 +2349,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, fs_info->fs_devices->rw_devices == 1) return BTRFS_ERROR_DEV_ONLY_WRITABLE; + /* Removal and freezing are mutually exclusive; refuse if frozen now. */ + if (device->bdev) { + ret = bdev_deny_freeze(device->bdev); + if (ret) + return ret; + } + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_del_init(&device->dev_alloc_list); @@ -2362,6 +2382,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, device->devid, ret); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); + if (device->bdev) + bdev_allow_freeze(device->bdev); return ret; } @@ -2447,6 +2469,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, return btrfs_commit_transaction(trans); error_undo: + if (device->bdev) + bdev_allow_freeze(device->bdev); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, @@ -2491,7 +2515,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) mutex_lock(&uuid_mutex); - btrfs_close_bdev(srcdev); + /* The source was made unfreezable for the replace; undo it. */ + btrfs_close_bdev(srcdev, true); synchronize_rcu(); btrfs_free_device(srcdev); @@ -2512,7 +2537,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) mutex_unlock(&uuid_mutex); } -void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev, + bool allow_freeze) { struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; @@ -2533,7 +2559,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); - btrfs_close_bdev(tgtdev); + btrfs_close_bdev(tgtdev, allow_freeze); synchronize_rcu(); btrfs_free_device(tgtdev); } @@ -2802,6 +2828,36 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) return 0; } +/* + * Open @path for @sb with freezing denied before the holder claim is published, + * so a racing bdev_freeze() can never reach a claim a device add or replace may + * still abort. The deny is taken on a throwaway non-holder probe open, then the + * holder is opened by the probe's dev_t. Balanced by the caller. + */ +struct file *btrfs_open_device_deny_freeze(const char *path, + struct super_block *sb) +{ + struct file *probe_file, *bdev_file; + int ret; + + probe_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(probe_file)) + return probe_file; + + ret = bdev_deny_freeze(file_bdev(probe_file)); + if (ret) { + bdev_fput(probe_file); + return ERR_PTR(ret); + } + + bdev_file = bdev_file_open_by_dev(file_bdev(probe_file)->bd_dev, + BLK_OPEN_WRITE, sb, &fs_holder_ops); + if (IS_ERR(bdev_file)) + bdev_allow_freeze(file_bdev(probe_file)); + bdev_fput(probe_file); + return bdev_file; +} + int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; @@ -2820,8 +2876,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->sb, &fs_holder_ops); + /* Forbid freezing until the device is a committed member (or unwound). */ + bdev_file = btrfs_open_device_deny_freeze(device_path, fs_info->sb); if (IS_ERR(bdev_file)) return PTR_ERR(bdev_file); @@ -2986,8 +3042,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path up_write(&sb->s_umount); locked = false; - if (ret) /* transaction commit */ + if (ret) { /* transaction commit */ + bdev_allow_freeze(file_bdev(bdev_file)); return ret; + } ret = btrfs_relocate_sys_chunks(fs_info); if (ret < 0) @@ -2995,8 +3053,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { - if (PTR_ERR(trans) == -ENOENT) + if (PTR_ERR(trans) == -ENOENT) { + bdev_allow_freeze(file_bdev(bdev_file)); return 0; + } ret = PTR_ERR(trans); trans = NULL; goto error_sysfs; @@ -3016,6 +3076,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); + bdev_allow_freeze(file_bdev(bdev_file)); return ret; error_sysfs: @@ -3045,7 +3106,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path error_free_device: btrfs_free_device(device); error: - bdev_fput(bdev_file); + btrfs_release_device_allow_freeze(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 0082c166af91f..65de9504d887e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -744,6 +744,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_release_device_allow_freeze(struct file *bdev_file); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); @@ -768,6 +769,8 @@ struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices const struct btrfs_dev_lookup_args *args); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); +struct file *btrfs_open_device_deny_freeze(const char *path, + struct super_block *sb); int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); @@ -787,7 +790,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); -void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev, + bool allow_freeze); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8808ee76e73c0..5a725a0cd35f4 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -66,7 +66,7 @@ struct block_device { int bd_holders; struct kobject *bd_holder_dir; - atomic_t bd_fsfreeze_count; /* number of freeze requests */ + atomic_t bd_fsfreeze_count; /* >0 freeze requests, <0 freeze deniers */ struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */ struct partition_meta_info *bd_meta_info; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 890128cdea1ce..9fc16e3c8075e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1829,7 +1829,10 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) int bdev_freeze(struct block_device *bdev); int bdev_thaw(struct block_device *bdev); +int bdev_deny_freeze(struct block_device *bdev); +void bdev_allow_freeze(struct block_device *bdev); void bdev_fput(struct file *bdev_file); +void bdev_yield_claim(struct file *bdev_file); struct io_comp_batch { struct rq_list req_list;