From 28287d9aeb17b5fccbf14af6e0a312add0c75a8c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Jun 2026 15:18:04 +0200 Subject: [PATCH 1/6] btrfs: destroy the target device when mark_block_group_to_copy() fails btrfs_dev_replace_start() opens the replacement target with btrfs_init_dev_replace_tgtdev(), which adds it to the device list and opens its block device. Every error path after that point reaches the 'leave' label to tear the target back down with btrfs_destroy_dev_replace_tgtdev() - except the mark_block_group_to_copy() failure, which returns directly. The target is then leaked: it stays on the device list with its block device held until the filesystem is unmounted. Goto leave like the other post-open error paths so the target is destroyed. Fixes: 78ce9fc269af ("btrfs: zoned: mark block groups to copy for device-replace") Signed-off-by: Christian Brauner (Amutable) Reviewed-by: Johannes Thumshirn --- fs/btrfs/dev-replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 8f8fa14886ded..0112aa6d7ab1c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -624,7 +624,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, ret = mark_block_group_to_copy(fs_info, src_device); if (ret) - return ret; + goto leave; down_write(&dev_replace->rwsem); dev_replace->replace_task = current; From ae27b7fd650f12149afd3a01926415464962852b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Jun 2026 15:18:05 +0200 Subject: [PATCH 2/6] block: allow making a block device unfreezable Add bdev_deny_freeze() and bdev_allow_freeze(), modeled on deny_write_access()/allow_write_access(). bd_fsfreeze_count becomes a signed counter: > 0 counts active freezes, < 0 counts deniers, and the two regimes are mutually exclusive. bdev_freeze() refuses with -EBUSY while a deny is held, and bdev_deny_freeze() refuses while the device is frozen. 
A filesystem that mutates a device's membership (a btrfs device add, remove or replace) denies freezing on the device for the duration, so a claim a freeze walk might act on is never added or torn down behind the freezer's back. The deny/allow helpers are a single atomic on bd_fsfreeze_count and take no lock, so they can be called while holding s_umount without inverting against bdev_freeze()'s bd_fsfreeze_mutex -> s_umount order. Signed-off-by: Christian Brauner (Amutable) Reviewed-by: Jan Kara --- block/bdev.c | 43 ++++++++++++++++++++++++++++++++++++++- include/linux/blk_types.h | 2 +- include/linux/blkdev.h | 2 ++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index bb0ffa3bb4dfb..939dec3517724 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -304,7 +304,12 @@ int bdev_freeze(struct block_device *bdev) mutex_lock(&bdev->bd_fsfreeze_mutex); - if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { + /* A device being removed from its filesystem refuses freezes. */ + if (!atomic_inc_unless_negative(&bdev->bd_fsfreeze_count)) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EBUSY; + } + if (atomic_read(&bdev->bd_fsfreeze_count) > 1) { mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } @@ -368,6 +373,42 @@ int bdev_thaw(struct block_device *bdev) } EXPORT_SYMBOL(bdev_thaw); +/** + * bdev_deny_freeze - make a block device unfreezable + * @bdev: block device + * + * Reserve @bdev against bdev_freeze() the way deny_write_access() reserves a + * file against writers. bd_fsfreeze_count is sign-encoded: > 0 counts active + * freezes, < 0 counts deniers, so a deny succeeds only while no freeze is in + * progress. While held, bdev_freeze() returns -EBUSY. Pair with + * bdev_allow_freeze(). + * + * A filesystem removing, adding or replacing a member device denies freezes on + * it for the duration, so a claim a freeze walk might act on is never torn down + * behind the freezer's back. 
The deny is device-scoped, not (device, + * superblock)-scoped: a device shared by several superblocks is refused for all + * of them. No in-tree filesystem removes a shared claim from a live superblock. + * + * Return: 0, or -EBUSY if the device is currently frozen. + */ +int bdev_deny_freeze(struct block_device *bdev) +{ + return atomic_dec_unless_positive(&bdev->bd_fsfreeze_count) ? 0 : -EBUSY; +} +EXPORT_SYMBOL_GPL(bdev_deny_freeze); + +/** + * bdev_allow_freeze - allow freezing a block device again + * @bdev: block device + * + * Undo one bdev_deny_freeze(). + */ +void bdev_allow_freeze(struct block_device *bdev) +{ + atomic_inc(&bdev->bd_fsfreeze_count); +} +EXPORT_SYMBOL_GPL(bdev_allow_freeze); + /* * pseudo-fs */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8808ee76e73c0..5a725a0cd35f4 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -66,7 +66,7 @@ struct block_device { int bd_holders; struct kobject *bd_holder_dir; - atomic_t bd_fsfreeze_count; /* number of freeze requests */ + atomic_t bd_fsfreeze_count; /* >0 freeze requests, <0 freeze deniers */ struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */ struct partition_meta_info *bd_meta_info; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 890128cdea1ce..cf1951caadb2f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1829,6 +1829,8 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) int bdev_freeze(struct block_device *bdev); int bdev_thaw(struct block_device *bdev); +int bdev_deny_freeze(struct block_device *bdev); +void bdev_allow_freeze(struct block_device *bdev); void bdev_fput(struct file *bdev_file); struct io_comp_batch { From 9f0a90b957cfe6351aa1bad7ce10f5e9961789b5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Jun 2026 15:18:06 +0200 Subject: [PATCH 3/6] block: split bdev_yield_claim() out of bdev_fput() bdev_fput() yields the holder claim and then closes the file, 
which is a deferred operation. Split the yield half into bdev_yield_claim() so a caller can give up the holder while the file - and therefore the block device - is still open, act on the device, and only then bdev_fput(). A filesystem that made a device unfreezable for a membership change with bdev_deny_freeze() undoes the deny on release with bdev_yield_claim(bdev_file); bdev_allow_freeze(file_bdev(bdev_file)); bdev_fput(bdev_file); Re-allowing only after the holder is yielded avoids stranding the filesystem on a racing freeze, and doing it while the file is still open avoids touching the block device after bdev_fput(). bdev_fput() yields again, which is a no-op once the claim has already been given up. Signed-off-by: Christian Brauner (Amutable) --- block/bdev.c | 30 ++++++++++++++++++++++-------- include/linux/blkdev.h | 1 + 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 939dec3517724..e59052c2a081a 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1199,18 +1199,16 @@ void bdev_release(struct file *bdev_file) } /** - * bdev_fput - yield claim to the block device and put the file + * bdev_yield_claim - give up the holder claim on an open block device * @bdev_file: open block device * - * Yield claim on the block device and put the file. Ensure that the - * block device can be reclaimed before the file is closed which is a - * deferred operation. + * Yield the holder and any write access for @bdev_file without closing it, so + * the caller can still act on the device - e.g. bdev_allow_freeze() it - before + * the final bdev_fput(). bdev_fput() yields too, so calling it afterwards is + * safe. 
*/ -void bdev_fput(struct file *bdev_file) +void bdev_yield_claim(struct file *bdev_file) { - if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) - return; - if (bdev_file->private_data) { struct block_device *bdev = file_bdev(bdev_file); struct gendisk *disk = bdev->bd_disk; @@ -1226,7 +1224,23 @@ void bdev_fput(struct file *bdev_file) bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host); mutex_unlock(&disk->open_mutex); } +} +EXPORT_SYMBOL_GPL(bdev_yield_claim); + +/** + * bdev_fput - yield claim to the block device and put the file + * @bdev_file: open block device + * + * Yield claim on the block device and put the file. Ensure that the + * block device can be reclaimed before the file is closed which is a + * deferred operation. + */ +void bdev_fput(struct file *bdev_file) +{ + if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) + return; + bdev_yield_claim(bdev_file); fput(bdev_file); } EXPORT_SYMBOL(bdev_fput); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cf1951caadb2f..9fc16e3c8075e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1832,6 +1832,7 @@ int bdev_thaw(struct block_device *bdev); int bdev_deny_freeze(struct block_device *bdev); void bdev_allow_freeze(struct block_device *bdev); void bdev_fput(struct file *bdev_file); +void bdev_yield_claim(struct file *bdev_file); struct io_comp_batch { struct rq_list req_list; From 0f867ac5b5241c44f49523e6b77689c501f30271 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Jun 2026 15:18:07 +0200 Subject: [PATCH 4/6] btrfs: deny freezing a device while it is being removed btrfs_rm_device() runs under mnt_want_write_file(), but the claim on the removed device is released by the ioctl after mnt_drop_write_file(), so a bdev_freeze() racing that window could freeze the filesystem through the device just as its claim is torn down, leaving nothing for bdev_thaw() to rebalance. The window cannot be closed by reordering the teardown. 
btrfs_rm_device() hands the final bdev_fput() back to the ioctl, which runs it only after mnt_drop_write_file(), because bdev_release() takes the disk ->open_mutex and its dependency chain, which must not nest under the superblock's freeze/write protection -- freeze_super() drops s_umount before draining writers precisely to keep sb_start_write ordered above s_umount. Holding mnt_want_write across bdev_fput() would reintroduce that inversion, so the holder teardown is forced outside the write-protected section. A freeze landing in the resulting gap still finds the live holder, freezes the filesystem through it, and is stranded once the claim is released, leaving nothing for bdev_thaw() to rebalance; no ordering of the close against the drop removes the gap. The device itself therefore has to refuse freezing for the whole removal. Deny freezing the device for the duration of the removal: bdev_deny_freeze() at the start of btrfs_rm_device() (it cannot be frozen yet, the ioctl holds the write count), and release it through btrfs_release_device_allow_freeze() in the ioctls on success, or bdev_allow_freeze() on the error paths that keep the device a member. A device frozen before the removal begins is refused with -EBUSY. btrfs_release_device_allow_freeze() yields the holder, re-allows freezing, then closes the device, so the re-allow neither strands the filesystem on a racing freeze nor touches the block device after the final fput. 
Signed-off-by: Christian Brauner (Amutable) --- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/volumes.c | 20 ++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a39460bf68a77..9351c79187a7e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2579,7 +2579,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) err_drop: mnt_drop_write_file(file); if (bdev_file) - bdev_fput(bdev_file); + btrfs_release_device_allow_freeze(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2630,7 +2630,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mnt_drop_write_file(file); if (bdev_file) - bdev_fput(bdev_file); + btrfs_release_device_allow_freeze(bdev_file); out: btrfs_put_dev_args_from_path(&args); out_free: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a88e68f905646..36f9835f65e3d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1119,6 +1119,15 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) mutex_unlock(&uuid_mutex); } +/* Release a device that was made unfreezable for a membership change. */ +void btrfs_release_device_allow_freeze(struct file *bdev_file) +{ + /* Yield before allow (strand-safe); file still open for the allow (UAF-safe). */ + bdev_yield_claim(bdev_file); + bdev_allow_freeze(file_bdev(bdev_file)); + bdev_fput(bdev_file); +} + static void btrfs_close_bdev(struct btrfs_device *device) { if (!device->bdev) @@ -2336,6 +2345,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, fs_info->fs_devices->rw_devices == 1) return BTRFS_ERROR_DEV_ONLY_WRITABLE; + /* Removal and freezing are mutually exclusive; refuse if frozen now. 
*/ + if (device->bdev) { + ret = bdev_deny_freeze(device->bdev); + if (ret) + return ret; + } + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_del_init(&device->dev_alloc_list); @@ -2362,6 +2378,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, device->devid, ret); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); + if (device->bdev) + bdev_allow_freeze(device->bdev); return ret; } @@ -2447,6 +2465,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, return btrfs_commit_transaction(trans); error_undo: + if (device->bdev) + bdev_allow_freeze(device->bdev); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 0082c166af91f..60e82c15881a4 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -744,6 +744,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_release_device_allow_freeze(struct file *bdev_file); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); From bb39893f11b2ec2859ba00ff0f945d44fba95bc9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Jun 2026 15:18:08 +0200 Subject: [PATCH 5/6] btrfs: deny freezing a device while it is being added btrfs_init_new_device() opens and claims the new device on a live superblock without holding the write count, so a bdev_freeze() racing the window between the claim being published and the device becoming a member could freeze the filesystem through a claim the add may still abort and tear down. 
Add btrfs_open_device_deny_freeze(): it opens the device once non-exclusively to take the freeze deny, then claims it by the same dev_t, so the holder is only ever published while the device is already unfreezable. Keep it denied until the add is durable: bdev_allow_freeze() on each success return (the device is now a committed member), btrfs_release_device_allow_freeze() on the error unwind. The deny spans the whole add, including the seeding tail whose late failures still release the device. A device already frozen when the add starts is refused with -EBUSY. Signed-off-by: Christian Brauner (Amutable) --- fs/btrfs/volumes.c | 45 ++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/volumes.h | 2 ++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 36f9835f65e3d..4558e018b53b7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2822,6 +2822,36 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) return 0; } +/* + * Open @path for @sb with freezing denied before the holder claim is published, + * so a racing bdev_freeze() can never reach a claim a device add or replace may + * still abort. The deny is taken on a throwaway non-holder probe open, then the + * holder is opened by the probe's dev_t. Balanced by the caller. 
+ */ +struct file *btrfs_open_device_deny_freeze(const char *path, + struct super_block *sb) +{ + struct file *probe_file, *bdev_file; + int ret; + + probe_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(probe_file)) + return probe_file; + + ret = bdev_deny_freeze(file_bdev(probe_file)); + if (ret) { + bdev_fput(probe_file); + return ERR_PTR(ret); + } + + bdev_file = bdev_file_open_by_dev(file_bdev(probe_file)->bd_dev, + BLK_OPEN_WRITE, sb, &fs_holder_ops); + if (IS_ERR(bdev_file)) + bdev_allow_freeze(file_bdev(probe_file)); + bdev_fput(probe_file); + return bdev_file; +} + int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; @@ -2840,8 +2870,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->sb, &fs_holder_ops); + /* Forbid freezing until the device is a committed member (or unwound). */ + bdev_file = btrfs_open_device_deny_freeze(device_path, fs_info->sb); if (IS_ERR(bdev_file)) return PTR_ERR(bdev_file); @@ -3006,8 +3036,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path up_write(&sb->s_umount); locked = false; - if (ret) /* transaction commit */ + if (ret) { /* transaction commit */ + bdev_allow_freeze(file_bdev(bdev_file)); return ret; + } ret = btrfs_relocate_sys_chunks(fs_info); if (ret < 0) @@ -3015,8 +3047,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path "Failed to relocate sys chunks after device initialization. 
This can be fixed using the \"btrfs balance\" command."); trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { - if (PTR_ERR(trans) == -ENOENT) + if (PTR_ERR(trans) == -ENOENT) { + bdev_allow_freeze(file_bdev(bdev_file)); return 0; + } ret = PTR_ERR(trans); trans = NULL; goto error_sysfs; @@ -3036,6 +3070,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); + bdev_allow_freeze(file_bdev(bdev_file)); return ret; error_sysfs: @@ -3065,7 +3100,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path error_free_device: btrfs_free_device(device); error: - bdev_fput(bdev_file); + btrfs_release_device_allow_freeze(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 60e82c15881a4..75c7963f5d4cc 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -769,6 +769,8 @@ struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices const struct btrfs_dev_lookup_args *args); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); +struct file *btrfs_open_device_deny_freeze(const char *path, + struct super_block *sb); int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); From fdf18d2bfb6646854ac8fb302921235c2184d5ab Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Jun 2026 15:18:09 +0200 Subject: [PATCH 6/6] btrfs: deny freezing devices undergoing a replace A device replace opens a target and, on success, frees the source on a live filesystem from btrfs_dev_replace_finishing() - which can no longer fail once it has swapped the devices and also runs from a kthread on mount resume. 
A bdev_freeze() racing the source free or the target swap-in would freeze the filesystem through a claim that is being torn down or replaced, leaving nothing for bdev_thaw() to rebalance. Make both devices unfreezable for the whole replace, with the invariant that a STARTED replace holds one deny on each device and any other state holds none. The target is denied at open (btrfs_open_device_deny_freeze(), undone on btrfs_init_dev_replace_tgtdev()'s error unwind); the source is denied in btrfs_dev_replace_start() once the target is open and before mark_block_group_to_copy(), so every 'leave' unwind sees both denied. The deny tracks the STARTED state and is dropped whenever the replace leaves it: btrfs_dev_replace_finishing() re-allows the target it makes a member and frees the source through btrfs_close_bdev(allow_freeze=true), and its scrub-error path re-allows both as it cancels. Its early failures (before the device swap) keep the replace STARTED and resumable, so both stay denied. Suspending for unmount re-allows both, so they are reopened freezable at the next mount where btrfs_resume_dev_replace_async() re-denies them (staying suspended if a device is frozen right then); a replace cancelled from the suspended state therefore destroys the target without allowing. btrfs_close_bdev() and btrfs_destroy_dev_replace_tgtdev() take an allow_freeze argument to carry this distinction; the unmount path (btrfs_close_one_device()) passes false. On resume, a failed kthread_run() re-allows both devices and goes through the suspend path, resetting the replace to SUSPENDED and finishing the exclusive operation instead of returning straight away. The (re)mount still aborts on that error; routing it through suspend keeps the deny balanced against the unmount teardown and additionally drops BTRFS_EXCLOP_DEV_REPLACE. That closes a pre-existing leak: it was harmless when the failed mount frees the fs anyway, but after a failed remount-rw it would have wedged all future exclusive operations. 
Signed-off-by: Christian Brauner (Amutable) --- fs/btrfs/dev-replace.c | 65 +++++++++++++++++++++++++++++++++++++----- fs/btrfs/volumes.c | 18 ++++++++---- fs/btrfs/volumes.h | 3 +- 3 files changed, 72 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 0112aa6d7ab1c..4d6bd6b4b0394 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -247,8 +247,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->sb, &fs_holder_ops); + /* Unfreezable for the whole replace; see btrfs_dev_replace_start(). */ + bdev_file = btrfs_open_device_deny_freeze(device_path, fs_info->sb); if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev_file); @@ -325,7 +325,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - bdev_fput(bdev_file); + /* Undo the open-time freeze deny. */ + btrfs_release_device_allow_freeze(bdev_file); return ret; } @@ -622,6 +623,15 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) return ret; + /* Deny the source before mark, so every 'leave' unwinds both denied. 
*/ + if (src_device->bdev) { + ret = bdev_deny_freeze(src_device->bdev); + if (ret) { + btrfs_destroy_dev_replace_tgtdev(tgt_device, true); + return ret; + } + } + ret = mark_block_group_to_copy(fs_info, src_device); if (ret) goto leave; @@ -706,7 +716,9 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; leave: - btrfs_destroy_dev_replace_tgtdev(tgt_device); + if (src_device->bdev) + bdev_allow_freeze(src_device->bdev); + btrfs_destroy_dev_replace_tgtdev(tgt_device, true); return ret; } @@ -887,6 +899,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, */ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) { + /* Stays started/resumable; keep both denied. */ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } @@ -900,6 +913,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, while (1) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { + /* Stays started/resumable; keep both denied. */ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return PTR_ERR(trans); } @@ -952,7 +966,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_destroy_dev_replace_tgtdev(tgt_device, true); + /* The source stays a member; re-allow freezing it. */ + if (src_device->bdev) + bdev_allow_freeze(src_device->bdev); btrfs_rm_dev_replace_unblocked(fs_info); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -1018,6 +1035,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + /* The target is now a member; the source is freed (allow + release). 
*/ + bdev_allow_freeze(tgt_device->bdev); btrfs_rm_dev_replace_free_srcdev(src_device); return 0; @@ -1146,8 +1165,9 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_dev_name(src_device), src_device->devid, btrfs_dev_name(tgt_device)); + /* A suspended replace never re-denied freezing; do not allow. */ if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_destroy_dev_replace_tgtdev(tgt_device, false); break; default: up_write(&dev_replace->rwsem); @@ -1177,6 +1197,11 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; btrfs_info(fs_info, "suspending dev_replace for unmount"); + /* Reopened freezable next mount; resume re-denies. */ + if (dev_replace->srcdev && dev_replace->srcdev->bdev) + bdev_allow_freeze(dev_replace->srcdev->bdev); + if (dev_replace->tgtdev && dev_replace->tgtdev->bdev) + bdev_allow_freeze(dev_replace->tgtdev->bdev); break; } @@ -1189,6 +1214,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) { struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret = 0; down_write(&dev_replace->rwsem); @@ -1232,8 +1258,33 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) return 0; } + /* Re-deny for the resumed replace; stay suspended if frozen now. */ + if (dev_replace->srcdev->bdev && + bdev_deny_freeze(dev_replace->srcdev->bdev)) + goto suspend; + if (bdev_deny_freeze(dev_replace->tgtdev->bdev)) { + if (dev_replace->srcdev->bdev) + bdev_allow_freeze(dev_replace->srcdev->bdev); + goto suspend; + } + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); - return PTR_ERR_OR_ZERO(task); + if (IS_ERR(task)) { + bdev_allow_freeze(dev_replace->tgtdev->bdev); + if (dev_replace->srcdev->bdev) + bdev_allow_freeze(dev_replace->srcdev->bdev); + /* Undo the deny and suspend, but still fail the mount. 
*/ + ret = PTR_ERR(task); + goto suspend; + } + return 0; + +suspend: + btrfs_exclop_finish(fs_info); + down_write(&dev_replace->rwsem); + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); + return ret; } static int btrfs_dev_replace_kthread(void *data) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4558e018b53b7..d9f2cd37a3650 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1128,7 +1128,7 @@ void btrfs_release_device_allow_freeze(struct file *bdev_file) bdev_fput(bdev_file); } -static void btrfs_close_bdev(struct btrfs_device *device) +static void btrfs_close_bdev(struct btrfs_device *device, bool allow_freeze) { if (!device->bdev) return; @@ -1138,7 +1138,11 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - bdev_fput(device->bdev_file); + /* @allow_freeze undoes a replace-time deny; unmount-close was never denied. */ + if (allow_freeze) + btrfs_release_device_allow_freeze(device->bdev_file); + else + bdev_fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1159,7 +1163,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) fs_devices->missing_devices--; } - btrfs_close_bdev(device); + btrfs_close_bdev(device, false); if (device->bdev) { fs_devices->open_devices--; device->bdev = NULL; @@ -2511,7 +2515,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) mutex_lock(&uuid_mutex); - btrfs_close_bdev(srcdev); + /* The source was made unfreezable for the replace; undo it. 
*/ + btrfs_close_bdev(srcdev, true); synchronize_rcu(); btrfs_free_device(srcdev); @@ -2532,7 +2537,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) mutex_unlock(&uuid_mutex); } -void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev, + bool allow_freeze) { struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; @@ -2553,7 +2559,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); - btrfs_close_bdev(tgtdev); + btrfs_close_bdev(tgtdev, allow_freeze); synchronize_rcu(); btrfs_free_device(tgtdev); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 75c7963f5d4cc..65de9504d887e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -790,7 +790,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); -void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev, + bool allow_freeze); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);