From f598a6df9c849f5e0050c5603a976b0f7c7c83d3 Mon Sep 17 00:00:00 2001
From: Zheng Qixing
Date: Tue, 13 Jan 2026 14:10:33 +0800
Subject: [PATCH 1/3] blk-cgroup: fix race between policy activation and blkg
 destruction

When switching an IO scheduler on a block device,
blkcg_activate_policy() allocates blkg_policy_data (pd) for all blkgs
attached to the queue. However, blkcg_activate_policy() may race with
concurrent blkcg deletion, leading to use-after-free and memory leak
issues.

The use-after-free occurs in the following race:

T1 (blkcg_activate_policy):
- Successfully allocates pd for blkg1 (loop0->queue, blkcgA)
- Fails to allocate pd for blkg2 (loop0->queue, blkcgB)
- Enters the enomem rollback path to release blkg1 resources

T2 (blkcg deletion):
- blkcgA is deleted concurrently
- blkg1 is freed via blkg_free_workfn()
- blkg1->pd is freed

T1 (continued):
- Rollback path accesses blkg1->pd->online after pd is freed
- Triggers use-after-free

In addition, blkg_free_workfn() frees pd before removing the blkg from
q->blkg_list. This allows blkcg_activate_policy() to allocate a new pd
for a blkg that is being destroyed, leaving the newly allocated pd
unreachable when the blkg is finally freed.

Fix these races by extending blkcg_mutex coverage to serialize
blkcg_activate_policy() rollback and blkg destruction, ensuring pd
lifecycle is synchronized with blkg list visibility.
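
As a condensed view of the resulting ordering (a sketch drawn from the
two hunks below, not the full function):

	if (queue_is_mq(q))
		memflags = blk_mq_freeze_queue(q);

	/* serialize pd allocation/rollback against blkg_free_workfn() */
	mutex_lock(&q->blkcg_mutex);
retry:
	spin_lock_irq(&q->queue_lock);
	/* allocate pd for each blkg; roll back everything on -ENOMEM */
	...
out:
	mutex_unlock(&q->blkcg_mutex);
	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q, memflags);

Since blkg_free_workfn() also runs under q->blkcg_mutex, the rollback
loop can no longer observe a blkg whose pd has been freed concurrently.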

Link: https://lore.kernel.org/all/20260108014416.3656493-3-zhengqixing@huaweicloud.com/
Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Signed-off-by: Zheng Qixing
---
 block/blk-cgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 3cffb68ba5d87..600f8c5843eaf 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1596,6 +1596,8 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 
 	if (queue_is_mq(q))
 		memflags = blk_mq_freeze_queue(q);
+
+	mutex_lock(&q->blkcg_mutex);
 retry:
 	spin_lock_irq(&q->queue_lock);
 
@@ -1658,6 +1660,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 
 	spin_unlock_irq(&q->queue_lock);
 out:
+	mutex_unlock(&q->blkcg_mutex);
 	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q, memflags);
 	if (pinned_blkg)

From 88d08fe00392e808f2335e88d094fbd6f97c4e0f Mon Sep 17 00:00:00 2001
From: Zheng Qixing
Date: Tue, 13 Jan 2026 14:10:34 +0800
Subject: [PATCH 2/3] blk-cgroup: skip dying blkg in blkcg_activate_policy()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When switching IO schedulers on a block device, blkcg_activate_policy()
can race with concurrent blkcg deletion, leading to a use-after-free in
rcu_accelerate_cbs.

T1:                                T2:
                                   blkg_destroy
                                     kill(&blkg->refcnt)
                                     // blkg->refcnt=1->0
                                     blkg_release
                                     // call_rcu(__blkg_release)
                                   ...
                                   blkg_free_workfn
                                     ->pd_free_fn(pd)
elv_iosched_store
  elevator_switch
  ...
    iterate blkg list
      blkg_get(blkg)
      // blkg->refcnt=0->1
                                     list_del_init(&blkg->q_node)
      blkg_put(pinned_blkg)
      // blkg->refcnt=1->0
      blkg_release
      // call_rcu again
      rcu_accelerate_cbs // uaf

Fix this by replacing blkg_get() with blkg_tryget(), which fails if the
blkg's refcount has already reached zero. If blkg_tryget() fails, skip
processing this blkg since it's already being destroyed.
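
For reference, the two refcount helpers involved are defined in
block/blk-cgroup.h roughly as follows (existing code, quoted here as a
sketch for context, not part of this patch):

	static inline void blkg_get(struct blkcg_gq *blkg)
	{
		percpu_ref_get(&blkg->refcnt);
	}

	/* fails once the refcount has already dropped to zero */
	static inline bool blkg_tryget(struct blkcg_gq *blkg)
	{
		return blkg && percpu_ref_tryget(&blkg->refcnt);
	}

Unlike percpu_ref_get(), percpu_ref_tryget() never resurrects a ref
that has reached zero, so the second blkg_release()/call_rcu() in the
trace above can no longer occur.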

Link: https://lore.kernel.org/all/20260108014416.3656493-4-zhengqixing@huaweicloud.com/
Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Signed-off-by: Zheng Qixing
Reviewed-by: Christoph Hellwig
Reviewed-by: Michal Koutný
---
 block/blk-cgroup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 600f8c5843eaf..5dbc107eec538 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1622,9 +1622,10 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 			 * GFP_NOWAIT failed. Free the existing one and
 			 * prealloc for @blkg w/ GFP_KERNEL.
 			 */
+			if (!blkg_tryget(blkg))
+				continue;
 			if (pinned_blkg)
 				blkg_put(pinned_blkg);
-			blkg_get(blkg);
 			pinned_blkg = blkg;
 
 			spin_unlock_irq(&q->queue_lock);

From 51d09b6f2ab5e2aa609d41546971166e87983301 Mon Sep 17 00:00:00 2001
From: Zheng Qixing
Date: Tue, 13 Jan 2026 14:10:35 +0800
Subject: [PATCH 3/3] blk-cgroup: factor policy pd teardown loop into helper

Move the teardown sequence which offlines and frees per-policy
blkg_policy_data (pd) into a helper for readability.

No functional change intended.

Signed-off-by: Zheng Qixing
Reviewed-by: Christoph Hellwig
Reviewed-by: Yu Kuai
---
 block/blk-cgroup.c | 58 +++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 31 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5dbc107eec538..78227ab0c1d74 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1559,6 +1559,31 @@ struct cgroup_subsys io_cgrp_subsys = {
 };
 EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
+/*
+ * Tear down per-blkg policy data for @pol on @q.
+ */
+static void blkcg_policy_teardown_pds(struct request_queue *q,
+				      const struct blkcg_policy *pol)
+{
+	struct blkcg_gq *blkg;
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		struct blkcg *blkcg = blkg->blkcg;
+		struct blkg_policy_data *pd;
+
+		spin_lock(&blkcg->lock);
+		pd = blkg->pd[pol->plid];
+		if (pd) {
+			if (pd->online && pol->pd_offline_fn)
+				pol->pd_offline_fn(pd);
+			pd->online = false;
+			pol->pd_free_fn(pd);
+			blkg->pd[pol->plid] = NULL;
+		}
+		spin_unlock(&blkcg->lock);
+	}
+}
+
 /**
  * blkcg_activate_policy - activate a blkcg policy on a gendisk
  * @disk: gendisk of interest
@@ -1673,21 +1698,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 enomem:
 	/* alloc failed, take down everything */
 	spin_lock_irq(&q->queue_lock);
-	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		struct blkcg *blkcg = blkg->blkcg;
-		struct blkg_policy_data *pd;
-
-		spin_lock(&blkcg->lock);
-		pd = blkg->pd[pol->plid];
-		if (pd) {
-			if (pd->online && pol->pd_offline_fn)
-				pol->pd_offline_fn(pd);
-			pd->online = false;
-			pol->pd_free_fn(pd);
-			blkg->pd[pol->plid] = NULL;
-		}
-		spin_unlock(&blkcg->lock);
-	}
+	blkcg_policy_teardown_pds(q, pol);
 	spin_unlock_irq(&q->queue_lock);
 	ret = -ENOMEM;
 	goto out;
@@ -1706,7 +1717,6 @@ void blkcg_deactivate_policy(struct gendisk *disk,
 			     const struct blkcg_policy *pol)
 {
 	struct request_queue *q = disk->queue;
-	struct blkcg_gq *blkg;
 	unsigned int memflags;
 
 	if (!blkcg_policy_enabled(q, pol))
@@ -1717,22 +1727,8 @@ void blkcg_deactivate_policy(struct gendisk *disk,
 	mutex_lock(&q->blkcg_mutex);
 	spin_lock_irq(&q->queue_lock);
-
 	__clear_bit(pol->plid, q->blkcg_pols);
-
-	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		struct blkcg *blkcg = blkg->blkcg;
-
-		spin_lock(&blkcg->lock);
-		if (blkg->pd[pol->plid]) {
-			if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
-				pol->pd_offline_fn(blkg->pd[pol->plid]);
-			pol->pd_free_fn(blkg->pd[pol->plid]);
-			blkg->pd[pol->plid] = NULL;
-		}
-		spin_unlock(&blkcg->lock);
-	}
-
+	blkcg_policy_teardown_pds(q, pol);
 	spin_unlock_irq(&q->queue_lock);
 	mutex_unlock(&q->blkcg_mutex);
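
As a condensed view of the result (a sketch assuming the hunks above,
not a literal quote of the file), both teardown paths now share the
helper:

	/* enomem rollback in blkcg_activate_policy(), under q->blkcg_mutex */
	spin_lock_irq(&q->queue_lock);
	blkcg_policy_teardown_pds(q, pol);
	spin_unlock_irq(&q->queue_lock);

	/* blkcg_deactivate_policy() */
	mutex_lock(&q->blkcg_mutex);
	spin_lock_irq(&q->queue_lock);
	__clear_bit(pol->plid, q->blkcg_pols);
	blkcg_policy_teardown_pds(q, pol);
	spin_unlock_irq(&q->queue_lock);
	mutex_unlock(&q->blkcg_mutex);

Both callers hold q->queue_lock, so the helper only takes the
per-blkcg lock itself.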