From 7dd1f96c2e166104bde734a9161d84ca0abd598c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 15:46:37 +0800 Subject: [PATCH 1/2] block: pass io_comp_batch to rq_end_io_fn callback Add a third parameter 'const struct io_comp_batch *' to the rq_end_io_fn callback signature. This allows end_io handlers to access the completion batch context when requests are completed via blk_mq_end_request_batch(). The io_comp_batch is passed from blk_mq_end_request_batch(), while NULL is passed from __blk_mq_end_request() and blk_mq_put_rq_ref() which don't have batch context. This infrastructure change enables drivers to detect whether they're being called from a batched completion path (like iopoll) and access additional context stored in the io_comp_batch. Update all rq_end_io_fn implementations: - block/blk-mq.c: blk_end_sync_rq - block/blk-flush.c: flush_end_io, mq_flush_data_end_io - drivers/nvme/host/ioctl.c: nvme_uring_cmd_end_io - drivers/nvme/host/core.c: nvme_keep_alive_end_io - drivers/nvme/host/pci.c: abort_endio, nvme_del_queue_end, nvme_del_cq_end - drivers/nvme/target/passthru.c: nvmet_passthru_req_done - drivers/scsi/scsi_error.c: eh_lock_door_done - drivers/scsi/sg.c: sg_rq_end_io - drivers/scsi/st.c: st_scsi_execute_end - drivers/target/target_core_pscsi.c: pscsi_req_done - drivers/md/dm-rq.c: end_clone_request Signed-off-by: Ming Lei --- block/blk-flush.c | 6 ++++-- block/blk-mq.c | 9 +++++---- drivers/md/dm-rq.c | 3 ++- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/ioctl.c | 3 ++- drivers/nvme/host/pci.c | 11 +++++++---- drivers/nvme/target/passthru.c | 3 ++- drivers/scsi/scsi_error.c | 3 ++- drivers/scsi/sg.c | 6 ++++-- drivers/scsi/st.c | 3 ++- drivers/target/target_core_pscsi.c | 6 ++++-- include/linux/blk-mq.h | 4 +++- 12 files changed, 39 insertions(+), 21 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 43d6152897a42..403a46c864117 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -199,7 +199,8 @@ static void blk_flush_complete_seq(struct request *rq, } static enum rq_end_io_ret flush_end_io(struct request *flush_rq, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct request_queue *q = flush_rq->q; struct list_head *running; @@ -335,7 +336,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; diff --git a/block/blk-mq.c b/block/blk-mq.c index a29d8ac9d3e35..cf1daedbb39fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1156,7 +1156,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) if (rq->end_io) { rq_qos_done(rq->q, rq); - if (rq->end_io(rq, error) == RQ_END_IO_FREE) + if (rq->end_io(rq, error, NULL) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else { blk_mq_free_request(rq); @@ -1211,7 +1211,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) * If end_io handler returns NONE, then it still has * ownership of the request. */ - if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) + if (rq->end_io && rq->end_io(rq, 0, iob) == RQ_END_IO_NONE) continue; WRITE_ONCE(rq->state, MQ_RQ_IDLE); @@ -1458,7 +1458,8 @@ struct blk_rq_wait { blk_status_t ret; }; -static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) +static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret, + const struct io_comp_batch *iob) { struct blk_rq_wait *wait = rq->end_io_data; @@ -1688,7 +1689,7 @@ static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expi void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) { - if (rq->end_io(rq, 0) == RQ_END_IO_FREE) + if (rq->end_io(rq, 0, NULL) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index a6ca92049c10e..e9a7563b4b2fe 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -295,7 +295,8 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) } static enum rq_end_io_ret end_clone_request(struct request *clone, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct dm_rq_target_io *tio = clone->end_io_data; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7bf228df6001f..19b67cf5d5506 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1333,7 +1333,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) } static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct nvme_ctrl *ctrl = rq->end_io_data; unsigned long rtt = jiffies - (rq->deadline - rq->timeout); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index a9c097dacad6f..e45ac0ca174e0 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -410,7 +410,8 @@ static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, - blk_status_t err) + blk_status_t err, + const struct io_comp_batch *iob) { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0e4caeab739c7..92d9eff3d4608 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1616,7 +1616,8 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error) +static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; @@ -2859,7 +2860,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) } static enum rq_end_io_ret nvme_del_queue_end(struct request *req, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->end_io_data; @@ -2869,14 +2871,15 @@ static enum rq_end_io_ret nvme_del_queue_end(struct request *req, } static enum rq_end_io_ret nvme_del_cq_end(struct request *req, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->end_io_data; if (error) set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); - return nvme_del_queue_end(req, error); + return nvme_del_queue_end(req, error, iob); } static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 96648ec2fadb5..0823c87637d37 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -247,7 +247,8 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) } static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq, - blk_status_t blk_status) + blk_status_t blk_status, + const struct io_comp_batch *iob) { struct nvmet_req *req = rq->end_io_data; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index eebca96c1fc15..3f3710ea1a988 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2109,7 +2109,8 @@ enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *scmd) } static enum rq_end_io_ret eh_lock_door_done(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { blk_mq_free_request(req); return RQ_END_IO_NONE; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 57fba34832ad1..1a521f9d821a0 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -177,7 +177,8 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ } Sg_device; /* tasklet or soft irq callback */ -static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status); +static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status, + const struct io_comp_batch *iob); static int sg_start_req(Sg_request *srp, unsigned char *cmd); static int sg_finish_rem_req(Sg_request * srp); static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size); @@ -1309,7 +1310,8 @@ sg_rq_end_io_usercontext(struct work_struct *work) * level when a command is completed (or has failed). */ static enum rq_end_io_ret -sg_rq_end_io(struct request *rq, blk_status_t status) +sg_rq_end_io(struct request *rq, blk_status_t status, + const struct io_comp_batch *iob) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); struct sg_request *srp = rq->end_io_data; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 168f25e4aaa38..8aeaa3b68c25e 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -525,7 +525,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req) } static enum rq_end_io_ret st_scsi_execute_end(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); struct st_request *SRpnt = req->end_io_data; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index db4e09042469c..823b2665f95b0 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -39,7 +39,8 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev) } static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd); -static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t); +static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t, + const struct io_comp_batch *); /* pscsi_attach_hba(): * @@ -1001,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) } static enum rq_end_io_ret pscsi_req_done(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct se_cmd *cmd = req->end_io_data; struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cae9e857aea42..18a2388ba581d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -13,6 +13,7 @@ struct blk_mq_tags; struct blk_flush_queue; +struct io_comp_batch; #define BLKDEV_MIN_RQ 4 #define BLKDEV_DEFAULT_RQ 128 @@ -22,7 +23,8 @@ enum rq_end_io_ret { RQ_END_IO_FREE, }; -typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); +typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t, + const struct io_comp_batch *); /* * request flags */ From eb9f85441b40ef800be240e55d4f7fd1235b4e91 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 15:46:38 +0800 Subject: [PATCH 2/2] nvme/io_uring: optimize IOPOLL completions for local ring context When multiple io_uring rings poll on the same NVMe queue, one ring can find completions belonging to another ring. The current code always uses task_work to handle this, but this adds overhead for the common single-ring case. This patch passes the polling io_ring_ctx through io_comp_batch's new poll_ctx field. In io_do_iopoll(), the polling ring's context is stored in iob.poll_ctx before calling the iopoll callbacks. In nvme_uring_cmd_end_io(), we now compare iob->poll_ctx with the request's owning io_ring_ctx (via io_uring_cmd_ctx_handle()). If they match (local context), we complete inline with io_uring_cmd_done32(). If they differ (remote context) or iob is NULL (non-iopoll path), we use task_work as before. This optimization eliminates task_work scheduling overhead for the common case where a ring polls and finds its own completions. ~10% IOPS improvement is observed in the following benchmark: fio/t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -O0 -P1 -u1 -n1 /dev/ng0n1 Signed-off-by: Ming Lei --- drivers/nvme/host/ioctl.c | 20 +++++++++++++------- include/linux/blkdev.h | 1 + io_uring/rw.c | 6 ++++++ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e45ac0ca174e0..fb62633ccbb0d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -426,14 +426,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, pdu->result = le64_to_cpu(nvme_req(req)->result.u64); /* - * IOPOLL could potentially complete this request directly, but - * if multiple rings are polling on the same queue, then it's possible - * for one ring to find completions for another ring. Punting the - * completion via task_work will always direct it to the right - * location, rather than potentially complete requests for ringA - * under iopoll invocations from ringB. + * For IOPOLL, check if this completion is happening in the context + * of the same io_ring that owns the request (local context). If so, + * we can complete inline without task_work overhead. Otherwise, we + * must punt to task_work to ensure completion happens in the correct + * ring's context. */ - io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + if (blk_rq_is_poll(req) && iob && + iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) { + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0); + } else { + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + } return RQ_END_IO_FREE; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72e34acd439c9..eae487e79a350 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1822,6 +1822,7 @@ struct io_comp_batch { struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); + void *poll_ctx; }; static inline bool blk_atomic_write_start_sect_aligned(sector_t sector, diff --git a/io_uring/rw.c b/io_uring/rw.c index 70ca88cc1f547..ff3192f603f35 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1320,6 +1320,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) DEFINE_IO_COMP_BATCH(iob); int nr_events = 0; + /* + * Store the polling io_ring_ctx so drivers can detect if they're + * completing a request in the same ring context that's polling. + */ + iob.poll_ctx = ctx; + /* * Only spin for completions if we don't have multiple devices hanging * off our complete list.