From 7dd1f96c2e166104bde734a9161d84ca0abd598c Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 15:46:37 +0800
Subject: [PATCH 1/2] block: pass io_comp_batch to rq_end_io_fn callback

Add a third parameter 'const struct io_comp_batch *' to the rq_end_io_fn
callback signature. This allows end_io handlers to access the completion
batch context when requests are completed via blk_mq_end_request_batch().

The io_comp_batch is passed from blk_mq_end_request_batch(), while NULL
is passed from __blk_mq_end_request() and blk_mq_put_rq_ref() which don't
have batch context.

This infrastructure change enables drivers to detect whether they're
being called from a batched completion path (like iopoll) and access
additional context stored in the io_comp_batch.

Update all rq_end_io_fn implementations:
- block/blk-mq.c: blk_end_sync_rq
- block/blk-flush.c: flush_end_io, mq_flush_data_end_io
- drivers/nvme/host/ioctl.c: nvme_uring_cmd_end_io
- drivers/nvme/host/core.c: nvme_keep_alive_end_io
- drivers/nvme/host/pci.c: abort_endio, nvme_del_queue_end, nvme_del_cq_end
- drivers/nvme/target/passthru.c: nvmet_passthru_req_done
- drivers/scsi/scsi_error.c: eh_lock_door_done
- drivers/scsi/sg.c: sg_rq_end_io
- drivers/scsi/st.c: st_scsi_execute_end
- drivers/target/target_core_pscsi.c: pscsi_req_done
- drivers/md/dm-rq.c: end_clone_request

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-flush.c                  |  6 ++++--
 block/blk-mq.c                     |  9 +++++----
 drivers/md/dm-rq.c                 |  3 ++-
 drivers/nvme/host/core.c           |  3 ++-
 drivers/nvme/host/ioctl.c          |  3 ++-
 drivers/nvme/host/pci.c            | 11 +++++++----
 drivers/nvme/target/passthru.c     |  3 ++-
 drivers/scsi/scsi_error.c          |  3 ++-
 drivers/scsi/sg.c                  |  6 ++++--
 drivers/scsi/st.c                  |  3 ++-
 drivers/target/target_core_pscsi.c |  6 ++++--
 include/linux/blk-mq.h             |  4 +++-
 12 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43d6152897a42..403a46c864117 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -199,7 +199,8 @@ static void blk_flush_complete_seq(struct request *rq,
 }
 
 static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
-				       blk_status_t error)
+				       blk_status_t error,
+				       const struct io_comp_batch *iob)
 {
 	struct request_queue *q = flush_rq->q;
 	struct list_head *running;
@@ -335,7 +336,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 }
 
 static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
-					       blk_status_t error)
+					       blk_status_t error,
+					       const struct io_comp_batch *iob)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a29d8ac9d3e35..cf1daedbb39fd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1156,7 +1156,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 
 	if (rq->end_io) {
 		rq_qos_done(rq->q, rq);
-		if (rq->end_io(rq, error) == RQ_END_IO_FREE)
+		if (rq->end_io(rq, error, NULL) == RQ_END_IO_FREE)
 			blk_mq_free_request(rq);
 	} else {
 		blk_mq_free_request(rq);
@@ -1211,7 +1211,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
 		 * If end_io handler returns NONE, then it still has
 		 * ownership of the request.
 		 */
-		if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
+		if (rq->end_io && rq->end_io(rq, 0, iob) == RQ_END_IO_NONE)
 			continue;
 
 		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@ -1458,7 +1458,8 @@ struct blk_rq_wait {
 	blk_status_t ret;
 };
 
-static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
+static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret,
+					  const struct io_comp_batch *iob)
 {
 	struct blk_rq_wait *wait = rq->end_io_data;
 
@@ -1688,7 +1689,7 @@ static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expi
 void blk_mq_put_rq_ref(struct request *rq)
 {
 	if (is_flush_rq(rq)) {
-		if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
+		if (rq->end_io(rq, 0, NULL) == RQ_END_IO_FREE)
 			blk_mq_free_request(rq);
 	} else if (req_ref_put_and_test(rq)) {
 		__blk_mq_free_request(rq);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index a6ca92049c10e..e9a7563b4b2fe 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -295,7 +295,8 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
 }
 
 static enum rq_end_io_ret end_clone_request(struct request *clone,
-					    blk_status_t error)
+					    blk_status_t error,
+					    const struct io_comp_batch *iob)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7bf228df6001f..19b67cf5d5506 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1333,7 +1333,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
 }
 
 static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
-						 blk_status_t status)
+						 blk_status_t status,
+						 const struct io_comp_batch *iob)
 {
 	struct nvme_ctrl *ctrl = rq->end_io_data;
 	unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a9c097dacad6f..e45ac0ca174e0 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -410,7 +410,8 @@ static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 }
 
 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
-						blk_status_t err)
+						blk_status_t err,
+						const struct io_comp_batch *iob)
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0e4caeab739c7..92d9eff3d4608 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1616,7 +1616,8 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
+static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error,
+				      const struct io_comp_batch *iob)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 
@@ -2859,7 +2860,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 }
 
 static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
-					     blk_status_t error)
+					     blk_status_t error,
+					     const struct io_comp_batch *iob)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
 
@@ -2869,14 +2871,15 @@ static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
 }
 
 static enum rq_end_io_ret nvme_del_cq_end(struct request *req,
-					  blk_status_t error)
+					  blk_status_t error,
+					  const struct io_comp_batch *iob)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
 
 	if (error)
 		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
 
-	return nvme_del_queue_end(req, error);
+	return nvme_del_queue_end(req, error, iob);
 }
 
 static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index 96648ec2fadb5..0823c87637d37 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -247,7 +247,8 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
 }
 
 static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq,
-						  blk_status_t blk_status)
+						  blk_status_t blk_status,
+						  const struct io_comp_batch *iob)
 {
 	struct nvmet_req *req = rq->end_io_data;
 
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index eebca96c1fc15..3f3710ea1a988 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2109,7 +2109,8 @@ enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *scmd)
 }
 
 static enum rq_end_io_ret eh_lock_door_done(struct request *req,
-					    blk_status_t status)
+					    blk_status_t status,
+					    const struct io_comp_batch *iob)
 {
 	blk_mq_free_request(req);
 	return RQ_END_IO_NONE;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 57fba34832ad1..1a521f9d821a0 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -177,7 +177,8 @@ typedef struct sg_device { /* holds the state of each scsi generic device */
 } Sg_device;
 
 /* tasklet or soft irq callback */
-static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status);
+static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status,
+				       const struct io_comp_batch *iob);
 static int sg_start_req(Sg_request *srp, unsigned char *cmd);
 static int sg_finish_rem_req(Sg_request * srp);
 static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size);
@@ -1309,7 +1310,8 @@ sg_rq_end_io_usercontext(struct work_struct *work)
  * level when a command is completed (or has failed).
  */
 static enum rq_end_io_ret
-sg_rq_end_io(struct request *rq, blk_status_t status)
+sg_rq_end_io(struct request *rq, blk_status_t status,
+	     const struct io_comp_batch *iob)
 {
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
 	struct sg_request *srp = rq->end_io_data;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 168f25e4aaa38..8aeaa3b68c25e 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -525,7 +525,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
 }
 
 static enum rq_end_io_ret st_scsi_execute_end(struct request *req,
-					      blk_status_t status)
+					      blk_status_t status,
+					      const struct io_comp_batch *iob)
 {
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
 	struct st_request *SRpnt = req->end_io_data;
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index db4e09042469c..823b2665f95b0 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -39,7 +39,8 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev)
 }
 
 static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd);
-static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t);
+static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t,
+					 const struct io_comp_batch *);
 
 /*	pscsi_attach_hba():
  *
@@ -1001,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev)
 }
 
 static enum rq_end_io_ret pscsi_req_done(struct request *req,
-					 blk_status_t status)
+					 blk_status_t status,
+					 const struct io_comp_batch *iob)
 {
 	struct se_cmd *cmd = req->end_io_data;
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index cae9e857aea42..18a2388ba581d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -13,6 +13,7 @@
 
 struct blk_mq_tags;
 struct blk_flush_queue;
+struct io_comp_batch;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_DEFAULT_RQ	128
@@ -22,7 +23,8 @@ enum rq_end_io_ret {
 	RQ_END_IO_FREE,
 };
 
-typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
+typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t,
+					  const struct io_comp_batch *);
 
 /*
  * request flags */

From eb9f85441b40ef800be240e55d4f7fd1235b4e91 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 15:46:38 +0800
Subject: [PATCH 2/2] nvme/io_uring: optimize IOPOLL completions for local ring
 context

When multiple io_uring rings poll on the same NVMe queue, one ring can
find completions belonging to another ring. The current code always
uses task_work to handle this, but this adds overhead for the common
single-ring case.

This patch passes the polling io_ring_ctx through io_comp_batch's new
poll_ctx field. In io_do_iopoll(), the polling ring's context is stored
in iob.poll_ctx before calling the iopoll callbacks.

In nvme_uring_cmd_end_io(), we now compare iob->poll_ctx with the
request's owning io_ring_ctx (via io_uring_cmd_ctx_handle()). If they
match (local context), we complete inline with io_uring_cmd_done32().
If they differ (remote context) or iob is NULL (non-iopoll path), we
use task_work as before.

This optimization eliminates task_work scheduling overhead for the
common case where a ring polls and finds its own completions.

~10% IOPS improvement is observed in the following benchmark:

fio/t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -O0 -P1 -u1 -n1 /dev/ng0n1

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/nvme/host/ioctl.c | 20 +++++++++++++-------
 include/linux/blkdev.h    |  1 +
 io_uring/rw.c             |  6 ++++++
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index e45ac0ca174e0..fb62633ccbb0d 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -426,14 +426,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
 
 	/*
-	 * IOPOLL could potentially complete this request directly, but
-	 * if multiple rings are polling on the same queue, then it's possible
-	 * for one ring to find completions for another ring. Punting the
-	 * completion via task_work will always direct it to the right
-	 * location, rather than potentially complete requests for ringA
-	 * under iopoll invocations from ringB.
+	 * For IOPOLL, check if this completion is happening in the context
+	 * of the same io_ring that owns the request (local context). If so,
+	 * we can complete inline without task_work overhead. Otherwise, we
+	 * must punt to task_work to ensure completion happens in the correct
+	 * ring's context.
 	 */
-	io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	if (blk_rq_is_poll(req) && iob &&
+	    iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) {
+		if (pdu->bio)
+			blk_rq_unmap_user(pdu->bio);
+		io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0);
+	} else {
+		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	}
 	return RQ_END_IO_FREE;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 72e34acd439c9..eae487e79a350 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1822,6 +1822,7 @@ struct io_comp_batch {
 	struct rq_list req_list;
 	bool need_ts;
 	void (*complete)(struct io_comp_batch *);
+	void *poll_ctx;
 };
 
 static inline bool blk_atomic_write_start_sect_aligned(sector_t sector,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 70ca88cc1f547..ff3192f603f35 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1320,6 +1320,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 	DEFINE_IO_COMP_BATCH(iob);
 	int nr_events = 0;
 
+	/*
+	 * Store the polling io_ring_ctx so drivers can detect if they're
+	 * completing a request in the same ring context that's polling.
+	 */
+	iob.poll_ctx = ctx;
+
 	/*
 	 * Only spin for completions if we don't have multiple devices hanging
 	 * off our complete list.