diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43d6152897a4..403a46c86411 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -199,7 +199,8 @@ static void blk_flush_complete_seq(struct request *rq,
 }
 
 static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
-				       blk_status_t error)
+				       blk_status_t error,
+				       const struct io_comp_batch *iob)
 {
 	struct request_queue *q = flush_rq->q;
 	struct list_head *running;
@@ -335,7 +336,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 }
 
 static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
-					       blk_status_t error)
+					       blk_status_t error,
+					       const struct io_comp_batch *iob)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a29d8ac9d3e3..cf1daedbb39f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1156,7 +1156,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 
 	if (rq->end_io) {
 		rq_qos_done(rq->q, rq);
-		if (rq->end_io(rq, error) == RQ_END_IO_FREE)
+		if (rq->end_io(rq, error, NULL) == RQ_END_IO_FREE)
 			blk_mq_free_request(rq);
 	} else {
 		blk_mq_free_request(rq);
@@ -1211,7 +1211,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
 		 * If end_io handler returns NONE, then it still has
 		 * ownership of the request.
 		 */
-		if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
+		if (rq->end_io && rq->end_io(rq, 0, iob) == RQ_END_IO_NONE)
 			continue;
 
 		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@ -1458,7 +1458,8 @@ struct blk_rq_wait {
 	blk_status_t ret;
 };
 
-static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
+static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret,
+					  const struct io_comp_batch *iob)
 {
 	struct blk_rq_wait *wait = rq->end_io_data;
 
@@ -1688,7 +1689,7 @@ static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expi
 void blk_mq_put_rq_ref(struct request *rq)
 {
 	if (is_flush_rq(rq)) {
-		if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
+		if (rq->end_io(rq, 0, NULL) == RQ_END_IO_FREE)
 			blk_mq_free_request(rq);
 	} else if (req_ref_put_and_test(rq)) {
 		__blk_mq_free_request(rq);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index a6ca92049c10..e9a7563b4b2f 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -295,7 +295,8 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
 }
 
 static enum rq_end_io_ret end_clone_request(struct request *clone,
-					    blk_status_t error)
+					    blk_status_t error,
+					    const struct io_comp_batch *iob)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7bf228df6001..19b67cf5d550 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1333,7 +1333,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
 }
 
 static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
-						 blk_status_t status)
+						 blk_status_t status,
+						 const struct io_comp_batch *iob)
 {
 	struct nvme_ctrl *ctrl = rq->end_io_data;
 	unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a9c097dacad6..fb62633ccbb0 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -410,7 +410,8 @@ static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 }
 
 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
-						blk_status_t err)
+						blk_status_t err,
+						const struct io_comp_batch *iob)
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
@@ -425,14 +426,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
 
 	/*
-	 * IOPOLL could potentially complete this request directly, but
-	 * if multiple rings are polling on the same queue, then it's possible
-	 * for one ring to find completions for another ring. Punting the
-	 * completion via task_work will always direct it to the right
-	 * location, rather than potentially complete requests for ringA
-	 * under iopoll invocations from ringB.
+	 * For IOPOLL, check if this completion is happening in the context
+	 * of the same io_ring that owns the request (local context). If so,
+	 * we can complete inline without task_work overhead. Otherwise, we
+	 * must punt to task_work to ensure completion happens in the correct
+	 * ring's context.
 	 */
-	io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	if (blk_rq_is_poll(req) && iob &&
+	    iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) {
+		if (pdu->bio)
+			blk_rq_unmap_user(pdu->bio);
+		io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0);
+	} else {
+		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	}
 	return RQ_END_IO_FREE;
 }
 
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0e4caeab739c..92d9eff3d460 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1616,7 +1616,8 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
+static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error,
+				      const struct io_comp_batch *iob)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 
@@ -2859,7 +2860,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 }
 
 static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
-					     blk_status_t error)
+					     blk_status_t error,
+					     const struct io_comp_batch *iob)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
 
@@ -2869,14 +2871,15 @@ static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
 }
 
 static enum rq_end_io_ret nvme_del_cq_end(struct request *req,
-					  blk_status_t error)
+					  blk_status_t error,
+					  const struct io_comp_batch *iob)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
 
 	if (error)
 		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
 
-	return nvme_del_queue_end(req, error);
+	return nvme_del_queue_end(req, error, iob);
 }
 
 static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index 96648ec2fadb..0823c87637d3 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -247,7 +247,8 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
 }
 
 static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq,
-						  blk_status_t blk_status)
+						  blk_status_t blk_status,
+						  const struct io_comp_batch *iob)
 {
 	struct nvmet_req *req = rq->end_io_data;
 
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index eebca96c1fc1..3f3710ea1a98 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2109,7 +2109,8 @@ enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *scmd)
 }
 
 static enum rq_end_io_ret eh_lock_door_done(struct request *req,
-					    blk_status_t status)
+					    blk_status_t status,
+					    const struct io_comp_batch *iob)
 {
 	blk_mq_free_request(req);
 	return RQ_END_IO_NONE;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 57fba34832ad..1a521f9d821a 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -177,7 +177,8 @@ typedef struct sg_device { /* holds the state of each scsi generic device */
 } Sg_device;
 
 /* tasklet or soft irq callback */
-static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status);
+static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status,
+				       const struct io_comp_batch *iob);
 static int sg_start_req(Sg_request *srp, unsigned char *cmd);
 static int sg_finish_rem_req(Sg_request * srp);
 static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size);
@@ -1309,7 +1310,8 @@ sg_rq_end_io_usercontext(struct work_struct *work)
  * level when a command is completed (or has failed).
  */
 static enum rq_end_io_ret
-sg_rq_end_io(struct request *rq, blk_status_t status)
+sg_rq_end_io(struct request *rq, blk_status_t status,
+	     const struct io_comp_batch *iob)
 {
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
 	struct sg_request *srp = rq->end_io_data;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 168f25e4aaa3..8aeaa3b68c25 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -525,7 +525,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
 }
 
 static enum rq_end_io_ret st_scsi_execute_end(struct request *req,
-					      blk_status_t status)
+					      blk_status_t status,
+					      const struct io_comp_batch *iob)
 {
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
 	struct st_request *SRpnt = req->end_io_data;
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index db4e09042469..823b2665f95b 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -39,7 +39,8 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev)
 }
 
 static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd);
-static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t);
+static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t,
+					 const struct io_comp_batch *);
 
 /* pscsi_attach_hba():
  *
@@ -1001,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev)
 }
 
 static enum rq_end_io_ret pscsi_req_done(struct request *req,
-					 blk_status_t status)
+					 blk_status_t status,
+					 const struct io_comp_batch *iob)
 {
 	struct se_cmd *cmd = req->end_io_data;
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index cae9e857aea4..18a2388ba581 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -13,6 +13,7 @@
 
 struct blk_mq_tags;
 struct blk_flush_queue;
+struct io_comp_batch;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_DEFAULT_RQ	128
@@ -22,7 +23,8 @@ enum rq_end_io_ret {
 	RQ_END_IO_FREE,
 };
 
-typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
+typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t,
+					  const struct io_comp_batch *);
 
 /*
  * request flags */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 72e34acd439c..eae487e79a35 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1822,6 +1822,7 @@ struct io_comp_batch {
	struct rq_list req_list;
 	bool need_ts;
 	void (*complete)(struct io_comp_batch *);
+	void *poll_ctx;
 };
 
 static inline bool blk_atomic_write_start_sect_aligned(sector_t sector,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 70ca88cc1f54..ff3192f603f3 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1320,6 +1320,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 	DEFINE_IO_COMP_BATCH(iob);
 	int nr_events = 0;
 
+	/*
+	 * Store the polling io_ring_ctx so drivers can detect if they're
+	 * completing a request in the same ring context that's polling.
+	 */
+	iob.poll_ctx = ctx;
+
 	/*
 	 * Only spin for completions if we don't have multiple devices hanging
 	 * off our complete list.
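Since rq_end_io_fn is a widely implemented hook, every handler, in-tree or out, now has to carry the extra argument even when it ignores it. Below is a minimal sketch of an updated handler, not part of the patch: the example_io struct, its owner_ctx field, and the example_* functions are hypothetical, while the callback signature, the RQ_END_IO_* return values, blk_rq_is_poll(), and iob->poll_ctx are what this series defines. Note that iob is non-NULL only on the batched completion path (blk_mq_end_request_batch()); __blk_mq_end_request() and blk_mq_put_rq_ref() pass NULL explicitly, so handlers must check it before dereferencing.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/workqueue.h>

/*
 * Hypothetical per-I/O state. Assumed to be allocated outside the
 * request itself (e.g. in the submitter's own context), and to have
 * had INIT_WORK(&io->work, example_finish_work) done at submit time.
 */
struct example_io {
	struct work_struct work;
	blk_status_t status;
	void *owner_ctx;	/* context that submitted the request */
};

static void example_finish(struct example_io *io)
{
	/* post io->status back to the submitter; details elided */
}

static void example_finish_work(struct work_struct *work)
{
	example_finish(container_of(work, struct example_io, work));
}

static enum rq_end_io_ret example_end_io(struct request *rq,
					 blk_status_t error,
					 const struct io_comp_batch *iob)
{
	struct example_io *io = rq->end_io_data;

	io->status = error;

	if (blk_rq_is_poll(rq) && iob && iob->poll_ctx == io->owner_ctx) {
		/* Polled in the submitter's own context: finish inline. */
		example_finish(io);
	} else {
		/* Foreign or non-polled completion: punt out of line. */
		schedule_work(&io->work);
	}
	/* io does not live inside the request, so freeing it now is safe. */
	return RQ_END_IO_FREE;
}

poll_ctx is deliberately a bare void * so that blkdev.h need not know anything about io_ring_ctx; the match in nvme_uring_cmd_end_io() above is pure pointer identity against io_uring_cmd_ctx_handle(ioucmd), and any other owner token would work the same way.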