From 8b85d700ded85ae20e2b84e451e4607fab14385d Mon Sep 17 00:00:00 2001 From: Ruizhe Zhou Date: Mon, 13 Oct 2025 16:54:17 +0800 Subject: [PATCH] perftest: fix premature exit when select() is interrupted by SIGALRM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The perftest framework makes extensive use of alarm() to control test duration (--duration) and to schedule periodic tasks. Functions such as `run_iter_bw()`, `run_iter_lat_send()`, and `run_iter_bi()` install a handler via `signal(SIGALRM, catch_alarm)` when the -D option is used, and then set an alarm. In `run_iter_bw_server()` and `run_iter_bi()`, a watchdog is also installed in iterations mode via `signal(SIGALRM, check_alive)` followed by `alarm(60)` to detect stalled tests. In the problematic case, `run_iter_bi()` with the -e option invokes `ctx_notify_send_recv_events()`, which performs a `select()` on two file descriptors: `ctx->recv_channel->fd` — CQ receive completion channel `ctx->send_channel->fd` — CQ send completion channel When a completion event is generated, the kernel marks the corresponding file descriptor readable and `select()` returns. However, due to low processing speed on some NICs, no completion event is generated within 60 seconds (the test case does not finish in time under a high-pressure test). The watchdog `alarm()` fires, delivering SIGALRM, which interrupts the blocking `select()` call. The function then exits with an error instead of retrying. This behavior exposes a robustness issue in perftest: SIGALRM in this context is meant only as a check-alive signal, not as a fatal condition. A `select()` call interrupted by SIGALRM should be restarted rather than causing an unexpected termination. This patch updates perftest to properly handle EINTR by retrying `select()` when it is interrupted by SIGALRM, ensuring correct behavior even under slow device processing conditions. 
Signed-off-by: Ruizhe Zhou --- src/perftest_communication.c | 5 +++-- src/perftest_resources.c | 3 +++ src/perftest_resources.h | 34 +++++++++++++++++++++++++++------- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/perftest_communication.c b/src/perftest_communication.c index 5c89dd0f..6384811e 100755 --- a/src/perftest_communication.c +++ b/src/perftest_communication.c @@ -1125,9 +1125,10 @@ int rdma_client_connect(struct pingpong_context *ctx,struct perftest_parameters } if (event->event != RDMA_CM_EVENT_ESTABLISHED) { - fprintf(stderr, "Unexpected CM event bl blka %d\n", event->event); + fprintf(stderr, "Unexpected CM event bl blka %s; error: %d.\n", + rdma_event_str(event->event), event->status); rdma_ack_cm_event(event); - return FAILURE; + return FAILURE; } if (user_param->connection_type == UD) { diff --git a/src/perftest_resources.c b/src/perftest_resources.c index 94e95864..a260290e 100755 --- a/src/perftest_resources.c +++ b/src/perftest_resources.c @@ -108,6 +108,7 @@ static __always_inline int poll_completions( struct perftest_parameters* duration_param; struct check_alive_data check_alive_data; +volatile sig_atomic_t g_sigalarm_fired = 0; /****************************************************************************** * Beginning @@ -5859,6 +5860,7 @@ uint16_t ctx_get_local_lid(struct ibv_context *context,int port) ******************************************************************************/ void catch_alarm(int sig) { + g_sigalarm_fired = 1; switch (duration_param->state) { case START_STATE: duration_param->state = SAMPLE_STATE; @@ -5886,6 +5888,7 @@ void catch_alarm(int sig) void check_alive(int sig) { + g_sigalarm_fired = 1; if (check_alive_data.current_totrcnt > check_alive_data.last_totrcnt) { check_alive_data.last_totrcnt = check_alive_data.current_totrcnt; alarm(60); diff --git a/src/perftest_resources.h b/src/perftest_resources.h index 3bdc7952..5e2d894b 100644 --- a/src/perftest_resources.h +++ b/src/perftest_resources.h 
@@ -72,6 +72,9 @@ #include #include #include +#include +#include +#include #include "perftest_parameters.h" #define NUM_OF_RETRIES (10) @@ -185,6 +188,7 @@ static inline uint64_t build_wr_id(uint32_t wr_index, uint16_t qp_index) return ((uint64_t)wr_index) | ((uint64_t)qp_index << WR_ID_QP_INDEX_OFFSET); } +extern volatile sig_atomic_t g_sigalarm_fired; /****************************************************************************** * Perftest resources Structures and data types. ******************************************************************************/ @@ -865,15 +869,31 @@ static __inline void increase_rem_addr(struct ibv_send_wr *wr,int size,uint64_t static __inline int ctx_notify_send_recv_events(struct pingpong_context *ctx) { fd_set rfds; + int ret; - FD_ZERO(&rfds); - FD_SET(ctx->recv_channel->fd, &rfds); - FD_SET(ctx->send_channel->fd, &rfds); + do { + FD_ZERO(&rfds); + FD_SET(ctx->recv_channel->fd, &rfds); + FD_SET(ctx->send_channel->fd, &rfds); - if (select(MAX(ctx->recv_channel->fd, - ctx->send_channel->fd) + 1, - &rfds, NULL, NULL, NULL) == -1) { - fprintf(stderr, "Failed to get completion events\n"); + g_sigalarm_fired = 0; + + ret = select(MAX(ctx->recv_channel->fd, + ctx->send_channel->fd) + 1, + &rfds, NULL, NULL, NULL); + + if (ret == -1 && errno == EINTR) { + if (g_sigalarm_fired) { + fprintf(stderr, "Confirmed: select() was interrupted by SIGALARM. Retrying...\n"); + } else { + fprintf(stderr, "Warning: select() interrupted by another signal. Retrying...\n"); + } + } + + } while (ret == -1 && errno == EINTR); + + if (ret == -1) { + fprintf(stderr, "Failed to get completion events: %s\n", strerror(errno)); return FAILURE; }