From 4d1e59220eb7d38fbccce73ebc8f589ef99326f5 Mon Sep 17 00:00:00 2001 From: Milan Miladinovic Date: Tue, 23 Dec 2025 18:47:59 -0500 Subject: [PATCH] Run alarm without syncing alarm manager if currentTime past local time Historically, if a SQLite DOs local alarm time in SQLite didn't match the AlarmManager's alarm time, we would reschedule the alarm and try to run the alarm again later. However, if the time we are attempting to sync to is already before the current real system time, there is no point in rescheduling to run later, since we might as well run now. This commit runs the alarm immediately if the current system time is already past the time we are trying to sync the alarm manager to, since it's better to run an alarm a little late rather than very late. --- src/workerd/api/global-scope.c++ | 3 ++- src/workerd/io/actor-cache-test.c++ | 16 +++++++----- src/workerd/io/actor-cache.c++ | 6 ++++- src/workerd/io/actor-cache.h | 5 ++++ src/workerd/io/actor-sqlite-test.c++ | 38 +++++++++++++++------------- src/workerd/io/actor-sqlite.c++ | 35 ++++++++++++++++++++++--- src/workerd/io/actor-sqlite.h | 1 + 7 files changed, 76 insertions(+), 28 deletions(-) diff --git a/src/workerd/api/global-scope.c++ b/src/workerd/api/global-scope.c++ index 3542c4bc714..ebd2cc05ae3 100644 --- a/src/workerd/api/global-scope.c++ +++ b/src/workerd/api/global-scope.c++ @@ -438,8 +438,9 @@ kj::Promise ServiceWorkerGlobalScope::runAlarm(kj: } } + auto currentTime = kj::systemPreciseCalendarClock().now(); KJ_SWITCH_ONEOF(persistent.armAlarmHandler( - scheduledTime, context.getCurrentTraceSpan(), false, actorId)) { + scheduledTime, context.getCurrentTraceSpan(), currentTime, false, actorId)) { KJ_CASE_ONEOF(armResult, ActorCacheInterface::RunAlarmHandler) { auto& handler = KJ_REQUIRE_NONNULL(exportedHandler); if (handler.alarm == kj::none) { diff --git a/src/workerd/io/actor-cache-test.c++ b/src/workerd/io/actor-cache-test.c++ index 7a3ae5b6772..893d458f1e3 100644 --- a/src/workerd/io/actor-cache-test.c++ +++ b/src/workerd/io/actor-cache-test.c++ @@ -4938,6 +4938,8 @@ KJ_TEST("ActorCache alarm get/put") { auto oneMs = 1 * kj::MILLISECONDS + kj::UNIX_EPOCH; auto twoMs = 2 * kj::MILLISECONDS + kj::UNIX_EPOCH; + // Used as the "current time" parameter for armAlarmHandler in tests. + auto testCurrentTime = kj::UNIX_EPOCH; { // Test alarm writes happen transactionally with storage ops test.setAlarm(oneMs); @@ -4979,7 +4981,8 @@ KJ_TEST("ActorCache alarm get/put") { { // we have a cached time == nullptr, so we should not attempt to run an alarm - auto armResult = test.cache.armAlarmHandler(10 * kj::SECONDS + kj::UNIX_EPOCH, nullptr); + auto armResult = + test.cache.armAlarmHandler(10 * kj::SECONDS + kj::UNIX_EPOCH, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); auto cancelResult = kj::mv(armResult.get()); KJ_ASSERT(cancelResult.waitBeforeCancel.poll(ws)); @@ -4997,7 +5000,7 @@ KJ_TEST("ActorCache alarm get/put") { { // Test that alarm handler handle clears alarm when dropped with no writes { - auto armResult = test.cache.armAlarmHandler(oneMs, nullptr); + auto armResult = test.cache.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); } mockStorage->expectCall("deleteAlarm", ws) @@ -5010,7 +5013,7 @@ KJ_TEST("ActorCache alarm get/put") { // Test that alarm handler handle does not clear alarm when dropped with writes { - auto armResult = test.cache.armAlarmHandler(oneMs, nullptr); + auto armResult = test.cache.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); test.setAlarm(twoMs); } @@ -5024,7 +5027,7 @@ KJ_TEST("ActorCache alarm get/put") { // Test that alarm handler handle does not cache delete when it fails { - auto armResult = test.cache.armAlarmHandler(oneMs, nullptr); + auto armResult = test.cache.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); } mockStorage->expectCall("deleteAlarm", ws) @@ -5036,7 +5039,7 @@ KJ_TEST("ActorCache alarm get/put") { { // Test that alarm handler handle does not cache alarm delete when noCache == true { - auto armResult = test.cache.armAlarmHandler(twoMs, nullptr, true); + auto armResult = test.cache.armAlarmHandler(twoMs, nullptr, testCurrentTime, true); KJ_ASSERT(armResult.is()); } mockStorage->expectCall("deleteAlarm", ws) @@ -5073,6 +5076,7 @@ KJ_TEST("ActorCache alarm delete when flush fails") { auto& mockStorage = test.mockStorage; auto oneMs = 1 * kj::MILLISECONDS + kj::UNIX_EPOCH; + auto testCurrentTime = kj::UNIX_EPOCH; { auto time = expectUncached(test.getAlarm()); @@ -5090,7 +5094,7 @@ KJ_TEST("ActorCache alarm delete when flush fails") { // we want to test that even if a flush is retried // that the post-delete actions for a checked delete happen. { - auto handle = test.cache.armAlarmHandler(oneMs, nullptr); + auto handle = test.cache.armAlarmHandler(oneMs, nullptr, testCurrentTime); auto time = expectCached(test.getAlarm()); KJ_ASSERT(time == kj::none); diff --git a/src/workerd/io/actor-cache.c++ b/src/workerd/io/actor-cache.c++ index 4cf33d65631..c87edd94c2e 100644 --- a/src/workerd/io/actor-cache.c++ +++ b/src/workerd/io/actor-cache.c++ @@ -164,7 +164,11 @@ kj::Maybe> ActorCache::evictStale(kj::Date now) { } kj::OneOf ActorCache::armAlarmHandler( - kj::Date scheduledTime, SpanParent parentSpan, bool noCache, kj::StringPtr actorId) { + kj::Date scheduledTime, + SpanParent parentSpan, + kj::Date /*currentTime -- unused*/, + bool noCache, + kj::StringPtr actorId) { noCache = noCache || lru.options.noCache; KJ_ASSERT(!currentAlarmTime.is()); diff --git a/src/workerd/io/actor-cache.h b/src/workerd/io/actor-cache.h index 38365622cf2..5aa4b4a4994 100644 --- a/src/workerd/io/actor-cache.h +++ b/src/workerd/io/actor-cache.h @@ -243,8 +243,12 @@ class ActorCacheInterface: public ActorCacheOps { }; // Call when entering the alarm handler. + // + // `currentTime` is used to determine if an overdue alarm should run immediately even when + // the local alarm state differs from the scheduled time (to avoid blocking on storage sync). virtual kj::OneOf armAlarmHandler(kj::Date scheduledTime, SpanParent parentSpan, + kj::Date currentTime, bool noCache = false, kj::StringPtr actorId = "") = 0; @@ -363,6 +367,7 @@ class ActorCache final: public ActorCacheInterface { kj::OneOf armAlarmHandler(kj::Date scheduledTime, SpanParent parentSpan, + kj::Date currentTime, bool noCache = false, kj::StringPtr actorId = "") override; void cancelDeferredAlarmDeletion() override; diff --git a/src/workerd/io/actor-sqlite-test.c++ b/src/workerd/io/actor-sqlite-test.c++ index f926cf8c7f5..212ed027c78 100644 --- a/src/workerd/io/actor-sqlite-test.c++ +++ b/src/workerd/io/actor-sqlite-test.c++ @@ -20,6 +20,10 @@ static constexpr kj::Date twoMs = 2 * kj::MILLISECONDS + kj::UNIX_EPOCH; static constexpr kj::Date threeMs = 3 * kj::MILLISECONDS + kj::UNIX_EPOCH; static constexpr kj::Date fourMs = 4 * kj::MILLISECONDS + kj::UNIX_EPOCH; static constexpr kj::Date fiveMs = 5 * kj::MILLISECONDS + kj::UNIX_EPOCH; +// Used as the "current time" parameter for armAlarmHandler in tests. +// Set to epoch (before all test alarm times) so existing tests aren't affected by +// the overdue alarm check. +static constexpr kj::Date testCurrentTime = kj::UNIX_EPOCH; template kj::Promise eagerlyReportExceptions(kj::Promise promise, kj::SourceLocation location = {}) { @@ -586,7 +590,7 @@ KJ_TEST("tells alarm handler to cancel when committed alarm is empty") { ActorSqliteTest test; { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); // We expect armAlarmHandler() to tell us to cancel the alarm. KJ_ASSERT(armResult.is()); auto waitPromise = kj::mv(armResult.get().waitBeforeCancel); @@ -612,7 +616,7 @@ KJ_TEST("tells alarm handler to reschedule when handler alarm is later than comm KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); // Request handler run at 2ms. Expect cancellation with rescheduling. - auto armResult = test.actor.armAlarmHandler(twoMs, nullptr); + auto armResult = test.actor.armAlarmHandler(twoMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); auto cancelResult = kj::mv(armResult.get()); @@ -636,7 +640,7 @@ KJ_TEST("tells alarm handler to reschedule when handler alarm is earlier than co KJ_ASSERT(expectSync(test.getAlarm()) == twoMs); // Expect that armAlarmHandler() tells caller to cancel after rescheduling completes. - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); auto cancelResult = kj::mv(armResult.get()); @@ -661,7 +665,7 @@ KJ_TEST("does not cancel handler when local db alarm state is later than schedul test.setAlarm(twoMs); { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); } test.pollAndExpectCalls({"commit"})[0]->fulfill(); @@ -680,7 +684,7 @@ KJ_TEST("does not cancel handler when local db alarm state is earlier than sched test.setAlarm(oneMs); { - auto armResult = test.actor.armAlarmHandler(twoMs, nullptr); + auto armResult = test.actor.armAlarmHandler(twoMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); } test.pollAndExpectCalls({"scheduleRun(1ms)"})[0]->fulfill(); @@ -698,7 +702,7 @@ KJ_TEST("getAlarm() returns null during handler") { KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); test.pollAndExpectCalls({}); @@ -719,7 +723,7 @@ KJ_TEST("alarm handler handle clears alarm when dropped with no writes") { KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); } test.pollAndExpectCalls({"commit"})[0]->fulfill(); @@ -738,7 +742,7 @@ KJ_TEST("alarm deleter does not clear alarm when dropped with writes") { KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); test.setAlarm(twoMs); } @@ -759,7 +763,7 @@ KJ_TEST("can cancel deferred alarm deletion during handler") { KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); test.actor.cancelDeferredAlarmDeletion(); } @@ -778,7 +782,7 @@ KJ_TEST("canceling deferred alarm deletion outside handler has no effect") { KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); } test.pollAndExpectCalls({"commit"})[0]->fulfill(); @@ -803,7 +807,7 @@ KJ_TEST("canceling deferred alarm deletion outside handler edge case") { KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); } test.actor.cancelDeferredAlarmDeletion(); @@ -825,7 +829,7 @@ KJ_TEST("canceling deferred alarm deletion is idempotent") { KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); test.actor.cancelDeferredAlarmDeletion(); test.actor.cancelDeferredAlarmDeletion(); @@ -846,7 +850,7 @@ KJ_TEST("alarm handler cleanup succeeds when output gate is broken") { test.pollAndExpectCalls({}); KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); auto deferredDelete = kj::mv(armResult.get().deferredDelete); @@ -893,7 +897,7 @@ KJ_TEST("handler alarm is not deleted when commit fails") { KJ_ASSERT(expectSync(test.getAlarm()) == oneMs); { - auto armResult = test.actor.armAlarmHandler(oneMs, nullptr); + auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); KJ_ASSERT(expectSync(test.getAlarm()) == kj::none); @@ -1340,7 +1344,7 @@ KJ_TEST("rolling back transaction leaves deferred alarm deletion in expected sta KJ_ASSERT(expectSync(test.getAlarm()) == twoMs); { - auto armResult = test.actor.armAlarmHandler(twoMs, nullptr); + auto armResult = test.actor.armAlarmHandler(twoMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); auto txn = test.actor.startTransaction(); @@ -1373,7 +1377,7 @@ KJ_TEST("committing transaction leaves deferred alarm deletion in expected state KJ_ASSERT(expectSync(test.getAlarm()) == twoMs); { - auto armResult = test.actor.armAlarmHandler(twoMs, nullptr); + auto armResult = test.actor.armAlarmHandler(twoMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); auto txn = test.actor.startTransaction(); @@ -1404,7 +1408,7 @@ KJ_TEST("rolling back nested transaction leaves deferred alarm deletion in expec KJ_ASSERT(expectSync(test.getAlarm()) == twoMs); { - auto armResult = test.actor.armAlarmHandler(twoMs, nullptr); + auto armResult = test.actor.armAlarmHandler(twoMs, nullptr, testCurrentTime); KJ_ASSERT(armResult.is()); auto txn1 = test.actor.startTransaction(); diff --git a/src/workerd/io/actor-sqlite.c++ b/src/workerd/io/actor-sqlite.c++ index b8d2976f609..5353ee207c6 100644 --- a/src/workerd/io/actor-sqlite.c++ +++ b/src/workerd/io/actor-sqlite.c++ @@ -863,8 +863,11 @@ void ActorSqlite::shutdown(kj::Maybe maybeException) { } kj::OneOf ActorSqlite:: - armAlarmHandler( - kj::Date scheduledTime, SpanParent parentSpan, bool noCache, kj::StringPtr actorId) { + armAlarmHandler(kj::Date scheduledTime, + SpanParent parentSpan, + kj::Date currentTime, + bool noCache, + kj::StringPtr actorId) { KJ_ASSERT(!inAlarmHandler); if (haveDeferredDelete) { @@ -879,6 +882,19 @@ kj::OneOf ActorSq // If there's a clean db time that differs from the requested handler's scheduled time, this // run should be canceled. if (willFireEarlier(scheduledTime, localAlarmState)) { + // If the local alarm time is already in the past, just run the handler now. This avoids + // blocking alarm execution on storage sync when storage is overloaded. The alarm will + // either delete itself on success or reschedule on failure. + if (localAlarmState != kj::none && willFireEarlier(localAlarmState, currentTime)) { + LOG_WARNING_PERIODICALLY( + "NOSENTRY SQLite alarm overdue, running despite AlarmManager mismatch", scheduledTime, + KJ_ASSERT_NONNULL(localAlarmState), currentTime, actorId); + haveDeferredDelete = true; + inAlarmHandler = true; + static const DeferredAlarmDeleter disposer; + return RunAlarmHandler{.deferredDelete = kj::Own(this, disposer)}; + } + // If the handler's scheduled time is earlier than the clean scheduled time, we may be // recovering from a failed db commit or scheduling request, so we need to request that // the alarm be rescheduled for the current db time, and tell the caller to wait for @@ -909,10 +925,23 @@ kj::OneOf ActorSq // which suggests that either the alarm manager is working with stale data or that local // alarm time has somehow gotten out of sync with the scheduled alarm time. - // Only log if the alarm manager is significantly late (>10 seconds behind SQLite) // We know localAlarmState has a value here because we're in the branch where it's earlier // than scheduledTime (not equal, and not later). auto localTime = KJ_ASSERT_NONNULL(localAlarmState); + + // If the local alarm time is already in the past, just run the handler now. This avoids + // blocking alarm execution on storage sync when storage is overloaded. + if (localTime <= currentTime) { + LOG_WARNING_PERIODICALLY( + "NOSENTRY SQLite alarm overdue, running despite stale AlarmManager time", + scheduledTime, localTime, currentTime, actorId); + haveDeferredDelete = true; + inAlarmHandler = true; + static const DeferredAlarmDeleter disposer; + return RunAlarmHandler{.deferredDelete = kj::Own(this, disposer)}; + } + + // Only log if the alarm manager is significantly late (>10 seconds behind SQLite) if (scheduledTime - localTime > 10 * kj::SECONDS) { LOG_WARNING_PERIODICALLY( "NOSENTRY SQLite alarm handler canceled.", scheduledTime, actorId, localTime); diff --git a/src/workerd/io/actor-sqlite.h b/src/workerd/io/actor-sqlite.h index e51ed2a7fea..0f47e424605 100644 --- a/src/workerd/io/actor-sqlite.h +++ b/src/workerd/io/actor-sqlite.h @@ -96,6 +96,7 @@ class ActorSqlite final: public ActorCacheInterface, private kj::TaskSet::ErrorH void shutdown(kj::Maybe maybeException) override; kj::OneOf armAlarmHandler(kj::Date scheduledTime, SpanParent parentSpan, + kj::Date currentTime, bool noCache = false, kj::StringPtr actorId = "") override; void cancelDeferredAlarmDeletion() override;