From d67d0e351da3945b2faf830a89186c0b554bec0e Mon Sep 17 00:00:00 2001 From: Michael Karg Date: Tue, 16 Dec 2025 18:40:25 +0100 Subject: [PATCH] PrometheusSimple backend robustness improvements --- cardano-node/src/Cardano/Node/Tracing/API.hs | 25 +++++--- .../src/Cardano/Node/Tracing/StateRep.hs | 27 +++++++- trace-dispatcher/CHANGELOG.md | 1 + .../Cardano/Logging/Prometheus/NetworkRun.hs | 18 +++--- .../Cardano/Logging/Prometheus/TCPServer.hs | 61 ++++++++++++++++--- 5 files changed, 104 insertions(+), 28 deletions(-) diff --git a/cardano-node/src/Cardano/Node/Tracing/API.hs b/cardano-node/src/Cardano/Node/Tracing/API.hs index 43e702b8ff3..e33d1c88915 100644 --- a/cardano-node/src/Cardano/Node/Tracing/API.hs +++ b/cardano-node/src/Cardano/Node/Tracing/API.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE BangPatterns #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE MonoLocalBinds #-} {-# LANGUAGE PackageImports #-} @@ -11,7 +12,7 @@ module Cardano.Node.Tracing.API ) where import Cardano.Logging hiding (traceWith) -import Cardano.Logging.Prometheus.TCPServer (runPrometheusSimple) +import Cardano.Logging.Prometheus.TCPServer import Cardano.Node.Configuration.NodeAddress (PortNumber) import Cardano.Node.Configuration.POM (NodeConfiguration (..)) import Cardano.Node.Protocol.Types @@ -36,11 +37,12 @@ import Ouroboros.Network.NodeToNode (RemoteAddress) import Prelude +import Control.Concurrent.Async (link) import Control.DeepSeq (deepseq) import Control.Exception (SomeException (..)) -import Control.Monad (forM_) import "contra-tracer" Control.Tracer (traceWith) import "trace-dispatcher" Control.Tracer (nullTracer) +import Data.Functor.Contravariant ((>$<)) import qualified Data.Map.Strict as Map import Data.Maybe import Data.Time.Clock (getCurrentTime) @@ -83,10 +85,7 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do traceWith (nodeStateTracer tracers) NodeTracingOnlineConfiguring - mError <- kickoffPrometheusSimple - forM_ mError $ \errMsg -> - let errMsg' = 
"PrometheusSimple backend disabled due to initialisation error: " ++ errMsg - in traceWith (nodeStateTracer tracers) (NodeTracingFailure errMsg') + kickoffPrometheusSimple startResourceTracer (resourcesTracer tracers) @@ -105,7 +104,7 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do mkTracers :: TraceConfig -> IO ( IO () - , IO (Maybe String) + , IO () , Tracers RemoteAddress LocalAddress blk IO ) mkTracers trConfig = mdo @@ -113,8 +112,6 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do EKG.registerGcMetrics ekgStore ekgTrace <- ekgTracer trConfig ekgStore - let kickoffPrometheusSimple = maybe (pure Nothing) (runPrometheusSimple ekgStore) prometheusSimple - stdoutTrace <- standardTracer -- We should initialize forwarding only if 'Forwarder' backend @@ -155,6 +152,16 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do dpTracer trConfig p + + let + kickoffPrometheusSimple = case prometheusSimple of + Nothing -> pure () + Just ps -> + let + !nsTr = nodeStateTracer tracers + !tracePrometheus = NodePrometheusSimple >$< nsTr + in runPrometheusSimple tracePrometheus ekgStore ps >>= link + pure (kickoffForwarder, kickoffPrometheusSimple, tracers) where diff --git a/cardano-node/src/Cardano/Node/Tracing/StateRep.hs b/cardano-node/src/Cardano/Node/Tracing/StateRep.hs index 0a4e12bea43..4563ee4d819 100644 --- a/cardano-node/src/Cardano/Node/Tracing/StateRep.hs +++ b/cardano-node/src/Cardano/Node/Tracing/StateRep.hs @@ -22,6 +22,7 @@ module Cardano.Node.Tracing.StateRep import Cardano.Api (textShow) import Cardano.Logging +import Cardano.Logging.Prometheus.TCPServer (TracePrometheusSimple (..)) import Cardano.Node.Handlers.Shutdown (ShutdownTrace) import Cardano.Node.Protocol.Types (SomeConsensusProtocol (..)) import qualified Cardano.Node.Startup as Startup @@ -48,6 +49,11 @@ deriving instance ToJSON ChunkNo deriving instance NFData ChunkNo +deriving instance Generic TracePrometheusSimple +deriving instance 
FromJSON TracePrometheusSimple +deriving instance ToJSON TracePrometheusSimple +deriving instance NFData TracePrometheusSimple + data OpeningDbs = StartedOpeningImmutableDB | OpenedImmutableDB (WithOrigin SlotNo) ChunkNo @@ -100,6 +106,7 @@ data NodeState = NodeTracingOnlineConfiguring | NodeTracingFailure String | NodeTracingForwardingInterrupted HowToConnect String + | NodePrometheusSimple TracePrometheusSimple | NodeOpeningDbs OpeningDbs | NodeReplays Replays | NodeInitChainSelection InitChainSelection @@ -112,7 +119,7 @@ data NodeState deriving instance (NFData NodeState) instance LogFormatting NodeState where - forMachine _ = \case + forMachine _dtal = \case NodeTracingOnlineConfiguring -> mconcat [ "kind" .= String "NodeTracingOnlineConfiguring" ] NodeOpeningDbs x -> mconcat @@ -136,12 +143,16 @@ instance LogFormatting NodeState where , "conn" .= howToConnect , "message" .= toJSON x ] + NodePrometheusSimple promSimple -> + forMachine _dtal promSimple forHuman = \case NodeTracingFailure errMsg -> T.pack errMsg NodeTracingForwardingInterrupted howToConnect errMsg -> T.pack $ "trace forwarding connection with " <> show howToConnect <> " failed: " <> errMsg + NodePrometheusSimple promSimple -> + forHuman promSimple _ -> "" @@ -152,6 +163,10 @@ instance MetaTrace NodeState where Namespace [] ["NodeTracingFailure"] namespaceFor NodeTracingForwardingInterrupted {} = Namespace [] ["NodeTracingForwardingInterrupted"] + namespaceFor (NodePrometheusSimple TracePrometheusSimpleStart{}) = + Namespace [] ["PrometheusSimple", "Start"] + namespaceFor (NodePrometheusSimple TracePrometheusSimpleStop{}) = + Namespace [] ["PrometheusSimple", "Stop"] namespaceFor NodeOpeningDbs {} = Namespace [] ["OpeningDbs"] namespaceFor NodeReplays {} = @@ -173,6 +188,10 @@ instance MetaTrace NodeState where Just Error severityFor (Namespace _ ["NodeTracingForwardingInterrupted"]) _ = Just Warning + severityFor (Namespace _ ["PrometheusSimple", "Start"]) _ = + Just Info + severityFor 
(Namespace _ ["PrometheusSimple", "Stop"]) _ = + Just Warning severityFor (Namespace _ ["OpeningDbs"]) _ = Just Info severityFor (Namespace _ ["NodeReplays"]) _ = @@ -210,12 +229,18 @@ instance MetaTrace NodeState where "Node startup" documentFor (Namespace _ ["NodeShutdown"]) = Just "Node shutting down" + documentFor (Namespace _ ["PrometheusSimple", "Start"]) = + Just "PrometheusSimple backend is starting" + documentFor (Namespace _ ["PrometheusSimple", "Stop"]) = + Just "PrometheusSimple backend stopped" documentFor _ns = Nothing allNamespaces = [ Namespace [] ["NodeTracingOnlineConfiguring"] , Namespace [] ["NodeTracingFailure"] , Namespace [] ["NodeTracingForwardingInterrupted"] + , Namespace [] ["PrometheusSimple", "Start"] + , Namespace [] ["PrometheusSimple", "Stop"] , Namespace [] ["OpeningDbs"] , Namespace [] ["NodeReplays"] , Namespace [] ["NodeInitChainSelection"] diff --git a/trace-dispatcher/CHANGELOG.md b/trace-dispatcher/CHANGELOG.md index 08a8037431d..53feb7fd53f 100644 --- a/trace-dispatcher/CHANGELOG.md +++ b/trace-dispatcher/CHANGELOG.md @@ -2,6 +2,7 @@ ## 2.11.1 -- Dez 2025 +* Increase `PrometheusSimple` robustness by restarting the backend upon crash, adding start/stop traces, and reaping dangling sockets more eagerly * Removed `TraceConfig.tcPeerFrequency` and hence `TraceOptionPeerFrequency` from config representation * Removed unused module `Cardano.Logging.Types.NodePeers` diff --git a/trace-dispatcher/src/Cardano/Logging/Prometheus/NetworkRun.hs b/trace-dispatcher/src/Cardano/Logging/Prometheus/NetworkRun.hs index c6cd98f56d9..0bcc40d4ac7 100644 --- a/trace-dispatcher/src/Cardano/Logging/Prometheus/NetworkRun.hs +++ b/trace-dispatcher/src/Cardano/Logging/Prometheus/NetworkRun.hs @@ -44,12 +44,12 @@ data NetworkRunParams = NetworkRunParams defaultRunParams :: String -> NetworkRunParams defaultRunParams name = NetworkRunParams - { runSocketTimeout = 30 + { runSocketTimeout = 22 , runSocketGraceful = 1000 , runRecvMaxSize = 2048 , 
runRateLimit = 3.0 , runConnLimitGlobal = 12 - , runConnLimitPerHost = 3 + , runConnLimitPerHost = 4 , runServerName = name } @@ -68,8 +68,8 @@ mkTCPServerRunner :: NetworkRunParams -> Maybe HostName -> PortNumber - -> TimeoutServer a - -> IO (IO a) + -> TimeoutServer () + -> IO (IO ()) mkTCPServerRunner runParams (fromMaybe "127.0.0.1" -> host) portNo server = do !sock <- openTCPServerSocket =<< resolve host portNo let @@ -81,8 +81,8 @@ mkTCPServerRunner runParams (fromMaybe "127.0.0.1" -> host) portNo server = do runTCPServerWithSocket :: NetworkRunParams -> Socket - -> TimeoutServer a - -> IO a + -> TimeoutServer () + -> IO () runTCPServerWithSocket runParams@NetworkRunParams{..} sock server = do rateLimiter <- mkRateLimiter runServerName runRateLimit ConnLimiter{..} <- mkConnLimiter runConnLimitGlobal runConnLimitPerHost @@ -91,13 +91,13 @@ runTCPServerWithSocket runParams@NetworkRunParams{..} sock server = do E.bracketOnError (accept sock) (close . fst) $ \(conn, peer) -> do noLimitHit <- canServeThisPeer peer if noLimitHit - then void $ forkFinally (server' mgr conn) (const $ gclose conn >> releasePeer peer) + then void $ forkFinally (runServer mgr conn) (const $ gclose conn >> releasePeer peer) else close conn where gclose = if runSocketGraceful > 0 then flip gracefulClose runSocketGraceful else close - server' mgr conn = do + runServer mgr conn = do threadLabelMe $ runServerName ++ " timeout server" - T.withHandle mgr (return ()) $ \timeoutHandle -> + T.withHandleKillThread mgr (return ()) $ \timeoutHandle -> server runParams (T.tickle timeoutHandle) conn resolve :: HostName -> PortNumber -> IO AddrInfo diff --git a/trace-dispatcher/src/Cardano/Logging/Prometheus/TCPServer.hs b/trace-dispatcher/src/Cardano/Logging/Prometheus/TCPServer.hs index 6f04948b28c..5f218221823 100644 --- a/trace-dispatcher/src/Cardano/Logging/Prometheus/TCPServer.hs +++ b/trace-dispatcher/src/Cardano/Logging/Prometheus/TCPServer.hs @@ -1,20 +1,35 @@ +{-# LANGUAGE PackageImports #-} 
+ +{-# OPTIONS_GHC -Wno-partial-fields #-} + -- | Run a simple Prometheus TCP server, responding *only* to the '/metrics' URL with current Node metrics -module Cardano.Logging.Prometheus.TCPServer (runPrometheusSimple) where +module Cardano.Logging.Prometheus.TCPServer + ( runPrometheusSimple + , runPrometheusSimpleSilent + + , TracePrometheusSimple (..) + ) where import Cardano.Logging.Prometheus.Exposition (renderExpositionFromSample) import Cardano.Logging.Prometheus.NetworkRun +import Cardano.Logging.Types +import Cardano.Logging.Utils (runInLoop, showT) -import Control.Concurrent.Async (async, link) +import Control.Concurrent.Async (Async, async) import qualified Control.Exception as E -import Control.Monad (when) +import Control.Monad (join, when) +import "contra-tracer" Control.Tracer +import Data.Aeson.Types as AE (Value (String), (.=)) import Data.ByteString (ByteString) import Data.ByteString.Builder import qualified Data.ByteString.Char8 as BC import Data.Int (Int64) import Data.List (find, intersperse) +import Data.Text as TS (pack) import Data.Text.Lazy (Text) import qualified Data.Text.Lazy as T import qualified Data.Text.Lazy.Encoding as T (encodeUtf8Builder) +import Data.Word (Word16) import Network.HTTP.Date (epochTimeToHTTPDate, formatHTTPDate) import Network.Socket (HostName, PortNumber) import qualified Network.Socket.ByteString as Strict (recv) @@ -24,13 +39,41 @@ import System.Posix.Types (EpochTime) import System.PosixCompat.Time (epochTime) --- Will provide a 'Just errormessage' iff creating the Prometheus server failed -runPrometheusSimple :: EKG.Store -> (Bool, Maybe HostName, PortNumber) -> IO (Maybe String) -runPrometheusSimple ekgStore (noSuffixes, mHost, portNo) = - E.try createRunner >>= \case - Left (E.SomeException e) -> pure (Just $ E.displayException e) - Right runner -> async runner >>= link >> pure Nothing +data TracePrometheusSimple = + TracePrometheusSimpleStart { port :: Word16 } + | TracePrometheusSimpleStop { message :: 
String } + deriving Show + +instance LogFormatting TracePrometheusSimple where + forMachine _ = \case + TracePrometheusSimpleStart portNo -> mconcat + [ "kind" .= AE.String "PrometheusSimpleStart" + , "port" .= portNo + ] + TracePrometheusSimpleStop message -> mconcat + [ "kind" .= AE.String "PrometheusSimpleStop" + , "message" .= message + ] + + forHuman = \case + TracePrometheusSimpleStart portNo -> "PrometheusSimple backend starting on port " <> showT portNo + TracePrometheusSimpleStop message -> "PrometheusSimple backend stop: " <> TS.pack message + + +-- | Like 'runPrometheusSimple', but passes 'nullTracer', so no start/stop events are traced. +runPrometheusSimpleSilent :: EKG.Store -> (Bool, Maybe HostName, PortNumber) -> IO (Async ()) +runPrometheusSimpleSilent = runPrometheusSimple nullTracer + +-- | Runs the Prometheus server in an 'Async' thread under 'runInLoop': the looped action traces a start event, then creates and runs the server; an exception is traced as a stop event carrying its 'E.displayException' text. The server is retried / restarted after an exception, in increasing intervals (presumably bounded by the 1 and 60 passed to 'runInLoop' - confirm against its definition). +runPrometheusSimple :: Tracer IO TracePrometheusSimple -> EKG.Store -> (Bool, Maybe HostName, PortNumber) -> IO (Async ()) +runPrometheusSimple tr ekgStore (noSuffixes, mHost, portNo) = + async $ runInLoop fromScratchThrowing traceInterruption 1 60 where + traceInterruption (E.SomeException e) = + traceWith tr $ TracePrometheusSimpleStop (E.displayException e) + + fromScratchThrowing = traceWith tr (TracePrometheusSimpleStart $ fromIntegral portNo) >> join createRunner + getCurrentExposition = renderExpositionFromSample noSuffixes <$> sampleAll ekgStore createRunner = mkTCPServerRunner (defaultRunParams "PrometheusSimple") mHost portNo (serveAccepted getCurrentExposition)